@tollgateai/sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +281 -81
- package/dist/index.cjs +33 -19
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +7 -7
- package/dist/index.d.ts +7 -7
- package/dist/index.js +33 -19
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,14 +1,60 @@
|
|
|
1
|
-
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://tollgateai.vercel.app/logo.png" alt="Tollgate" width="120" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">@tollgateai/sdk</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Real-time gross-margin observability for AI-powered products.</strong><br />
|
|
9
|
+
Track every LLM call's cost, attribute it to a customer, and know whether you're making money — before the invoice goes out.
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<a href="https://www.npmjs.com/package/@tollgateai/sdk"><img src="https://img.shields.io/npm/v/@tollgateai/sdk?color=blue&label=npm" alt="npm" /></a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@tollgateai/sdk"><img src="https://img.shields.io/npm/dm/@tollgateai/sdk?color=green" alt="downloads" /></a>
|
|
15
|
+
<img src="https://img.shields.io/badge/node-%3E%3D18-brightgreen" alt="node" />
|
|
16
|
+
<img src="https://img.shields.io/badge/dependencies-0-brightgreen" alt="zero deps" />
|
|
17
|
+
<img src="https://img.shields.io/badge/license-MIT-blue" alt="license" />
|
|
18
|
+
</p>
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<a href="https://tollgateai.vercel.app">Dashboard</a> ·
|
|
22
|
+
<a href="https://pypi.org/project/tollgateai/">Python SDK</a> ·
|
|
23
|
+
<a href="#quick-start">Quick Start</a> ·
|
|
24
|
+
<a href="#api-reference">API Reference</a>
|
|
25
|
+
</p>
|
|
2
26
|
|
|
3
|
-
|
|
27
|
+
---
|
|
4
28
|
|
|
5
|
-
|
|
29
|
+
## Why Tollgate?
|
|
6
30
|
|
|
7
|
-
|
|
31
|
+
AI products bill customers on plans (per ticket, per seat, usage-based) but pay providers per token. **Tollgate joins the two in real time** — giving you per-customer, per-agent, per-run gross margin the moment each LLM call completes.
|
|
8
32
|
|
|
9
|
-
|
|
33
|
+
- **2-line integration** — wrap your provider client once; every call is tracked automatically.
|
|
34
|
+
- **Zero runtime dependencies** — ships as a single ESM + CJS bundle.
|
|
35
|
+
- **Non-blocking** — usage reporting is fire-and-forget with automatic retries. Failures never break your LLM calls.
|
|
36
|
+
- **Privacy-first** — no prompt content is ever transmitted. Only token counts, model identifiers, and metadata.
|
|
37
|
+
- **Universal coverage** — Anthropic, OpenAI, Google Gemini, AWS Bedrock, and every OpenAI-compatible gateway.
|
|
10
38
|
|
|
11
|
-
|
|
39
|
+
```
|
|
40
|
+
┌─────────────┐ ┌──────────────┐ ┌───────────────┐
|
|
41
|
+
│ Your App │───▶│ LLM Provider │───▶│ Provider │
|
|
42
|
+
│ (SDK wrap) │◀──│ (Anthropic, │◀──│ Response │
|
|
43
|
+
│ │ │ OpenAI, …) │ │ (tokens, id) │
|
|
44
|
+
└──────┬───────┘ └──────────────┘ └───────────────┘
|
|
45
|
+
│ POST /api/track (background, non-blocking)
|
|
46
|
+
▼
|
|
47
|
+
┌──────────────────────────────────────────────────────┐
|
|
48
|
+
│ Tollgate Server │
|
|
49
|
+
│ ┌────────────┐ ┌──────────┐ ┌──────────────────┐ │
|
|
50
|
+
│ │ Rate Card │ │ Plan │ │ Margin Rollups │ │
|
|
51
|
+
│ │ (1,500+ │ │ Revenue │ │ (per customer, │ │
|
|
52
|
+
│ │ models) │ │ Config │ │ agent, run) │ │
|
|
53
|
+
│ └────────────┘ └──────────┘ └──────────────────┘ │
|
|
54
|
+
└──────────────────────────────────────────────────────┘
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
12
58
|
|
|
13
59
|
## Installation
|
|
14
60
|
|
|
@@ -20,7 +66,9 @@ npm install @tollgateai/sdk
|
|
|
20
66
|
pnpm add @tollgateai/sdk # or yarn add @tollgateai/sdk
|
|
21
67
|
```
|
|
22
68
|
|
|
23
|
-
|
|
69
|
+
**Requirements:** Node.js 18+ · Zero runtime dependencies · ESM and CommonJS supported
|
|
70
|
+
|
|
71
|
+
---
|
|
24
72
|
|
|
25
73
|
## Quick Start
|
|
26
74
|
|
|
@@ -50,24 +98,30 @@ await tollgate.resolve({
|
|
|
50
98
|
});
|
|
51
99
|
```
|
|
52
100
|
|
|
101
|
+
---
|
|
102
|
+
|
|
53
103
|
## Provider Support
|
|
54
104
|
|
|
55
|
-
| Provider | Wrapper | Streaming |
|
|
105
|
+
| Provider | Wrapper | Streaming | Extracted Fields |
|
|
56
106
|
|---|---|---|---|
|
|
57
|
-
| **Anthropic** | `wrapAnthropic` | Automatic |
|
|
58
|
-
| **OpenAI** | `wrapOpenAI` | `stream_options: { include_usage: true }` |
|
|
59
|
-
| **Google Gemini** | `wrapGemini` | Automatic |
|
|
60
|
-
| **OpenAI-compatible** | `wrapOpenAI` + `provider: 'openai_compatible'` | Same as OpenAI | Same as OpenAI |
|
|
61
|
-
| **AWS Bedrock** | `wrapBedrock` | Automatic |
|
|
107
|
+
| **Anthropic** | `wrapAnthropic` | Automatic | Input/output tokens, cache read/write, web search requests, tool calls, latency |
|
|
108
|
+
| **OpenAI** | `wrapOpenAI` | `stream_options: { include_usage: true }` | Input/output tokens, reasoning, cached, audio in/out, text in/out, prediction tokens, service tier, tool calls, latency |
|
|
109
|
+
| **Google Gemini** | `wrapGemini` | Automatic | Input/output tokens, thinking, cached, audio/image/video per-modality, web search (grounding), tool calls, latency |
|
|
110
|
+
| **OpenAI-compatible** | `wrapOpenAI` + `provider: 'openai_compatible'` | Same as OpenAI | Same as OpenAI + gateway-reported cost (when available) |
|
|
111
|
+
| **AWS Bedrock** | `wrapBedrock` | Automatic | Input/output tokens, cache read/write (per-TTL split), tool calls, latency |
|
|
112
|
+
|
|
113
|
+
---
|
|
62
114
|
|
|
63
115
|
## Configuration
|
|
64
116
|
|
|
65
|
-
|
|
66
|
-
|---|---|---|
|
|
67
|
-
| `TOLLGATE_API_KEY` | Yes | — |
|
|
68
|
-
| `TOLLGATE_BASE_URL` | No | `https://tollgateai.vercel.app` |
|
|
117
|
+
### Environment Variables
|
|
69
118
|
|
|
70
|
-
|
|
119
|
+
| Variable | Required | Default | Description |
|
|
120
|
+
|---|---|---|---|
|
|
121
|
+
| `TOLLGATE_API_KEY` | Yes | — | Your account API key (`tg_live_…`) |
|
|
122
|
+
| `TOLLGATE_BASE_URL` | No | `https://tollgateai.vercel.app` | Self-hosted deployment URL |
|
|
123
|
+
|
|
124
|
+
### Programmatic Configuration
|
|
71
125
|
|
|
72
126
|
```ts
|
|
73
127
|
const tollgate = createTollgateClient({
|
|
@@ -82,7 +136,7 @@ const tollgate = createTollgateClient({
|
|
|
82
136
|
|
|
83
137
|
## Auto-Instrumentation
|
|
84
138
|
|
|
85
|
-
Wrap your provider client once. Every `create` / `generateContent` call reports usage in the background — non-blocking, fire-and-forget. Failures go to `onError` (default: `console.warn`) and never break your LLM call.
|
|
139
|
+
Wrap your provider client once. Every `create` / `generateContent` / `send` call reports usage in the background — non-blocking, fire-and-forget. Failures go to `onError` (default: `console.warn`) and never break your LLM call.
|
|
86
140
|
|
|
87
141
|
### Anthropic
|
|
88
142
|
|
|
@@ -137,7 +191,7 @@ const result = await model.generateContent('Explain quantum computing');
|
|
|
137
191
|
|
|
138
192
|
### OpenAI-Compatible Gateways
|
|
139
193
|
|
|
140
|
-
|
|
194
|
+
Works with any OpenAI-compatible endpoint — OpenRouter, Groq, Together, Nebius, Vercel AI Gateway, local vLLM, and more.
|
|
141
195
|
|
|
142
196
|
```ts
|
|
143
197
|
import OpenAI from 'openai';
|
|
@@ -156,6 +210,8 @@ await groq.chat.completions.create({
|
|
|
156
210
|
});
|
|
157
211
|
```
|
|
158
212
|
|
|
213
|
+
> When a gateway returns cost inline (e.g. OpenRouter's `usage.cost`, reported in USD), the SDK converts it to cents and captures it automatically as `providerCostCents`. The server uses it verbatim, bypassing the rate card. Gateways that don't return a cost (or return a zero cost) fall through to rate-card pricing. An explicit `providerCostCents` in the wrapper options always takes precedence.
|
|
214
|
+
|
|
159
215
|
### AWS Bedrock
|
|
160
216
|
|
|
161
217
|
```ts
|
|
@@ -194,40 +250,140 @@ for await (const chunk of stream) { /* render to UI */ }
|
|
|
194
250
|
|
|
195
251
|
---
|
|
196
252
|
|
|
197
|
-
##
|
|
253
|
+
## Tracked Fields
|
|
198
254
|
|
|
199
255
|
Every auto-instrumented call captures these fields from the provider response:
|
|
200
256
|
|
|
201
257
|
| Field | Providers | Description |
|
|
202
258
|
|---|---|---|
|
|
203
|
-
| `tokensIn` | All | Input tokens
|
|
204
|
-
| `tokensOut` | All | Output tokens
|
|
205
|
-
| `reasoningTokens` | OpenAI,
|
|
259
|
+
| `tokensIn` | All | Input tokens (deduplicated — excludes cached/audio for OpenAI; excludes cached/audio/image/video for Gemini) |
|
|
260
|
+
| `tokensOut` | All | Output tokens (deduplicated — excludes reasoning/audio for OpenAI; excludes audio/image for Gemini) |
|
|
261
|
+
| `reasoningTokens` | OpenAI, Gemini | Reasoning/thinking tokens (billed at reasoning rate) |
|
|
206
262
|
| `cachedTokens` | All | Prompt cache read tokens (reduced rate) |
|
|
207
|
-
| `cacheWrite5mTokens` | Anthropic, Bedrock | 5-
|
|
208
|
-
| `cacheWrite1hTokens` |
|
|
209
|
-
| `audioTokensIn` | OpenAI | Audio
|
|
210
|
-
| `
|
|
211
|
-
| `imageTokensIn` | Gemini | Image/vision input tokens |
|
|
212
|
-
| `imageTokensOut` | Gemini | Image generation output tokens |
|
|
263
|
+
| `cacheWrite5mTokens` | Anthropic, Bedrock | Cache creation tokens (5-minute TTL) |
|
|
264
|
+
| `cacheWrite1hTokens` | Bedrock | Cache creation tokens (1-hour TTL) |
|
|
265
|
+
| `audioTokensIn` / `Out` | OpenAI, Gemini | Audio modality tokens (GPT-4o audio, Gemini multimodal) |
|
|
266
|
+
| `imageTokensIn` / `Out` | Gemini | Image/vision input and generation output tokens |
|
|
213
267
|
| `videoTokensIn` | Gemini | Video input tokens |
|
|
214
|
-
| `textTokensIn` | OpenAI, Gemini | Text-only
|
|
215
|
-
| `textTokensOut` | OpenAI, Gemini | Text-only output tokens |
|
|
268
|
+
| `textTokensIn` / `Out` | OpenAI, Gemini | Text-only modality tokens |
|
|
216
269
|
| `webSearchRequests` | Anthropic, Gemini | Web search requests (server tools / grounding) |
|
|
217
270
|
| `acceptedPredictionTokens` | OpenAI | Predicted Outputs: accepted tokens |
|
|
218
|
-
| `rejectedPredictionTokens` | OpenAI | Predicted Outputs: rejected
|
|
219
|
-
| `serviceTier` | OpenAI | Service tier
|
|
271
|
+
| `rejectedPredictionTokens` | OpenAI | Predicted Outputs: rejected (waste) tokens |
|
|
272
|
+
| `serviceTier` | OpenAI | Service tier (`default`, `flex`, `priority`) |
|
|
220
273
|
| `latencyMs` | All | SDK-measured request duration in milliseconds |
|
|
221
274
|
| `toolCalls` | All | Number of tool calls in the response |
|
|
275
|
+
| `providerCostCents` | OpenAI-compatible | Gateway-reported cost — used verbatim, bypasses rate card |
|
|
222
276
|
| `model` | All | Model identifier as reported by the provider |
|
|
223
277
|
|
|
224
|
-
Cost is computed **server-side** from token counts and a rate card that auto-syncs daily from the LiteLLM registry (1,500+ models). Rate cards include per-token pricing for
|
|
278
|
+
Cost is computed **server-side** from token counts and a rate card that auto-syncs daily from the LiteLLM registry (1,500+ models). Rate cards include per-token pricing for every modality, cache tier, reasoning, and web search. Unknown models are priced at $0 and flagged in logs.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Provider Field Coverage
|
|
283
|
+
|
|
284
|
+
<details>
|
|
285
|
+
<summary><strong>Anthropic</strong> — Messages API</summary>
|
|
286
|
+
|
|
287
|
+
| Anthropic API Field | SDK Field | Notes |
|
|
288
|
+
|---|---|---|
|
|
289
|
+
| `usage.input_tokens` | `tokensIn` | Input tokens (excludes cached) |
|
|
290
|
+
| `usage.output_tokens` | `tokensOut` | Output tokens (includes reasoning — billed at output rate) |
|
|
291
|
+
| `usage.cache_read_input_tokens` | `cachedTokens` | Prompt cache read tokens |
|
|
292
|
+
| `usage.cache_creation_input_tokens` | `cacheWrite5mTokens` | Prompt cache creation tokens (total across TTLs; reported in the 5m bucket, no per-TTL split) |
|
|
293
|
+
| `usage.server_tool_use.web_search_requests` | `webSearchRequests` | Web search server tool requests |
|
|
294
|
+
| `response.content[]` (type `tool_use`) | `toolCalls` | Count of tool-use content blocks |
|
|
295
|
+
| *(SDK-measured)* | `latencyMs` | Request duration |
|
|
296
|
+
|
|
297
|
+
Anthropic bills reasoning tokens at the output rate. The SDK reports the full `output_tokens` count; the server-side rate card applies the matching output rate.
|
|
298
|
+
|
|
299
|
+
In streaming mode, `message_start` carries input/cache counts and `message_delta` carries the output count. The SDK accumulates both automatically.
|
|
300
|
+
|
|
301
|
+
</details>
|
|
302
|
+
|
|
303
|
+
<details>
|
|
304
|
+
<summary><strong>OpenAI</strong> — Chat Completions API</summary>
|
|
305
|
+
|
|
306
|
+
| OpenAI API Field | SDK Field | Notes |
|
|
307
|
+
|---|---|---|
|
|
308
|
+
| `usage.prompt_tokens` | `tokensIn` | **Minus** cached and audio tokens to prevent double-billing |
|
|
309
|
+
| `usage.completion_tokens` | `tokensOut` | **Minus** reasoning and audio tokens to prevent double-billing |
|
|
310
|
+
| `usage.completion_tokens_details.reasoning_tokens` | `reasoningTokens` | Reasoning/thinking tokens |
|
|
311
|
+
| `usage.prompt_tokens_details.cached_tokens` | `cachedTokens` | Prompt cache read tokens |
|
|
312
|
+
| `usage.prompt_tokens_details.audio_tokens` | `audioTokensIn` | Audio input tokens |
|
|
313
|
+
| `usage.completion_tokens_details.audio_tokens` | `audioTokensOut` | Audio output tokens |
|
|
314
|
+
| `usage.prompt_tokens_details.text_tokens` | `textTokensIn` | Text modality input tokens |
|
|
315
|
+
| `usage.completion_tokens_details.text_tokens` | `textTokensOut` | Text modality output tokens |
|
|
316
|
+
| `usage.completion_tokens_details.accepted_prediction_tokens` | `acceptedPredictionTokens` | Predicted Outputs: accepted |
|
|
317
|
+
| `usage.completion_tokens_details.rejected_prediction_tokens` | `rejectedPredictionTokens` | Predicted Outputs: rejected |
|
|
318
|
+
| `service_tier` | `serviceTier` | Service tier used |
|
|
319
|
+
| `choices[].message.tool_calls` | `toolCalls` | Tool call count |
|
|
320
|
+
| *(SDK-measured)* | `latencyMs` | Request duration |
|
|
321
|
+
|
|
322
|
+
OpenAI's `prompt_tokens` and `completion_tokens` are totals that include sub-category tokens. The SDK subtracts each sub-category so every token is costed at exactly one rate.
|
|
323
|
+
|
|
324
|
+
</details>
|
|
325
|
+
|
|
326
|
+
<details>
|
|
327
|
+
<summary><strong>Google Gemini</strong> — Generative AI / Vertex AI</summary>
|
|
328
|
+
|
|
329
|
+
| Google API Field | SDK Field | Notes |
|
|
330
|
+
|---|---|---|
|
|
331
|
+
| `usageMetadata.promptTokenCount` | `tokensIn` | **Minus** cached, audio, image, video to prevent double-billing |
|
|
332
|
+
| `usageMetadata.candidatesTokenCount` | `tokensOut` | **Minus** audio and image output (thinking is already excluded by Google) |
|
|
333
|
+
| `usageMetadata.thoughtsTokenCount` | `reasoningTokens` | Thinking/reasoning tokens (Gemini 2.x) |
|
|
334
|
+
| `usageMetadata.cachedContentTokenCount` | `cachedTokens` | Prompt cache read tokens |
|
|
335
|
+
| `promptTokensDetails[AUDIO]` | `audioTokensIn` | Audio input modality |
|
|
336
|
+
| `candidatesTokensDetails[AUDIO]` | `audioTokensOut` | Audio output modality |
|
|
337
|
+
| `promptTokensDetails[IMAGE]` | `imageTokensIn` | Image/vision input |
|
|
338
|
+
| `candidatesTokensDetails[IMAGE]` | `imageTokensOut` | Image generation output |
|
|
339
|
+
| `promptTokensDetails[VIDEO]` | `videoTokensIn` | Video input |
|
|
340
|
+
| `promptTokensDetails[TEXT]` | `textTokensIn` | Text input |
|
|
341
|
+
| `candidatesTokensDetails[TEXT]` | `textTokensOut` | Text output |
|
|
342
|
+
| `candidates[].groundingMetadata.webSearchQueries` | `webSearchRequests` | Google Search grounding |
|
|
343
|
+
| `candidates[].content.parts[].functionCall` | `toolCalls` | Function call count |
|
|
344
|
+
| *(SDK-measured)* | `latencyMs` | Request duration |
|
|
345
|
+
|
|
346
|
+
Google's `candidatesTokenCount` does **not** include `thoughtsTokenCount`, so reasoning tokens are not subtracted. However, it **does** include audio and image output tokens, so the SDK subtracts those to prevent double-billing.
|
|
347
|
+
|
|
348
|
+
</details>
|
|
349
|
+
|
|
350
|
+
<details>
|
|
351
|
+
<summary><strong>AWS Bedrock</strong> — Converse API</summary>
|
|
352
|
+
|
|
353
|
+
| Bedrock API Field | SDK Field | Notes |
|
|
354
|
+
|---|---|---|
|
|
355
|
+
| `usage.inputTokens` | `tokensIn` | Input tokens |
|
|
356
|
+
| `usage.outputTokens` | `tokensOut` | Output tokens (includes reasoning — Bedrock does not split) |
|
|
357
|
+
| `usage.cacheReadInputTokens` | `cachedTokens` | Prompt cache read tokens |
|
|
358
|
+
| `usage.cacheDetails[ttl="5m"]` | `cacheWrite5mTokens` | Cache creation (5-minute TTL) |
|
|
359
|
+
| `usage.cacheDetails[ttl="1h"]` | `cacheWrite1hTokens` | Cache creation (1-hour TTL, higher rate) |
|
|
360
|
+
| `output.message.content[].toolUse` | `toolCalls` | Tool-use content block count |
|
|
361
|
+
| *(SDK-measured)* | `latencyMs` | Request duration |
|
|
362
|
+
|
|
363
|
+
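Bedrock's `cacheDetails` array provides per-TTL breakdowns. The SDK splits these into `cacheWrite5mTokens` and `cacheWrite1hTokens`: entries whose `ttl` is `"1h"` go to the 1h bucket, and all other entries go to the 5m bucket. When `cacheDetails` is absent or empty, the SDK falls back to reporting `cacheWriteInputTokens` in the 5m bucket.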
|
|
364
|
+
|
|
365
|
+
In streaming mode (`ConverseStream`), the final `metadata` event carries usage totals. Tool calls are accumulated from `contentBlockStart` events.
|
|
366
|
+
|
|
367
|
+
</details>
|
|
225
368
|
|
|
226
369
|
---
|
|
227
370
|
|
|
228
|
-
##
|
|
371
|
+
## Pricing Models
|
|
229
372
|
|
|
230
|
-
|
|
373
|
+
### Per-Call Revenue
|
|
374
|
+
|
|
375
|
+
For simple per-call billing, pass `revenueUnitCents` in the wrapper options:
|
|
376
|
+
|
|
377
|
+
```ts
|
|
378
|
+
const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
|
|
379
|
+
customerId: 'cust_acme',
|
|
380
|
+
revenueUnitCents: 50, // $0.50 earned per LLM call
|
|
381
|
+
});
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
### Outcome-Based Pricing
|
|
385
|
+
|
|
386
|
+
Under per-resolution pricing, only a **resolved** run earns revenue. Escalated or failed runs earn $0, but provider costs still count against margin.
|
|
231
387
|
|
|
232
388
|
```ts
|
|
233
389
|
const runId = 'ticket_8842';
|
|
@@ -246,24 +402,20 @@ await tollgate.resolve({
|
|
|
246
402
|
});
|
|
247
403
|
```
|
|
248
404
|
|
|
249
|
-
|
|
405
|
+
### External Tool Costs
|
|
250
406
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
## External Tool Costs
|
|
254
|
-
|
|
255
|
-
Report costs from external services (image generation, code sandboxes, search APIs) alongside LLM calls:
|
|
407
|
+
Report costs from non-LLM services (image generation, code sandboxes, search APIs) alongside LLM calls:
|
|
256
408
|
|
|
257
409
|
```ts
|
|
258
410
|
await tollgate.track({
|
|
259
411
|
customerId: 'cust_acme',
|
|
260
412
|
runId: 'ticket_8842',
|
|
261
413
|
provider: 'openai',
|
|
262
|
-
model: '
|
|
263
|
-
tokensIn:
|
|
264
|
-
tokensOut:
|
|
414
|
+
model: 'dall-e-3',
|
|
415
|
+
tokensIn: 0,
|
|
416
|
+
tokensOut: 0,
|
|
265
417
|
externalCostCents: 4.0, // $0.04 for the DALL-E call
|
|
266
|
-
idempotencyKey: 'ticket_8842#
|
|
418
|
+
idempotencyKey: 'ticket_8842#dalle',
|
|
267
419
|
});
|
|
268
420
|
```
|
|
269
421
|
|
|
@@ -271,7 +423,7 @@ await tollgate.track({
|
|
|
271
423
|
|
|
272
424
|
## Customer & Plan Setup
|
|
273
425
|
|
|
274
|
-
Create customers and assign plans before sending usage so plan-priced revenue is recognized from the first event. Idempotent.
|
|
426
|
+
Create customers and assign plans before sending usage so plan-priced revenue is recognized from the first event. Idempotent — safe to call on every app boot.
|
|
275
427
|
|
|
276
428
|
```ts
|
|
277
429
|
await tollgate.upsertCustomer({
|
|
@@ -287,6 +439,34 @@ await tollgate.upsertCustomer({
|
|
|
287
439
|
|
|
288
440
|
---
|
|
289
441
|
|
|
442
|
+
## Error Handling
|
|
443
|
+
|
|
444
|
+
The SDK separates **tracking errors** (non-fatal) from **client errors** (actionable):
|
|
445
|
+
|
|
446
|
+
```ts
|
|
447
|
+
// Tracking errors never throw into your code; by default they are logged via console.warn.
|
|
448
|
+
// Override with onError to route to your observability stack:
|
|
449
|
+
const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
|
|
450
|
+
customerId: 'cust_acme',
|
|
451
|
+
onError: (err) => Sentry.captureException(err),
|
|
452
|
+
});
|
|
453
|
+
|
|
454
|
+
// Client errors (missing API key, invalid plan) throw TollgateError:
|
|
455
|
+
import { TollgateError } from '@tollgateai/sdk';
|
|
456
|
+
|
|
457
|
+
try {
|
|
458
|
+
await tollgate.upsertCustomer({ customerId: 'cust_acme' });
|
|
459
|
+
} catch (err) {
|
|
460
|
+
if (err instanceof TollgateError) {
|
|
461
|
+
console.error(err.status, err.body); // HTTP status + response body
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
**Retry behavior:** The client retries on 5xx and 429 responses and on network errors, with exponential backoff (200ms, 400ms, ...), up to `maxRetries` attempts (default `2`). Deterministic 4xx errors (400, 401, 403, 404, 422) fail immediately.
|
|
467
|
+
|
|
468
|
+
---
|
|
469
|
+
|
|
290
470
|
## API Reference
|
|
291
471
|
|
|
292
472
|
### Exports
|
|
@@ -294,7 +474,7 @@ await tollgate.upsertCustomer({
|
|
|
294
474
|
```ts
|
|
295
475
|
// Client
|
|
296
476
|
createTollgateClient(options?) // -> TollgateClient
|
|
297
|
-
TollgateError //
|
|
477
|
+
TollgateError // Error with status & body
|
|
298
478
|
|
|
299
479
|
// Auto-instrumentation wrappers
|
|
300
480
|
wrapAnthropic(client, tollgate, options) // -> instrumented Anthropic client
|
|
@@ -307,17 +487,33 @@ anthropicEventFrom(msg, options) // -> TrackEventInput | null
|
|
|
307
487
|
openAIEventFrom(completion, options) // -> TrackEventInput | null
|
|
308
488
|
bedrockEventFrom(usage, model, options) // -> TrackEventInput | null
|
|
309
489
|
geminiEventFrom(response, options) // -> TrackEventInput | null
|
|
490
|
+
|
|
491
|
+
// Types
|
|
492
|
+
Provider // 'anthropic' | 'openai' | 'openai_compatible' | 'bedrock' | 'google'
|
|
493
|
+
RunOutcome // 'resolved' | 'escalated' | 'failed'
|
|
494
|
+
PricingModel // 'per_unit' | 'per_resolution' | 'usage_based' | 'per_seat' | 'flat' | 'hybrid'
|
|
495
|
+
TrackEventInput // Full event payload type
|
|
310
496
|
```
|
|
311
497
|
|
|
312
|
-
### TollgateClient
|
|
498
|
+
### `TollgateClient`
|
|
313
499
|
|
|
314
500
|
| Method | Description |
|
|
315
501
|
|---|---|
|
|
316
|
-
| `track(event)` | Report a single usage event. Idempotent on `idempotencyKey`. |
|
|
317
|
-
| `resolve(input)` | Close a run with an outcome. Books revenue only when `outcome
|
|
318
|
-
| `upsertCustomer(input)` | Create or update a customer and optionally assign a plan. |
|
|
502
|
+
| `track(event: TrackEventInput)` | Report a single usage event. Idempotent on `idempotencyKey`. Returns `{ status, eventId }`. |
|
|
503
|
+
| `resolve(input: ResolveInput)` | Close a run with an outcome. Books revenue only when `outcome === 'resolved'`. |
|
|
504
|
+
| `upsertCustomer(input: UpsertCustomerInput)` | Create or update a customer and optionally assign a plan. Returns `{ status, customerId, id, planId }`. |
|
|
505
|
+
|
|
506
|
+
### `TollgateClientOptions`
|
|
507
|
+
|
|
508
|
+
| Field | Type | Default | Description |
|
|
509
|
+
|---|---|---|---|
|
|
510
|
+
| `apiKey` | `string` | `TOLLGATE_API_KEY` env | Account API key |
|
|
511
|
+
| `baseUrl` | `string` | `https://tollgateai.vercel.app` | Tollgate server URL |
|
|
512
|
+
| `timeoutMs` | `number` | `10000` | Per-request timeout in milliseconds |
|
|
513
|
+
| `maxRetries` | `number` | `2` | Retry attempts on 5xx / 429 / network errors |
|
|
514
|
+
| `fetch` | `typeof fetch` | `globalThis.fetch` | Custom fetch implementation |
|
|
319
515
|
|
|
320
|
-
### InstrumentOptions
|
|
516
|
+
### `InstrumentOptions`
|
|
321
517
|
|
|
322
518
|
| Field | Type | Required | Description |
|
|
323
519
|
|---|---|---|---|
|
|
@@ -326,49 +522,53 @@ geminiEventFrom(response, options) // -> TrackEventInput | null
|
|
|
326
522
|
| `runId` | `string \| () => string` | No | Logical run ID (defaults to provider response ID) |
|
|
327
523
|
| `provider` | `Provider` | No | Override the reported provider |
|
|
328
524
|
| `revenueUnitCents` | `number \| (response) => number` | No | Revenue per call in cents |
|
|
329
|
-
| `providerCostCents` | `number \| (response) => number` | No | Exact cost override (skips rate card) |
|
|
330
|
-
| `onError` | `(err) => void` | No | Error handler for background tracking |
|
|
525
|
+
| `providerCostCents` | `number \| (response) => number` | No | Exact cost override in cents (skips rate card) |
|
|
526
|
+
| `onError` | `(err) => void` | No | Error handler for background tracking (default: `console.warn`) |
|
|
331
527
|
|
|
332
528
|
---
|
|
333
529
|
|
|
334
530
|
## How It Works
|
|
335
531
|
|
|
336
|
-
1. **Proxy wrappers** intercept provider calls without modifying the request or response.
|
|
337
|
-
2. After the provider responds, the wrapper extracts token counts
|
|
338
|
-
3. A `POST /api/track` fires **in the background** with automatic retries on transient failures.
|
|
339
|
-
4. The server computes cost from tokens via rate cards (
|
|
340
|
-
5. Events are **idempotent** on `idempotencyKey` (auto-set to the provider response ID).
|
|
532
|
+
1. **Proxy wrappers** intercept provider calls without modifying the request or response. Your code sees the exact same types and behavior as without the SDK.
|
|
533
|
+
2. After the provider responds, the wrapper extracts token counts by modality, tool calls, service tier, and latency from the response object.
|
|
534
|
+
3. A `POST /api/track` fires **in the background** with automatic retries on transient failures. Your application code continues immediately.
|
|
535
|
+
4. The server computes cost from tokens via rate cards (per modality, cache tier, reasoning, and web search), joins it with plan-configured revenue, and updates real-time margin rollups.
|
|
536
|
+
5. Events are **idempotent** — deduplication is based on `idempotencyKey` (auto-set to the provider response ID when the provider returns one; otherwise the SDK generates a per-event key).
|
|
341
537
|
|
|
342
|
-
##
|
|
538
|
+
## Security & Privacy
|
|
343
539
|
|
|
344
|
-
- **No prompt content is ever
|
|
345
|
-
-
|
|
346
|
-
-
|
|
540
|
+
- **No prompt content is ever transmitted.** Only token counts, model identifiers, and metadata.
|
|
541
|
+
- **Idempotent ingestion** — duplicate events are safely deduplicated server-side.
|
|
542
|
+
- **Non-invasive** — background tracking never throws into your application code.
|
|
543
|
+
- **Transport security** — all communication over HTTPS with Bearer token authentication.
|
|
347
544
|
|
|
348
545
|
---
|
|
349
546
|
|
|
350
|
-
##
|
|
547
|
+
## Changelog
|
|
548
|
+
|
|
549
|
+
### v0.7.0
|
|
550
|
+
|
|
551
|
+
- Google Gemini: fixed double-billing of multimodal input tokens — `tokensIn` now subtracts cached, audio, image, and video tokens
|
|
552
|
+
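- Anthropic: aligned extraction with actual API response fields — `tokensOut` is the full `usage.output_tokens`, cache creation is read from `usage.cache_creation_input_tokens`, and `reasoningTokens` is no longer reported separately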
|
|
553
|
+
- Simplified Anthropic streaming accumulation
|
|
554
|
+
- Verified field coverage for all five providers
|
|
555
|
+
|
|
556
|
+
### v0.6.0
|
|
351
557
|
|
|
352
|
-
-
|
|
353
|
-
-
|
|
354
|
-
-
|
|
355
|
-
- **Fix: Multimodal-only events** — audio, image, video, and web search events now trigger rate-card lookup even when text token counts are zero.
|
|
356
|
-
- `reasoningTokens` is now extracted from **all three** providers: OpenAI, Anthropic, and Gemini.
|
|
558
|
+
- OpenAI: fixed double-counting of reasoning, audio, and cached tokens
|
|
559
|
+
- Multimodal-only events now trigger rate-card lookup
|
|
560
|
+
- Reasoning token extraction for OpenAI and Gemini
|
|
357
561
|
|
|
358
562
|
### v0.5.0
|
|
359
563
|
|
|
360
|
-
- Google Gemini / Vertex AI support
|
|
361
|
-
- Audio
|
|
362
|
-
-
|
|
363
|
-
-
|
|
364
|
-
- Latency measurement on all wrappers
|
|
365
|
-
- OpenAI Predicted Outputs (`acceptedPredictionTokens` / `rejectedPredictionTokens`)
|
|
366
|
-
- Service tier tracking (OpenAI `flex` / `priority`, Anthropic `priority`)
|
|
367
|
-
- Text modality split for accurate cost attribution in mixed-modal requests
|
|
368
|
-
- Expanded rate card sync: audio, image, video, and web search rates from LiteLLM
|
|
564
|
+
- Google Gemini / Vertex AI support with full multimodal extraction
|
|
565
|
+
- Audio, image, video, and text modality token tracking
|
|
566
|
+
- Web search request tracking (Anthropic, Gemini)
|
|
567
|
+
- OpenAI Predicted Outputs and service tier tracking
|
|
568
|
+
- Latency measurement on all wrappers
|
|
369
569
|
|
|
370
570
|
---
|
|
371
571
|
|
|
372
572
|
## License
|
|
373
573
|
|
|
374
|
-
|
|
574
|
+
MIT — see [LICENSE](LICENSE) for details.
|
package/dist/index.cjs
CHANGED
|
@@ -158,11 +158,7 @@ function anthropicEventFrom(msg, opts) {
|
|
|
158
158
|
const usage = msg?.usage;
|
|
159
159
|
if (!usage) return null;
|
|
160
160
|
const runId = resolveRunId(opts, msg.id);
|
|
161
|
-
const fivem = usage.cache_creation?.ephemeral_5m_input_tokens;
|
|
162
|
-
const oneh = usage.cache_creation?.ephemeral_1h_input_tokens;
|
|
163
|
-
const hasSplit = fivem !== void 0 || oneh !== void 0;
|
|
164
161
|
const toolCalls = Array.isArray(msg.content) ? msg.content.filter((b) => b.type === "tool_use").length : 0;
|
|
165
|
-
const thinkingTokens = usage.output_tokens_details?.thinking_tokens ?? 0;
|
|
166
162
|
const event = {
|
|
167
163
|
customerId: opts.customerId,
|
|
168
164
|
agentId: opts.agentId,
|
|
@@ -170,11 +166,9 @@ function anthropicEventFrom(msg, opts) {
|
|
|
170
166
|
provider: opts.provider ?? "anthropic",
|
|
171
167
|
model: msg.model ?? "unknown",
|
|
172
168
|
tokensIn: usage.input_tokens ?? 0,
|
|
173
|
-
tokensOut:
|
|
174
|
-
reasoningTokens: thinkingTokens,
|
|
169
|
+
tokensOut: usage.output_tokens ?? 0,
|
|
175
170
|
cachedTokens: usage.cache_read_input_tokens ?? 0,
|
|
176
|
-
cacheWrite5mTokens:
|
|
177
|
-
cacheWrite1hTokens: hasSplit ? oneh ?? 0 : 0,
|
|
171
|
+
cacheWrite5mTokens: usage.cache_creation_input_tokens ?? 0,
|
|
178
172
|
webSearchRequests: usage.server_tool_use?.web_search_requests ?? 0,
|
|
179
173
|
toolCalls,
|
|
180
174
|
revenueUnitCents: resolveRevenue(opts, msg),
|
|
@@ -201,8 +195,7 @@ function wrapAnthropic(client, tollgate, opts) {
|
|
|
201
195
|
} else if (ev.type === "message_delta" && ev.usage) {
|
|
202
196
|
msg.usage = {
|
|
203
197
|
...msg.usage ?? {},
|
|
204
|
-
output_tokens: ev.usage.output_tokens
|
|
205
|
-
output_tokens_details: ev.usage.output_tokens_details
|
|
198
|
+
output_tokens: ev.usage.output_tokens
|
|
206
199
|
};
|
|
207
200
|
} else if (ev.type === "content_block_start" && ev.content_block?.type === "tool_use") {
|
|
208
201
|
toolUseBlocks.push(ev.content_block);
|
|
@@ -268,6 +261,10 @@ function openAIEventFrom(completion, opts) {
|
|
|
268
261
|
revenueUnitCents: resolveRevenue(opts, completion),
|
|
269
262
|
idempotencyKey: completion.id ?? `${runId}#${randomId()}`
|
|
270
263
|
};
|
|
264
|
+
const gatewayCostUsd = usage.cost;
|
|
265
|
+
if (gatewayCostUsd != null && gatewayCostUsd > 0 && resolveCost(opts, completion) === void 0) {
|
|
266
|
+
event.providerCostCents = gatewayCostUsd * 100;
|
|
267
|
+
}
|
|
271
268
|
return withCost(event, opts, completion);
|
|
272
269
|
}
|
|
273
270
|
function wrapOpenAI(client, tollgate, opts) {
|
|
@@ -333,6 +330,16 @@ function wrapOpenAI(client, tollgate, opts) {
|
|
|
333
330
|
function bedrockEventFrom(usage, model, opts, response = void 0, toolCalls = 0) {
|
|
334
331
|
if (!usage) return null;
|
|
335
332
|
const runId = resolveRunId(opts, void 0);
|
|
333
|
+
let cacheWrite5m = 0;
|
|
334
|
+
let cacheWrite1h = 0;
|
|
335
|
+
if (usage.cacheDetails?.length) {
|
|
336
|
+
for (const d of usage.cacheDetails) {
|
|
337
|
+
if (d.ttl === "1h") cacheWrite1h += d.inputTokens ?? 0;
|
|
338
|
+
else cacheWrite5m += d.inputTokens ?? 0;
|
|
339
|
+
}
|
|
340
|
+
} else {
|
|
341
|
+
cacheWrite5m = usage.cacheWriteInputTokens ?? 0;
|
|
342
|
+
}
|
|
336
343
|
const event = {
|
|
337
344
|
customerId: opts.customerId,
|
|
338
345
|
agentId: opts.agentId,
|
|
@@ -342,7 +349,8 @@ function bedrockEventFrom(usage, model, opts, response = void 0, toolCalls = 0)
|
|
|
342
349
|
tokensIn: usage.inputTokens ?? 0,
|
|
343
350
|
tokensOut: usage.outputTokens ?? 0,
|
|
344
351
|
cachedTokens: usage.cacheReadInputTokens ?? 0,
|
|
345
|
-
cacheWrite5mTokens:
|
|
352
|
+
cacheWrite5mTokens: cacheWrite5m,
|
|
353
|
+
cacheWrite1hTokens: cacheWrite1h,
|
|
346
354
|
toolCalls,
|
|
347
355
|
revenueUnitCents: resolveRevenue(opts, response),
|
|
348
356
|
idempotencyKey: `${runId}#${randomId()}`
|
|
@@ -409,21 +417,27 @@ function geminiEventFrom(response, opts) {
|
|
|
409
417
|
}, 0);
|
|
410
418
|
const promptDetails = usage.promptTokensDetails;
|
|
411
419
|
const candidateDetails = usage.candidatesTokensDetails;
|
|
420
|
+
const cachedIn = usage.cachedContentTokenCount ?? 0;
|
|
421
|
+
const audioIn = modalityTokens(promptDetails, "AUDIO");
|
|
422
|
+
const imageIn = modalityTokens(promptDetails, "IMAGE");
|
|
423
|
+
const videoIn = modalityTokens(promptDetails, "VIDEO");
|
|
424
|
+
const audioOut = modalityTokens(candidateDetails, "AUDIO");
|
|
425
|
+
const imageOut = modalityTokens(candidateDetails, "IMAGE");
|
|
412
426
|
const event = {
|
|
413
427
|
customerId: opts.customerId,
|
|
414
428
|
agentId: opts.agentId,
|
|
415
429
|
runId,
|
|
416
430
|
provider: opts.provider ?? "google",
|
|
417
431
|
model: "unknown",
|
|
418
|
-
tokensIn: usage.promptTokenCount ?? 0,
|
|
419
|
-
tokensOut: usage.candidatesTokenCount ?? 0,
|
|
432
|
+
tokensIn: (usage.promptTokenCount ?? 0) - cachedIn - audioIn - imageIn - videoIn,
|
|
433
|
+
tokensOut: (usage.candidatesTokenCount ?? 0) - audioOut - imageOut,
|
|
420
434
|
reasoningTokens: usage.thoughtsTokenCount ?? 0,
|
|
421
|
-
cachedTokens:
|
|
422
|
-
audioTokensIn:
|
|
423
|
-
audioTokensOut:
|
|
424
|
-
imageTokensIn:
|
|
425
|
-
imageTokensOut:
|
|
426
|
-
videoTokensIn:
|
|
435
|
+
cachedTokens: cachedIn,
|
|
436
|
+
audioTokensIn: audioIn,
|
|
437
|
+
audioTokensOut: audioOut,
|
|
438
|
+
imageTokensIn: imageIn,
|
|
439
|
+
imageTokensOut: imageOut,
|
|
440
|
+
videoTokensIn: videoIn,
|
|
427
441
|
textTokensIn: modalityTokens(promptDetails, "TEXT"),
|
|
428
442
|
textTokensOut: modalityTokens(candidateDetails, "TEXT"),
|
|
429
443
|
webSearchRequests,
|