token-limit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- token_limit-0.1.0/LICENSE +35 -0
- token_limit-0.1.0/PKG-INFO +532 -0
- token_limit-0.1.0/README.md +507 -0
- token_limit-0.1.0/pyproject.toml +46 -0
- token_limit-0.1.0/src/token_limit/__init__.py +8 -0
- token_limit-0.1.0/src/token_limit/config.py +35 -0
- token_limit-0.1.0/src/token_limit/exceptions.py +5 -0
- token_limit-0.1.0/src/token_limit/meter.py +479 -0
- token_limit-0.1.0/src/token_limit/patches/__init__.py +14 -0
- token_limit-0.1.0/src/token_limit/patches/_base_patch.py +397 -0
- token_limit-0.1.0/src/token_limit/patches/anthropic_patch.py +627 -0
- token_limit-0.1.0/src/token_limit/patches/deepseek_patch.py +707 -0
- token_limit-0.1.0/src/token_limit/patches/google_patch.py +677 -0
- token_limit-0.1.0/src/token_limit/patches/openai_patch.py +1199 -0
- token_limit-0.1.0/src/token_limit/patches/openrouter_patch.py +400 -0
- token_limit-0.1.0/src/token_limit/transport/http_client.py +311 -0
- token_limit-0.1.0/src/token_limit/transport/queue.py +95 -0
- token_limit-0.1.0/src/token_limit/types.py +92 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Business Source License 1.1
|
|
2
|
+
|
|
3
|
+
Licensed Work: TokenLimit
|
|
4
|
+
|
|
5
|
+
Licensor: Ali Ezatyar Ahmadyar
|
|
6
|
+
|
|
7
|
+
Additional Use Grant:
|
|
8
|
+
|
|
9
|
+
You may copy, modify, and use this software for any non-production purpose and for internal business use. You may not offer this software, or a substantially similar service, to third parties as a hosted or managed commercial service without prior written permission from the Licensor.
|
|
10
|
+
|
|
11
|
+
Change Date:
|
|
12
|
+
|
|
13
|
+
January 1, 2030
|
|
14
|
+
|
|
15
|
+
Change License:
|
|
16
|
+
|
|
17
|
+
Apache License, Version 2.0
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Business Source License 1.1
|
|
22
|
+
|
|
23
|
+
Copyright (c) 2026 Ali Ezatyar Ahmadyar
|
|
24
|
+
|
|
25
|
+
Licensed under the Business Source License 1.1 (the "License"); you may not use this file except in compliance with the License.
|
|
26
|
+
|
|
27
|
+
You may obtain a copy of the License at:
|
|
28
|
+
|
|
29
|
+
https://mariadb.com/bsl11/
|
|
30
|
+
|
|
31
|
+
The Licensor hereby grants you the right to copy, modify, create derivative works, redistribute, and make non-production use of the Licensed Work, subject to the terms of the Business Source License 1.1 and the Additional Use Grant above.
|
|
32
|
+
|
|
33
|
+
On the Change Date, or the fourth anniversary of the first public distribution of this Licensed Work, whichever comes first, this License will automatically convert to the Change License specified above.
|
|
34
|
+
|
|
35
|
+
THE LICENSED WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, AND NON-INFRINGEMENT. IN NO EVENT SHALL THE LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY ARISING FROM THE LICENSED WORK.
|
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: token-limit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Usage metering and cost enforcement per tenant for LLM applications.
|
|
5
|
+
License: BUSL-1.1
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: llm,openai,anthropic,google-ai,token-metering,cost-tracking
|
|
8
|
+
Author: Ali Ezatyar Ahmadyar
|
|
9
|
+
Author-email: aliezatyar@gmail.com
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: Other/Proprietary License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Classifier: Topic :: Internet
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
22
|
+
Project-URL: Homepage, https://github.com/AliEzatyar/token-limit
|
|
23
|
+
Project-URL: Repository, https://github.com/AliEzatyar/token-limit
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# token-limit
|
|
27
|
+
|
|
28
|
+
Usage metering and cost enforcement for LLM calls, built for multi-tenant B2B applications.
|
|
29
|
+
One call instruments every OpenAI, Anthropic, Google AI, DeepSeek, and OpenRouter request — no changes required in your LLM call sites.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## How it works
|
|
34
|
+
|
|
35
|
+
`token-limit` monkey-patches the official provider SDKs at startup. Every LLM call your application makes is automatically intercepted, token usage is extracted from the response, and a lightweight event is queued and batched to your backend ingest endpoint in the background. Your LLM calls are never blocked or slowed down.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
Your code → [patched SDK] → LLM provider
|
|
39
|
+
↓
|
|
40
|
+
LLMEvent captured
|
|
41
|
+
↓
|
|
42
|
+
EventQueue (in-memory, daemon thread)
|
|
43
|
+
↓ (every 5s or 50 events)
|
|
44
|
+
POST /v1/ingest → Your backend → Dashboard
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
### 1. Initialize once at application startup
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from token_limit import Meter, MeterConfig
|
|
53
|
+
|
|
54
|
+
meter = Meter(MeterConfig(
|
|
55
|
+
api_key="sk-...",
|
|
56
|
+
url="https://api.yoursaas.com/v1/ingest",
|
|
57
|
+
))
|
|
58
|
+
|
|
59
|
+
meter.patch_all() # patches OpenAI, Anthropic, Google, DeepSeek — all at once
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 2. Tag requests per tenant
|
|
63
|
+
|
|
64
|
+
Use the context manager to scope a block of LLM calls to a tenant. Thread-safe and async-safe via `contextvars` — concurrent requests with different tenants are fully isolated.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
with meter.for_tenant("acme-corp"):
|
|
68
|
+
try:
|
|
69
|
+
response = client.chat.completions.create(
|
|
70
|
+
model="gpt-4o-mini",
|
|
71
|
+
messages=[{"role": "user", "content": "Hello"}],
|
|
72
|
+
)
|
|
73
|
+
except LimitExceededException:
|
|
74
|
+
show_upgrade_message()
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
For middleware or request handlers where a context manager isn't convenient:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
meter.set_tenant(request.tenant_id) # sets for current thread/async task
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Supported providers
|
|
86
|
+
|
|
87
|
+
| Provider | What gets patched | Tokens captured |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| **OpenAI** | `chat.completions.create`, `responses.create`, `completions.create` (legacy), `embeddings.create`, `audio.transcriptions.create`, `audio.translations.create`, `audio.speech.create`, `images.generate`, `images.edit` — all sync + async | input, output, cached, plus endpoint-specific fields (character count, image dimensions, audio duration) |
|
|
90
|
+
| **Anthropic** | `messages.Messages.create` (sync + async) | input, output, cached (cache read), cache_creation |
|
|
91
|
+
| **Google AI** | `Models.generate_content`, `Models.generate_content_stream`, `AsyncModels.generate_content`, `AsyncModels.generate_content_stream` — all via `google.genai` | input, output, total, cached |
|
|
92
|
+
| **DeepSeek** | `chat.completions.create`, `fim.completions.create`, `beta.chat.completions.create` — sync + async; covers both first-party `deepseek` SDK and `openai` client pointed at `api.deepseek.com` | input, output, cached (cache hit), cache_miss, reasoning (deepseek-reasoner) |
|
|
93
|
+
| **OpenRouter** | `chat.completions.create` on registered client instances — sync + async, streaming and non-streaming | input, output, cached, cost_usd (when billing header enabled), upstream_provider |
|
|
94
|
+
|
|
95
|
+
All patches are installed/uninstalled cleanly — original methods are always restored on `unpatch_all()` or process exit.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Cost enforcement
|
|
100
|
+
|
|
101
|
+
Spend limits are configured per tenant in USD and enforced on every intercepted LLM call.
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
# Per-month limit (default)
|
|
105
|
+
meter.set_limit("tenant-id-456", limit_usd=50.00)
|
|
106
|
+
|
|
107
|
+
# Per-day limit
|
|
108
|
+
meter.set_limit("tenant-id-456", limit_usd=5.00, frequency="per_day")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
When a tenant reaches its configured limit, the next intercepted LLM call raises `LimitExceededException` before any API traffic is sent. Handle it and show an upgrade prompt:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
with meter.for_tenant("acme-corp"):
|
|
115
|
+
try:
|
|
116
|
+
response = client.chat.completions.create(...)
|
|
117
|
+
except LimitExceededException:
|
|
118
|
+
return {"detail": "upgrade_plan"}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
`set_limit()` immediately invalidates the local cache for that tenant so the new threshold is honored on the very next call, without waiting for the TTL to expire.
|
|
122
|
+
|
|
123
|
+
**Limit check caching.** `check_limit()` and `async_check_limit()` are called on every patched SDK call. Results are cached per tenant for `limit_check_cache_ttl` seconds (default 5 s) to avoid a network round-trip on every LLM call. A local token trip-wire also catches runaway bursts within the TTL window without waiting for the next backend sync.
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## OpenAI patch details
|
|
128
|
+
|
|
129
|
+
Patches SDK methods at the **class level**, so every `openai.OpenAI` / `openai.AsyncOpenAI` client created before or after patching is automatically covered.
|
|
130
|
+
|
|
131
|
+
### Patched surfaces
|
|
132
|
+
|
|
133
|
+
**`chat.completions.create`** (sync + async)
|
|
134
|
+
ChatCompletions for all models. Handles `stream=True` transparently: forces `stream_options={"include_usage": True}` so the final chunk carries a usage summary, then proxies the iterator to the caller while capturing that summary in a `finally` block.
|
|
135
|
+
Fields: `input_tokens`, `output_tokens`, `total_tokens`, `cached_tokens`, `request_id`, `model`, `stream`, `duration_ms`.
|
|
136
|
+
|
|
137
|
+
**`responses.create`** (sync + async, openai >= 1.30)
|
|
138
|
+
Responses API. Also captures five image-billing dimensions when the `image_generation` tool is active, and `image_count` for audit.
|
|
139
|
+
Fields: `input_tokens`, `output_tokens`, `total_tokens`, `cached_tokens`, image billing dimensions, `image_count`.
|
|
140
|
+
|
|
141
|
+
**`completions.create`** (legacy `/v1/completions`, sync + async)
|
|
142
|
+
Legacy text-completion endpoint for models such as `gpt-3.5-turbo-instruct`. Streaming handled identically to chat completions.
|
|
143
|
+
Fields: `input_tokens`, `output_tokens`, `total_tokens`, `model`, `stream`, `duration_ms`.
|
|
144
|
+
|
|
145
|
+
**`embeddings.create`** (sync + async)
|
|
146
|
+
Text-embedding endpoint (`text-embedding-3-*`, `ada-002`, etc.). `output_tokens` is always 0.
|
|
147
|
+
Fields: `input_tokens`, `output_tokens` (0), `total_tokens`, `model`, `duration_ms`.
|
|
148
|
+
|
|
149
|
+
**`audio.transcriptions.create`** (sync + async)
|
|
150
|
+
Whisper STT. Two billing modes handled automatically:
|
|
151
|
+
- Per-minute models (`whisper-1`): reads `response.duration` (requires `response_format="verbose_json"`; emits a warning if omitted).
|
|
152
|
+
- Per-token models (`gpt-4o-transcribe`, `gpt-4o-mini-transcribe`): reads `usage.input_tokens` / `usage.output_tokens`.
|
|
153
|
+
|
|
154
|
+
Fields: `input_tokens`, `output_tokens`, `audio_input_tokens`, `audio_output_tokens`, `duration_seconds`, `duration_unavailable`, `model`, `duration_ms`.
|
|
155
|
+
|
|
156
|
+
**`audio.translations.create`** (sync + async)
|
|
157
|
+
Whisper translation. Identical billing logic to `audio.transcriptions`; endpoint tag differs.
|
|
158
|
+
|
|
159
|
+
**`audio.speech.create`** (TTS, sync + async)
|
|
160
|
+
Two billing modes:
|
|
161
|
+
- Per-character models (`tts-1`, `tts-1-hd`): no `usage` object; `character_count` derived from the caller's `input` kwarg.
|
|
162
|
+
- Per-token models (`gpt-4o-mini-tts`): reads `usage.input_tokens` / `usage.output_tokens`; sets `character_count=0` to prevent double-billing.
|
|
163
|
+
|
|
164
|
+
Fields: `input_tokens`, `output_tokens`, `character_count`, `model`, `duration_ms`.
|
|
165
|
+
|
|
166
|
+
**`images.generate`** and **`images.edit`** (sync + async)
|
|
167
|
+
Image generation and editing for `gpt-image-*` models. Captures five token billing dimensions from `usage.input_tokens_details` and `usage.output_tokens`, plus `image_count`.
|
|
168
|
+
Fields: `input_text_tokens`, `cached_input_text_tokens`, `input_image_tokens`, `cached_input_image_tokens`, `output_image_tokens`, `total_tokens`, `image_count`.
|
|
169
|
+
|
|
170
|
+
### Not patched (OpenAI)
|
|
171
|
+
|
|
172
|
+
- `moderations.create` — free endpoint, no per-token cost.
|
|
173
|
+
- `fine_tuning.jobs.*` — billed on a separate training rate; not real-time.
|
|
174
|
+
- `beta.assistants.*` / `beta.threads.*` / `beta.runs.*` — usage only available after async run completion; not yet supported.
|
|
175
|
+
- `uploads.*` / `beta.vector_stores` / `files` — storage-billed, not token-billed.
|
|
176
|
+
- `realtime.*` — persistent WebSocket; no discrete `.create()` to wrap.
|
|
177
|
+
- `audio.transcriptions.create` with `stream=True` — streaming transcription path not yet captured.
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Anthropic patch details
|
|
182
|
+
|
|
183
|
+
Patches `Messages.create` and `AsyncMessages.create` at the **class level**, so all `anthropic.Anthropic` / `anthropic.AsyncAnthropic` clients created before or after patching are automatically covered.
|
|
184
|
+
|
|
185
|
+
### Patched surfaces
|
|
186
|
+
|
|
187
|
+
**`messages.Messages.create`** (sync) and **`messages.AsyncMessages.create`** (async)
|
|
188
|
+
Claude chat/completion for all `claude-*` models. Both `stream=False` (default) and `stream=True` are handled. For streaming, a helper proxies the iterator unchanged while accumulating usage across events (`message_start` → input tokens; `message_delta` → output + cache tokens).
|
|
189
|
+
Fields: `input_tokens`, `output_tokens`, `total_tokens`, `cached_tokens` (cache read hits), `cache_creation_tokens` (cache write), `request_id`, `model`, `stream`, `duration_ms`, `tenant_id`, `error`, `input_tokens_details` (SDK >= 0.26, model-dependent).
|
|
190
|
+
|
|
191
|
+
### Not patched (Anthropic)
|
|
192
|
+
|
|
193
|
+
- `beta.messages.batches.*` — asynchronous batch completion; results fetched separately from submission. Not yet supported.
|
|
194
|
+
- Embeddings — Anthropic does not offer a text-embedding API.
|
|
195
|
+
- Audio/TTS — Anthropic does not offer speech endpoints.
|
|
196
|
+
- Image generation — Claude is vision-input only; image tokens are already counted inside `input_tokens`.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Google AI patch details
|
|
201
|
+
|
|
202
|
+
Patches four methods at the **class level** on `google.genai.models.Models` and `google.genai.models.AsyncModels`. Unlike OpenAI, the `google.genai` SDK exposes streaming as a **separate method** rather than a `stream=True` flag.
|
|
203
|
+
|
|
204
|
+
### Patched surfaces
|
|
205
|
+
|
|
206
|
+
**`Models.generate_content`** (sync, non-streaming)
|
|
207
|
+
Usage read from `response.usage_metadata` directly after the call returns.
|
|
208
|
+
|
|
209
|
+
**`Models.generate_content_stream`** (sync, streaming)
|
|
210
|
+
Returns a synchronous iterator of `GenerateContentResponse` chunks. Usage is only present on the last chunk; the helper tracks `last_chunk` across the full iteration and reads its `usage_metadata` in a `finally` block.
|
|
211
|
+
|
|
212
|
+
**`AsyncModels.generate_content`** (async, non-streaming)
|
|
213
|
+
Awaits `meter.async_check_limit()` to avoid blocking the event loop.
|
|
214
|
+
|
|
215
|
+
**`AsyncModels.generate_content_stream`** (async, streaming)
|
|
216
|
+
Handles both coroutine-returning and direct async-iterator forms via `inspect.isawaitable`.
|
|
217
|
+
|
|
218
|
+
Fields (all four surfaces): `input_tokens` (`prompt_token_count`), `output_tokens` (`candidates_token_count`, includes thinking tokens on the direct Gemini API), `total_tokens` (read from response, not derived), `cached_tokens` (`cached_content_token_count`), `stream`, `request_id`, `duration_ms`, `tenant_id`, `error`.
|
|
219
|
+
|
|
220
|
+
### Not patched (Google AI)
|
|
221
|
+
|
|
222
|
+
- Vertex AI SDK (`google.cloud.aiplatform`) — separate SDK, not yet supported.
|
|
223
|
+
- `models.embed_content` / `models.embed_content_batch` — not yet supported.
|
|
224
|
+
- `models.generate_images` / `models.upscale_image` — billed per image, not per token.
|
|
225
|
+
- `models.generate_videos` — billed per second of output, not yet supported.
|
|
226
|
+
- `live.*` — WebSocket-based session; no discrete call to wrap.
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## DeepSeek patch details
|
|
231
|
+
|
|
232
|
+
Covers **both** integration paths: the first-party `deepseek` package and an `openai` client pointed at `api.deepseek.com`. Both paths are attempted independently — a failure in one does not prevent the other from being installed.
|
|
233
|
+
|
|
234
|
+
### Patched surfaces
|
|
235
|
+
|
|
236
|
+
**`chat.completions.create`** (sync + async)
|
|
237
|
+
Standard chat completions. Streaming handled identically to OpenAI: forces `stream_options={"include_usage": True}` and captures usage from the final chunk.
|
|
238
|
+
|
|
239
|
+
**`fim.completions.create`** (sync + async)
|
|
240
|
+
DeepSeek-specific fill-in-middle (FIM) endpoint. Records `fim_prefix` (from `kwargs["prompt"]` or `kwargs["prefix"]`) and `fim_suffix` alongside standard token counts.
|
|
241
|
+
|
|
242
|
+
**`beta.chat.completions.create`** (sync + async)
|
|
243
|
+
Beta chat namespace alias present in SDK >= 1.x; uses the same extractor as the main chat surface.
|
|
244
|
+
|
|
245
|
+
### DeepSeek-specific fields
|
|
246
|
+
|
|
247
|
+
| Event field | Source |
|
|
248
|
+
|---|---|
|
|
249
|
+
| `cached_tokens` | `usage.prompt_cache_hit_tokens` |
|
|
250
|
+
| `cache_miss_tokens` | `usage.prompt_cache_miss_tokens` |
|
|
251
|
+
| `reasoning_tokens` | `usage.completion_tokens_details.reasoning_tokens` (deepseek-reasoner only) |
|
|
252
|
+
| `fim_prefix` | `kwargs["prompt"]` or `kwargs["prefix"]` |
|
|
253
|
+
| `fim_suffix` | `kwargs["suffix"]` |
|
|
254
|
+
|
|
255
|
+
### Not patched (DeepSeek)
|
|
256
|
+
|
|
257
|
+
- `models.list` — metadata endpoint, no token cost.
|
|
258
|
+
- `files.*` — file upload/management, not billed per token.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## OpenRouter patch details
|
|
263
|
+
|
|
264
|
+
OpenRouter exposes an OpenAI-compatible REST API, so developers typically point a standard `openai.OpenAI` (or `AsyncOpenAI`) client at `https://openrouter.ai/api/v1`. Unlike the other providers, **OpenRouter is patched at the instance level** rather than the class level — only the specific client instances you register are instrumented, leaving any other OpenAI clients untouched.
|
|
265
|
+
|
|
266
|
+
### Registration
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
# Pattern 1 — sync factory (recommended)
|
|
270
|
+
client = meter.openrouter_client(api_key="sk-or-v1-...")
|
|
271
|
+
|
|
272
|
+
# Pattern 2 — async factory
|
|
273
|
+
client = meter.async_openrouter_client(api_key="sk-or-v1-...")
|
|
274
|
+
|
|
275
|
+
# Pattern 3 — register an existing client
|
|
276
|
+
meter.register_openrouter_client(existing_client)
|
|
277
|
+
|
|
278
|
+
# Pattern 4 — fully manual
|
|
279
|
+
meter.track_manually(provider="openrouter", model="...", input_tokens=..., output_tokens=...)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### Patched surfaces
|
|
283
|
+
|
|
284
|
+
**`chat.completions.create`** (sync + async, on registered instances only)
|
|
285
|
+
Streaming handled identically to OpenAI: `stream_options={"include_usage": True}` is injected automatically so the final chunk carries usage. The wrapper is installed directly on `client.chat.completions.create` and is guarded against double-patching.
|
|
286
|
+
|
|
287
|
+
### OpenRouter-specific fields
|
|
288
|
+
|
|
289
|
+
| Event field | Source |
|
|
290
|
+
|---|---|
|
|
291
|
+
| `upstream_provider` | Portion before `/` in the model string, e.g. `"anthropic"` from `"anthropic/claude-3-5-sonnet"` |
|
|
292
|
+
| `cost_usd` | `usage.cost` — actual USD cost when the caller passes `X-Or-Billing: true` |
|
|
293
|
+
|
|
294
|
+
Fields (all calls): `input_tokens` (`usage.prompt_tokens`), `output_tokens` (`usage.completion_tokens`), `total_tokens`, `cached_tokens` (`usage.prompt_tokens_details.cached_tokens`), `cost_usd`, `upstream_provider`, `model`, `stream`, `request_id`, `duration_ms`, `tenant_id`.
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
## Configuration reference
|
|
299
|
+
|
|
300
|
+
All configuration lives in `MeterConfig`, passed once at startup:
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
from token_limit import Meter, MeterConfig
|
|
304
|
+
|
|
305
|
+
meter = Meter(MeterConfig(
|
|
306
|
+
# Required
|
|
307
|
+
api_key="your-api-key", # authenticates event ingest and limit checks
|
|
308
|
+
url="https://...", # POST endpoint that receives event batches
|
|
309
|
+
|
|
310
|
+
# Batching — tune for your traffic volume
|
|
311
|
+
flush_interval=5.0, # seconds between background flushes
|
|
312
|
+
max_batch_size=50, # flush early when queue reaches this size
|
|
313
|
+
max_queue_size=1000, # drop oldest events if queue overflows
|
|
314
|
+
|
|
315
|
+
# Limit checks
|
|
316
|
+
limit_check_cache_ttl=5.0, # seconds a check_limit() result is cached per tenant
|
|
317
|
+
|
|
318
|
+
# Behaviour
|
|
319
|
+
debug=False, # log every captured event to stdout
|
|
320
|
+
raise_on_error=False, # re-raise exceptions from within patches
|
|
321
|
+
|
|
322
|
+
# Hooks
|
|
323
|
+
on_event=None, # Callable[[LLMEvent], None] — called after every capture
|
|
324
|
+
on_flush_error=None, # Callable[[Exception], None] — called on send failure
|
|
325
|
+
|
|
326
|
+
# Which providers to patch (default = all four built-ins)
|
|
327
|
+
patches=["openai", "anthropic", "google", "deepseek"],
|
|
328
|
+
))
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## Event shape
|
|
334
|
+
|
|
335
|
+
Every captured call produces an `LLMEvent`. Fields are sourced directly from `types.py`:
|
|
336
|
+
|
|
337
|
+
```python
|
|
338
|
+
@dataclass
|
|
339
|
+
class LLMEvent:
|
|
340
|
+
# identity
|
|
341
|
+
event_id: str # UUID, auto-generated
|
|
342
|
+
tenant_id: str # set via for_tenant() or set_tenant()
|
|
343
|
+
session_id: Optional[str]
|
|
344
|
+
|
|
345
|
+
# provider / model
|
|
346
|
+
provider: str # "openai" | "anthropic" | "google" | "deepseek" | "openrouter"
|
|
347
|
+
model: str # e.g. "gpt-4o", "claude-3-5-sonnet-20241022"
|
|
348
|
+
endpoint: str # e.g. "chat.completions", "messages", "fim.completions"
|
|
349
|
+
|
|
350
|
+
# text token usage
|
|
351
|
+
input_tokens: int
|
|
352
|
+
output_tokens: int
|
|
353
|
+
total_tokens: int
|
|
354
|
+
cached_tokens: int # OpenAI: sub-field of input_tokens
|
|
355
|
+
# Anthropic/Google: separate pool, not in input_tokens
|
|
356
|
+
|
|
357
|
+
# latency
|
|
358
|
+
duration_ms: float # wall-clock time of the LLM call
|
|
359
|
+
timestamp: float # unix timestamp of capture
|
|
360
|
+
|
|
361
|
+
# request metadata
|
|
362
|
+
request_id: Optional[str] # x-request-id from provider response headers
|
|
363
|
+
stream: bool
|
|
364
|
+
error: Optional[str] # set if the LLM call raised an exception
|
|
365
|
+
|
|
366
|
+
# audio (transcription / translation)
|
|
367
|
+
duration_seconds: Optional[float] # per-minute path (whisper-1, verbose_json only)
|
|
368
|
+
audio_input_tokens: Optional[int] # per-token path (gpt-4o-transcribe etc.)
|
|
369
|
+
audio_output_tokens: Optional[int]
|
|
370
|
+
|
|
371
|
+
# speech / TTS
|
|
372
|
+
character_count: Optional[int] # per-character path (tts-1, tts-1-hd)
|
|
373
|
+
|
|
374
|
+
# images (gpt-image-* models)
|
|
375
|
+
input_text_tokens: Optional[int] # text prompt tokens
|
|
376
|
+
cached_input_text_tokens: Optional[int]
|
|
377
|
+
input_image_tokens: Optional[int] # reference-image tokens (edit only)
|
|
378
|
+
cached_input_image_tokens: Optional[int]
|
|
379
|
+
output_image_tokens: Optional[int] # generated-image tokens
|
|
380
|
+
image_count: Optional[int] # number of images returned (audit only)
|
|
381
|
+
|
|
382
|
+
# extras
|
|
383
|
+
tags: dict # arbitrary metadata you can attach
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
All `Optional` fields are omitted from `to_dict()` when `None`, keeping ingest payloads lean.
|
|
387
|
+
|
|
388
|
+
---
|
|
389
|
+
|
|
390
|
+
## Advanced usage
|
|
391
|
+
|
|
392
|
+
### Selective patching
|
|
393
|
+
|
|
394
|
+
```python
|
|
395
|
+
# Specify providers in MeterConfig
|
|
396
|
+
meter = Meter(MeterConfig(
|
|
397
|
+
api_key="...",
|
|
398
|
+
url="...",
|
|
399
|
+
patches=["openai", "anthropic"], # skip google and deepseek
|
|
400
|
+
))
|
|
401
|
+
meter.patch_all()
|
|
402
|
+
|
|
403
|
+
# Or patch / unpatch one provider at a time
|
|
404
|
+
meter.patch("deepseek")
|
|
405
|
+
meter.unpatch("deepseek")
|
|
406
|
+
|
|
407
|
+
# Unpatch everything and restore original SDK methods
|
|
408
|
+
meter.unpatch_all()
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
### Use as a context manager
|
|
412
|
+
|
|
413
|
+
`Meter` supports the context manager protocol — `__exit__` calls `unpatch_all()` and shuts down the background flush queue automatically:
|
|
414
|
+
|
|
415
|
+
```python
|
|
416
|
+
with Meter(MeterConfig(api_key="...", url="...")).patch_all() as meter:
|
|
417
|
+
with meter.for_tenant("acme-corp"):
|
|
418
|
+
client.chat.completions.create(...)
|
|
419
|
+
# all patches restored, queue flushed on exit
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
### Manual tracking
|
|
423
|
+
|
|
424
|
+
For providers not yet patched, or custom logic:
|
|
425
|
+
|
|
426
|
+
```python
|
|
427
|
+
meter.track_manually(
|
|
428
|
+
provider="cohere",
|
|
429
|
+
model="command-r-plus",
|
|
430
|
+
input_tokens=512,
|
|
431
|
+
output_tokens=128,
|
|
432
|
+
tenant_id="acme-corp",
|
|
433
|
+
)
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
Any extra keyword arguments are passed through as event fields (unknown fields are dropped with a debug log when `debug=True`).
|
|
437
|
+
|
|
438
|
+
### Event hook — real-time logging or custom logic
|
|
439
|
+
|
|
440
|
+
```python
|
|
441
|
+
def my_hook(event: LLMEvent) -> None:
|
|
442
|
+
print(f"[{event.tenant_id}] {event.model}: {event.total_tokens} tokens")
|
|
443
|
+
|
|
444
|
+
meter = Meter(MeterConfig(
|
|
445
|
+
api_key="...",
|
|
446
|
+
url="...",
|
|
447
|
+
on_event=my_hook,
|
|
448
|
+
))
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
### Flush error handling
|
|
452
|
+
|
|
453
|
+
```python
|
|
454
|
+
def on_flush_error(exc: Exception) -> None:
|
|
455
|
+
sentry_sdk.capture_exception(exc)
|
|
456
|
+
|
|
457
|
+
meter = Meter(MeterConfig(
|
|
458
|
+
api_key="...",
|
|
459
|
+
url="...",
|
|
460
|
+
on_flush_error=on_flush_error,
|
|
461
|
+
))
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
### Force a flush
|
|
465
|
+
|
|
466
|
+
```python
|
|
467
|
+
# Useful at the end of a batch job or CLI script
|
|
468
|
+
meter._queue.flush_now()
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
---
|
|
472
|
+
|
|
473
|
+
## Adding a custom provider patch
|
|
474
|
+
|
|
475
|
+
All provider patches inherit from `BasePatch`. Implement `_install` and an extractor function, then register in `PATCH_REGISTRY`:
|
|
476
|
+
|
|
477
|
+
```python
|
|
478
|
+
from token_limit.patches._base_patch import BasePatch
|
|
479
|
+
from token_limit.patches import PATCH_REGISTRY
|
|
480
|
+
|
|
481
|
+
def _extract(response, args, kwargs, error):
|
|
482
|
+
return {
|
|
483
|
+
"provider": "cohere",
|
|
484
|
+
"endpoint": "chat",
|
|
485
|
+
"model": kwargs.get("model", ""),
|
|
486
|
+
"input_tokens": getattr(response, "meta", {}).get("billed_units", {}).get("input_tokens", 0),
|
|
487
|
+
"output_tokens": getattr(response, "meta", {}).get("billed_units", {}).get("output_tokens", 0),
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
class CoherePatch(BasePatch):
|
|
491
|
+
name = "cohere"
|
|
492
|
+
|
|
493
|
+
def _install(self):
|
|
494
|
+
import cohere
|
|
495
|
+
self._swap(
|
|
496
|
+
cohere.Client, "chat",
|
|
497
|
+
self._make_sync_wrapper(cohere.Client.chat, _extract),
|
|
498
|
+
)
|
|
499
|
+
|
|
500
|
+
PATCH_REGISTRY["cohere"] = CoherePatch
|
|
501
|
+
|
|
502
|
+
meter.patch("cohere")
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
---
|
|
506
|
+
|
|
507
|
+
## Project structure
|
|
508
|
+
|
|
509
|
+
```
|
|
510
|
+
token_limit/
|
|
511
|
+
├── __init__.py ← public API: Meter, MeterConfig, LLMEvent
|
|
512
|
+
├── meter.py ← Meter class (patch_all, for_tenant, set_limit)
|
|
513
|
+
├── config.py ← MeterConfig dataclass
|
|
514
|
+
├── types.py ← LLMEvent dataclass
|
|
515
|
+
├── exceptions.py ← LimitExceededException exception class
|
|
516
|
+
├── patches/
|
|
517
|
+
│ ├── _base_patch.py ← BasePatch ABC + sync/async wrapper factories
|
|
518
|
+
│ ├── openai_patch.py
|
|
519
|
+
│ ├── anthropic_patch.py
|
|
520
|
+
│ ├── google_patch.py
|
|
521
|
+
│ ├── deepseek_patch.py
|
|
522
|
+
│ └── openrouter_patch.py
|
|
523
|
+
└── transport/
|
|
524
|
+
├── queue.py ← thread-safe EventQueue with background flush
|
|
525
|
+
└── http_client.py ← gzip POST, auto-selects httpx/requests/urllib
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
---
|
|
529
|
+
|
|
530
|
+
## License
|
|
531
|
+
|
|
532
|
+
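Business Source License 1.1 (BUSL-1.1) — see the `LICENSE` file for the full terms, including the Additional Use Grant. The license converts to the Apache License, Version 2.0 on the Change Date specified there.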
|