infermesh 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
infermesh/__init__.py ADDED
@@ -0,0 +1,147 @@
1
+ """infermesh — a researcher-first batching client built on LiteLLM.
2
+
3
+ `infermesh` wraps [LiteLLM](https://docs.litellm.ai) with the workflow pieces
4
+ that tend to show up once an experiment stops being "one request in, one string
5
+ out": concurrent batch execution, notebook-safe sync calls, partial-failure
6
+ handling, client-side throttling, and optional multi-replica routing. Public
7
+ results are typed in both synchronous and asynchronous Python. Multimodal (VLM)
8
+ inputs are supported via the standard OpenAI content-block format; use
9
+ [image_block][infermesh.image_block] to encode local image files or raw bytes
10
+ before sending. It supports two operating modes:
11
+
12
+ **Single-endpoint mode** — one model, one server:
13
+
14
+ ```python
15
+ from infermesh import LMClient
16
+
17
+ with LMClient(
18
+ model="openai/gpt-4o-mini", api_base="http://localhost:8000/v1"
19
+ ) as client:
20
+ result = client.generate("What is 2 + 2?")
21
+ print(result.output_text) # "4"
22
+ ```
23
+
24
+ **Router mode** — multiple replicas with automatic load-balancing:
25
+
26
+ ```python
27
+ from infermesh import LMClient, DeploymentConfig
28
+
29
+ client = LMClient(
30
+ deployments={
31
+ "gpu-0": DeploymentConfig(
32
+ model="hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
33
+ api_base="http://gpu0:8000/v1",
34
+ ),
35
+ "gpu-1": DeploymentConfig(
36
+ model="hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
37
+ api_base="http://gpu1:8000/v1",
38
+ ),
39
+ }
40
+ )
41
+ ```
42
+
43
+ **Rate limiting** — pass `rpm` and/or `tpm` to enable automatic throttling:
44
+
45
+ ```python
46
+ client = LMClient(
47
+ model="openai/gpt-4o",
48
+ api_base="https://api.openai.com/v1",
49
+ rpm=500,
50
+ tpm=100_000,
51
+ )
52
+ ```
53
+
54
+ When targeting hosted providers, export the relevant provider environment
55
+ variable (for example `OPENAI_API_KEY`) before constructing the client.
56
+ Advanced library integrations can still pass `api_key` directly when the
57
+ secret comes from a secret manager or another in-process credential source.
58
+
59
+ If you only need a small number of single requests, plain LiteLLM or the
60
+ provider SDK is usually simpler. `infermesh` is most useful when you want to
61
+ push larger workloads through a notebook, script, or local inference stack.
62
+
63
+ **Async usage** — every public method has an `a`-prefixed async counterpart:
64
+
65
+ ```python
66
+ import asyncio
67
+ from infermesh import LMClient
68
+
69
+
70
+ async def main() -> None:
71
+ async with LMClient(...) as client:
72
+ results = await client.agenerate_batch(["prompt 1", "prompt 2"])
73
+ for r in results:
74
+ if r is not None:
75
+ print(r.output_text)
76
+
77
+
78
+ asyncio.run(main())
79
+ ```
80
+
81
+ **Public symbols:**
82
+
83
+ - [LMClient][infermesh.LMClient] — main client; generation, embedding, and
84
+ transcription in both sync and async forms, with optional rate limiting and
85
+ router mode.
86
+ - [DeploymentConfig][infermesh.DeploymentConfig] — per-replica configuration
87
+ used in router mode.
88
+ - [BatchResult][infermesh.BatchResult] — generic container returned by
89
+ `*_batch` methods.
90
+ - [GenerationResult][infermesh.GenerationResult] — typed result from a
91
+ text-generation call.
92
+ - [EmbeddingResult][infermesh.EmbeddingResult] — typed result from an
93
+ embedding call.
94
+ - [TranscriptionResult][infermesh.TranscriptionResult] — typed result from an
95
+ audio-transcription call.
96
+ - [RateLimiter][infermesh.RateLimiter] — async token-bucket rate limiter;
97
+ created automatically by [LMClient][infermesh.LMClient] when `rpm` / `tpm`
98
+ are supplied, but can also be used standalone.
99
+ - [RateLimiterAcquisitionHandle][infermesh.RateLimiterAcquisitionHandle] —
100
+ opaque handle returned by [acquire][infermesh.RateLimiter.acquire]; passed
101
+ back to [adjust][infermesh.RateLimiter.adjust] after the request completes.
102
+ - [TokenUsage][infermesh.TokenUsage] — token-count breakdown attached to
103
+ generation and embedding results.
104
+ - [RequestMetrics][infermesh.RequestMetrics] — per-request timing and routing
105
+ metadata.
106
+ - [ToolCall][infermesh.ToolCall] — a structured tool-call emitted by a model
107
+ during generation.
108
+ - [image_block][infermesh.image_block] — build an image content block from a
109
+ local file, raw bytes, or URL for multimodal (VLM) chat messages.
110
+ """
111
+
112
+ from importlib.metadata import PackageNotFoundError, version
113
+
114
+ try:  # resolve the package version from installed distribution metadata
115
+ __version__ = version("infermesh")  # importlib.metadata lookup by distribution name
116
+ except PackageNotFoundError:  # no installed distribution named "infermesh" was found
117
+ __version__ = "unknown"  # placeholder so the attribute always exists
118
+
119
+ from infermesh.client import LMClient
120
+ from infermesh.rate_limiter import RateLimiter, RateLimiterAcquisitionHandle
121
+ from infermesh.types import (
122
+ BatchResult,
123
+ DeploymentConfig,
124
+ EmbeddingResult,
125
+ GenerationResult,
126
+ RequestMetrics,
127
+ TokenUsage,
128
+ ToolCall,
129
+ TranscriptionResult,
130
+ image_block,
131
+ )
132
+
133
+ __all__ = [  # names exported by ``from infermesh import *``; mirrors the imports above plus ``__version__``
134
+ "__version__",
135
+ "BatchResult",
136
+ "DeploymentConfig",
137
+ "EmbeddingResult",
138
+ "GenerationResult",
139
+ "image_block",
140
+ "LMClient",
141
+ "RateLimiter",
142
+ "RateLimiterAcquisitionHandle",
143
+ "RequestMetrics",
144
+ "TokenUsage",
145
+ "ToolCall",
146
+ "TranscriptionResult",
147
+ ]
infermesh/_bucket.py ADDED
@@ -0,0 +1,359 @@
1
+ """Token-bucket implementation for rate limiting.
2
+
3
+ A `Bucket` is a fixed-capacity counter that refills at a constant rate.
4
+ Tokens are consumed when a request is dispatched and added back over time as
5
+ the rate window progresses. The bucket also supports synchronisation from
6
+ server-side `x-ratelimit-*` response headers so that the local estimate stays
7
+ aligned with the provider's authoritative view.
8
+
9
+ This module is internal to `infermesh`; external callers should interact with
10
+ [RateLimiter][infermesh.RateLimiter] instead.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import math
17
+ import time
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class Bucket:
    """A fixed-capacity token bucket that refills at a constant rate.

    The bucket starts full (at `capacity` tokens) and is drained by
    `consume_tokens`. Tokens are added continuously at a rate of
    `tokens_per_period / time_period` tokens per second. The level is
    capped at `capacity`.

    After a call to `sync_from_response_header`, the bucket adopts a
    *server-derived effective rate* computed from the remaining tokens and
    reset timestamp reported in `x-ratelimit-*` headers. This adjusted rate
    is used for up to `SYNC_RATE_VALIDITY_DURATION` seconds before
    falling back to the configured rate. Any local consumption or manual
    adjustment discards the server-derived rate immediately.

    Every method accepting ``current_time`` treats ``None`` as "read
    `time.monotonic` now"; any explicit float, including ``0.0``, is used
    as given, so the bucket can be driven by a simulated clock.

    Parameters
    ----------
    tokens_per_period : int
        The number of tokens the bucket receives every ``time_period`` seconds.
        Must be a positive integer.
    capacity : int or None, optional
        Maximum number of tokens the bucket can hold. When ``None`` (default),
        the capacity equals ``tokens_per_period``, meaning no burst above the
        base rate is permitted. Set this higher to allow short bursts.
    time_period : int, optional
        The refill window in seconds. Default is ``60`` (per-minute buckets).
        Use ``86400`` for per-day buckets.

    Raises
    ------
    ValueError
        If ``tokens_per_period``, ``capacity``, or ``time_period`` are not
        positive integers.

    Examples
    --------
    Create a 100 RPM bucket with no burst allowance:

    >>> bucket = Bucket(tokens_per_period=100, time_period=60)
    >>> bucket.capacity
    100
    >>> bucket.consume_tokens(1)  # one request
    True

    Create a 1000 TPM bucket with a 200-token burst capacity:

    >>> tpm_bucket = Bucket(tokens_per_period=1000, capacity=200, time_period=60)
    """

    SYNC_RATE_VALIDITY_DURATION = 60.0
    """Seconds for which a server-synced rate stays valid.

    After this window, the bucket reverts to its configured
    ``tokens_per_period / time_period`` rate.
    """

    def __init__(
        self,
        tokens_per_period: int,
        capacity: int | None = None,
        time_period: int = 60,
    ) -> None:
        if not isinstance(tokens_per_period, int) or tokens_per_period <= 0:
            raise ValueError(
                "Expected ``tokens_per_period`` to be a positive integer, "
                f"but got {tokens_per_period}"
            )
        if capacity is not None and (not isinstance(capacity, int) or capacity <= 0):
            raise ValueError(
                f"Expected ``capacity`` to be a positive integer, but got {capacity}"
            )
        if not isinstance(time_period, int) or time_period <= 0:
            raise ValueError(
                "Expected ``time_period`` to be a positive integer, "
                f"but got {time_period}"
            )

        self._tokens_per_period = tokens_per_period
        # An omitted capacity means "no burst": hold at most one period's worth.
        self._capacity = tokens_per_period if capacity is None else capacity
        self._time_period = time_period
        self._rate_per_second = tokens_per_period / time_period
        self._last_refill_time = time.monotonic()
        self._tokens = float(self._capacity)  # the bucket starts full
        # Server-sync state; ``_last_sync_rate is None`` means "no active sync".
        self._last_sync_time: float = 0.0
        self._server_reset_time: float | None = None
        self._last_sync_rate: float | None = None

    @staticmethod
    def _resolve_time(current_time: float | None) -> float:
        """Return ``current_time``, or the monotonic clock when it is ``None``.

        An identity check is used instead of truthiness so that an explicit
        timestamp of ``0.0`` is honoured rather than replaced by the real
        clock.
        """
        return time.monotonic() if current_time is None else current_time

    @property
    def capacity(self) -> int:
        """Maximum number of tokens the bucket can hold.

        Returns
        -------
        int
            The bucket capacity. Equal to ``tokens_per_period`` when no
            explicit capacity was provided.
        """
        return self._capacity

    @property
    def tokens_per_period(self) -> int:
        """Configured token replenishment per ``time_period``.

        Returns
        -------
        int
            The ``tokens_per_period`` value supplied at construction.
        """
        return self._tokens_per_period

    @property
    def time_period(self) -> int:
        """Refill window duration in seconds.

        Returns
        -------
        int
            The ``time_period`` value supplied at construction (e.g. ``60``
            for per-minute buckets, ``86400`` for per-day buckets).
        """
        return self._time_period

    def get_bucket_level(self, current_time: float | None = None) -> int:
        """Return the current number of available tokens (floor).

        Applies any accumulated refill since the last call before returning the
        level.

        Parameters
        ----------
        current_time : float or None, optional
            Monotonic timestamp (from `time.monotonic`). When ``None``,
            the current time is obtained automatically.

        Returns
        -------
        int
            Available tokens, floored to the nearest integer.
        """
        self._refill(self._resolve_time(current_time))
        return math.floor(self._tokens)

    def consume_tokens(
        self,
        num_tokens_needed: int,
        current_time: float | None = None,
    ) -> bool:
        """Consume tokens from the bucket if sufficient capacity exists.

        A successful consumption discards any server-synced rate and reset
        time, so subsequent refills use the configured rate.

        Parameters
        ----------
        num_tokens_needed : int
            Number of tokens to consume. If ``<= 0``, the call is a no-op
            and returns ``True``.
        current_time : float or None, optional
            Monotonic timestamp. Obtained automatically when ``None``.

        Returns
        -------
        bool
            ``True`` if the tokens were consumed, ``False`` if the bucket did
            not have enough tokens (no tokens are consumed in this case).
        """
        if num_tokens_needed <= 0:
            return True

        now = self._resolve_time(current_time)
        if self.get_bucket_level(now) < num_tokens_needed:
            return False

        self._tokens -= float(num_tokens_needed)
        # The server snapshot no longer describes the bucket once we spend
        # locally, so drop it.
        self._last_sync_rate = None
        self._server_reset_time = None
        return True

    def adjust_bucket_level(
        self,
        delta: int,
        current_time: float | None = None,
    ) -> None:
        """Add or remove tokens from the bucket by a signed delta.

        Used to correct the bucket after a request fails (refund) or to
        reconcile over- / under-estimation of token usage. The resulting level
        is clamped to ``[0, capacity]``. Any server-synced rate and reset time
        are discarded.

        Parameters
        ----------
        delta : int
            Signed token adjustment. Positive values add tokens back
            (e.g. refund after a failed request); negative values remove them.
            A delta of ``0`` is a no-op.
        current_time : float or None, optional
            Monotonic timestamp. Obtained automatically when ``None``.
        """
        if delta == 0:
            return

        # Bring the level up to date first so the delta applies to "now".
        self._refill(self._resolve_time(current_time))
        self._tokens = max(0.0, min(self.capacity, self._tokens + delta))
        self._last_sync_rate = None
        self._server_reset_time = None

    def estimate_next_refill_time(
        self,
        num_tokens_needed: int,
        current_time: float | None = None,
    ) -> float:
        """Estimate the seconds until ``num_tokens_needed`` tokens are available.

        Used by the rate limiter to schedule wake-up timers for waiting
        requests.

        Parameters
        ----------
        num_tokens_needed : int
            The number of tokens a waiting request requires.
        current_time : float or None, optional
            Monotonic timestamp. Obtained automatically when ``None``.

        Returns
        -------
        float
            Estimated seconds until the bucket will have enough tokens.
            Returns ``0.0`` if the bucket already has sufficient tokens.
            Returns ``float('inf')`` if the effective refill rate is zero and
            no future server reset time is known (a warning is logged).

        Notes
        -----
        A tiny epsilon (``1e-9``) is added to the estimate to avoid waking up
        fractionally too early and immediately re-sleeping.
        """
        now = self._resolve_time(current_time)
        shortfall = num_tokens_needed - self.get_bucket_level(now)
        if shortfall <= 0:
            return 0.0

        rate = self._get_effective_rate(now)
        if rate > 0:
            return (shortfall / rate) + 1e-9

        # Zero rate: the only thing that can help is the server window reset.
        reset_at = self._server_reset_time
        if reset_at is not None and reset_at > now:
            return (reset_at - now) + 1e-9

        logger.warning("Bucket refill rate is zero and no future reset time is known.")
        return float("inf")

    def sync_from_response_header(
        self,
        server_token_limit: int | None,
        server_tokens_remaining: int,
        server_reset_time: float,
        current_time: float | None = None,
    ) -> None:
        """Synchronise the bucket with the server's authoritative rate-limit state.

        After calling this method, the bucket's level is set to
        `server_tokens_remaining` (capped at `capacity`) and its
        effective refill rate is recalculated from the remaining tokens and the
        time until the reset window ends. The server-derived rate stays active
        for up to `SYNC_RATE_VALIDITY_DURATION` seconds.

        Arguments are validated before any state is touched, so a rejected
        call leaves the bucket exactly as it was.

        This method is called automatically by
        [adjust][infermesh.RateLimiter.adjust] when `x-ratelimit-*` response
        headers are present.

        Parameters
        ----------
        server_token_limit : int or None
            The provider's stated limit for this window (from the
            ``x-ratelimit-limit-*`` header). ``None`` if not available; the
            bucket's own capacity is used as the refill target.
        server_tokens_remaining : int
            Tokens remaining in the current window, from the
            ``x-ratelimit-remaining-*`` header. Must be non-negative.
        server_reset_time : float
            Monotonic timestamp at which the provider will reset the window,
            derived from the ``x-ratelimit-reset-*`` header. Must be in the
            future relative to ``current_time``.
        current_time : float or None, optional
            Monotonic timestamp. Obtained automatically when ``None``.

        Raises
        ------
        ValueError
            If ``server_token_limit`` is not positive, ``server_tokens_remaining``
            is negative, or ``server_reset_time`` is not in the future.
        """
        now = self._resolve_time(current_time)

        if server_token_limit is not None and server_token_limit <= 0:
            raise ValueError("Expected ``server_token_limit`` to be positive.")
        if server_tokens_remaining < 0:
            raise ValueError("Expected ``server_tokens_remaining`` to be non-negative.")
        if server_reset_time <= now:
            raise ValueError("Expected ``server_reset_time`` to be in the future.")

        # No refill is needed first: the level and the refill timestamp are
        # both overwritten with the server's view right here.
        self._tokens = float(min(self.capacity, server_tokens_remaining))
        self._last_refill_time = now
        self._server_reset_time = server_reset_time

        if server_token_limit is None:
            refill_target = self.capacity
        else:
            refill_target = min(self.capacity, server_token_limit)

        # Rate that brings the level up to the target exactly at reset time.
        seconds_until_reset = server_reset_time - now
        self._last_sync_rate = max(
            0.0, (refill_target - self._tokens) / seconds_until_reset
        )
        self._last_sync_time = now

    def _get_effective_rate(self, current_time: float) -> float:
        """Return the active refill rate (tokens per second).

        Returns the server-synced rate when it is still within its validity
        window, otherwise falls back to the configured rate.
        """
        sync_rate = self._last_sync_rate
        if sync_rate is None:
            return self._rate_per_second
        sync_age = current_time - self._last_sync_time
        if sync_age <= self.SYNC_RATE_VALIDITY_DURATION:
            return sync_rate
        return self._rate_per_second

    def _refill(self, current_time: float) -> None:
        """Add tokens proportional to elapsed time since the last refill.

        A timestamp at or before the previous refill is ignored, so the
        bucket never loses tokens or rewinds its clock.
        """
        elapsed = current_time - self._last_refill_time
        if elapsed <= 0:
            return
        refill_amount = elapsed * self._get_effective_rate(current_time)
        self._tokens = min(self.capacity, self._tokens + refill_amount)
        self._last_refill_time = current_time