infermesh 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infermesh-0.2.0/PKG-INFO +521 -0
- infermesh-0.2.0/README.md +497 -0
- infermesh-0.2.0/pyproject.toml +146 -0
- infermesh-0.2.0/src/infermesh/__init__.py +147 -0
- infermesh-0.2.0/src/infermesh/_bucket.py +359 -0
- infermesh-0.2.0/src/infermesh/_cli_bench.py +474 -0
- infermesh-0.2.0/src/infermesh/_cli_support.py +349 -0
- infermesh-0.2.0/src/infermesh/_client_runtime.py +515 -0
- infermesh-0.2.0/src/infermesh/_utils.py +615 -0
- infermesh-0.2.0/src/infermesh/cli.py +673 -0
- infermesh-0.2.0/src/infermesh/client.py +1053 -0
- infermesh-0.2.0/src/infermesh/py.typed +0 -0
- infermesh-0.2.0/src/infermesh/rate_limiter.py +641 -0
- infermesh-0.2.0/src/infermesh/sync_runner.py +149 -0
- infermesh-0.2.0/src/infermesh/types.py +684 -0
infermesh-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,521 @@

Metadata-Version: 2.3
Name: infermesh
Version: 0.2.0
Summary: Run large LLM batches from notebooks and scripts without rewriting concurrency or rate-limit glue.
Keywords: llm,litellm,async,rate-limiting,inference
Author: Franklin Ogidi
Author-email: Franklin Ogidi <franklin.ogidi@vectorinstitute.ai>
License: Apache-2.0
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Typing :: Typed
Requires-Dist: jsonschema>=4.0.0
Requires-Dist: litellm>=1.83.0
Requires-Dist: pydantic>=2.13.0
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: tqdm>=4.0.0
Requires-Python: >=3.12
Description-Content-Type: text/markdown

# infermesh

`infermesh` is for researchers and engineers who need to run large LLM jobs from notebooks, scripts, or local inference stacks without rebuilding the same concurrency and quota-control layer each time.

It sits on top of LiteLLM and focuses on the parts that usually show up once an experiment becomes real work:

- concurrent batch generation with ordered results
- notebook-safe sync APIs
- per-item failure handling for long runs
- crash-resilient batches with incremental writes and `--resume` support
- automatic retries with exponential backoff for transient errors
- client-side RPM and TPM throttling
- typed results with token usage and timing metadata
- multi-replica routing for local or clustered inference endpoints

If you only need a handful of one-off requests, use the provider SDK or plain LiteLLM. `infermesh` earns its keep when throughput control and batch ergonomics matter more than raw minimalism.

## Install

Python `3.12+` is required.

```bash
python -m pip install infermesh
```

If you use `uv`:

```bash
uv add infermesh
```

Contributor setup, editable installs, and clone-based workflows live in [CONTRIBUTING.md](https://github.com/VectorInstitute/infermesh/blob/main/CONTRIBUTING.md).

## Quick Start

Set the provider key in your environment first:

```bash
export OPENAI_API_KEY=sk-...
```

The core workflow is "run a batch, keep the results you want, inspect the failures, and retry only what broke":

```python
from infermesh import LMClient

prompts = [
    "Summarize section 1 in two bullet points.",
    "Summarize section 2 in two bullet points.",
    "Summarize section 3 in two bullet points.",
]

with LMClient(
    model="openai/gpt-4.1-mini",
    rpm=500,
    tpm=100_000,
) as client:
    batch = client.generate_batch(prompts)

retry_prompts: list[str] = []

for i, result in enumerate(batch):
    if result is None:
        print(f"FAILED: {prompts[i]}\n  {batch.errors[i]}")
        retry_prompts.append(prompts[i])
    else:
        print(result.output_text)
        if result.token_usage is not None:
            print("tokens:", result.token_usage.total_tokens)

if retry_prompts:
    with LMClient(model="openai/gpt-4.1-mini") as retry_client:
        retry_batch = retry_client.generate_batch(retry_prompts)
```

One failing request does not abort the whole batch. Failed items are `None` in `batch.results`; the exception is in `batch.errors[i]`. This is deliberate: a single provider error should not wipe out a long experiment.

This code works in Jupyter notebooks without any `asyncio` setup. The sync API runs a background event loop so you do not have to.

For a single one-off request:

```python
with LMClient(model="openai/gpt-4.1-mini") as client:
    result = client.generate("What is the capital of France?")
    print(result.output_text)
```

The `model` string follows LiteLLM's `provider/model-name` format. See the [LiteLLM model list](https://docs.litellm.ai/docs/providers) for all supported providers:

| Provider | Example |
| --- | --- |
| OpenAI | `"openai/gpt-4.1-mini"` |
| Anthropic | `"anthropic/claude-3-5-sonnet-20241022"` |
| Local vLLM | `"hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct"` |

`api_base` is optional for hosted providers — LiteLLM already knows their endpoints. Set it explicitly for local servers or custom deployments. Keep provider secrets in environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`); local servers that require no auth work without an `api_key`.
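
For a local server, that means pointing the client at an explicit endpoint. A minimal sketch, assuming `api_base` is accepted as a constructor argument the same way the CLI accepts `--api-base`:

```python
from infermesh import LMClient

# Local vLLM server exposing an OpenAI-compatible endpoint; no api_key needed
# if the server does not require auth.
with LMClient(
    model="hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
    api_base="http://localhost:8000/v1",
) as client:
    result = client.generate("Say hello in one sentence.")
    print(result.output_text)
```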

## Generate Text

```python
result = client.generate("Say hello in one sentence.")

print(result.output_text)    # generated text
print(result.token_usage)    # prompt / completion / total token counts
print(result.finish_reason)  # "stop", "length", …
print(result.request_id)     # provider-assigned ID for debugging
```

## Create Embeddings

```python
# Single string → EmbeddingResult
result = client.embed("The quick brown fox")
print(result.embedding)  # list[float]

# Multiple strings → sent in one API call
batch = client.embed_batch(["sentence one", "sentence two", "sentence three"])
vectors = [r.embedding for r in batch if r is not None]
```

## Transcribe Audio

```python
result = client.transcribe("recording.wav")  # path, bytes, or file-like object
print(result.text)
print(result.language)    # detected language code, e.g. "en"
print(result.duration_s)  # audio length in seconds
```

## CLI

```bash
# Set your key first (or use --env-file .env)
export OPENAI_API_KEY=sk-...

# Generate — single prompt
infermesh generate \
  --model openai/gpt-4.1-mini \
  --api-base https://api.openai.com/v1 \
  --prompt "Hello"

# Generate — from a JSONL file, results to another JSONL file
# Each input line: {"prompt": "..."} or {"messages": [...]} or {"responses_input": "..."}
# Output includes an _index field so interrupted runs can be resumed.
infermesh generate \
  --model openai/gpt-4.1-mini \
  --api-base https://api.openai.com/v1 \
  --input-jsonl prompts.jsonl \
  --output-jsonl results.jsonl

# Resume an interrupted run — skips already-completed rows and appends new ones
infermesh generate \
  --model openai/gpt-4.1-mini \
  --api-base https://api.openai.com/v1 \
  --input-jsonl prompts.jsonl \
  --output-jsonl results.jsonl \
  --resume

# Create embeddings
infermesh embed \
  --model text-embedding-3-small \
  --api-base https://api.openai.com/v1 \
  --text "hello world"

# Transcribe audio
infermesh transcribe --model whisper-1 \
  --api-base https://api.openai.com/v1 \
  recording.wav
```
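
For the file-based commands, each input line is one JSON object in one of the shapes listed above. A small sketch that writes a `prompts.jsonl` mixing the `prompt` and `messages` forms (the example rows are illustrative):

```python
import json

rows = [
    {"prompt": "Summarize section 1 in two bullet points."},
    {"messages": [{"role": "user", "content": "Summarize section 2 in two bullet points."}]},
]

# One JSON object per line, as expected by --input-jsonl.
with open("prompts.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```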

## Advanced Usage

<details>
<summary>Crash-resilient batches (on_result)</summary>

For long runs, pass `on_result` to write each result to disk as it arrives. A crash or interruption only loses the requests that were in-flight at that moment — everything already completed is safe on disk.

```python
import json
from infermesh import LMClient

with open("results.jsonl", "w") as out, \
        LMClient(model="openai/gpt-4.1-mini") as client:

    def save(index: int, result, error) -> None:
        row = {"index": index}
        if error is not None:
            row["error"] = str(error)
        else:
            row["output_text"] = result.output_text
        out.write(json.dumps(row) + "\n")
        out.flush()

    client.generate_batch(prompts, on_result=save)
```
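
To pick up a run after an interruption, one possible pattern (a sketch, not a library API, reusing the `prompts` list from the example above) is to read back the indices that already succeeded and re-submit only the missing prompts:

```python
import json
from infermesh import LMClient

# Indices in results.jsonl that already have a successful row.
done: set[int] = set()
with open("results.jsonl") as f:
    for line in f:
        row = json.loads(line)
        if "output_text" in row:
            done.add(row["index"])

remaining = [p for i, p in enumerate(prompts) if i not in done]
if remaining:
    with LMClient(model="openai/gpt-4.1-mini") as client:
        client.generate_batch(remaining)
```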

The CLI automates this with `--resume` — see the CLI section above and the [User Guide](docs/guide.md) for the full checkpoint/resume pattern.

</details>

<details>
<summary>Rate limiting</summary>

Pass any combination of `rpm` / `tpm` / `rpd` / `tpd` to activate the built-in rate limiter. The client queues requests automatically and respects all four limits simultaneously.

```python
client = LMClient(
    model="openai/gpt-4.1-mini",
    rpm=500,      # requests per minute
    tpm=100_000,  # tokens per minute
)
```

Find your tier's limits in the provider dashboard: for OpenAI check **Settings → Limits**; for Anthropic check **Console → Settings → Limits**.

Use `max_request_burst` / `max_token_burst` to allow short bursts above the steady-state rate (token-bucket algorithm). Use `default_output_tokens` to pre-reserve output tokens for rate-limit accounting when you don't set `max_tokens` per request.
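
A sketch of those knobs together (values are illustrative, and they are assumed to be constructor arguments like `rpm` / `tpm`):

```python
client = LMClient(
    model="openai/gpt-4.1-mini",
    rpm=500,
    tpm=100_000,
    max_request_burst=50,       # brief spikes above the steady-state request rate
    max_token_burst=10_000,     # brief spikes above the steady-state token rate
    default_output_tokens=256,  # reserved per request when max_tokens is not set
)
```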

Provider rate-limit headers (`x-ratelimit-*`) are read automatically after each response to keep the client's internal counters in sync with the server's view. Use `header_bucket_scope` to control whether headers are routed to the per-minute or per-day buckets.

CLI flags: `--rpm`, `--tpm`, `--rpd`, `--tpd`, `--max-request-burst`, `--max-token-burst`.

</details>

<details>
<summary>Multi-replica routing (vLLM / SGLang)</summary>

When you run multiple inference servers for the same model, pass a `deployments` dict to spread load across them. `model` is the logical name the router exposes; each `DeploymentConfig.model` is the backend string sent to that server.

```python
from infermesh import DeploymentConfig, LMClient

client = LMClient(
    model="llama-3-8b",
    deployments={
        "gpu-0": DeploymentConfig(
            model="hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
            api_base="http://host1:8000/v1",
        ),
        "gpu-1": DeploymentConfig(
            model="hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
            api_base="http://host2:8000/v1",
        ),
        "gpu-2": DeploymentConfig(
            model="hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
            api_base="http://host3:8000/v1",
        ),
    },
    routing_strategy="least-busy",  # or "simple-shuffle" (default), "latency-based-routing"
)

result = client.generate("Summarise this paper in one paragraph.")
print(result.metrics.deployment)  # e.g. "gpu-1"
```

`DeploymentConfig` is a plain dataclass, so it maps naturally to Hydra / OmegaConf structured config. Deployment keys (`"gpu-0"` etc.) are free-form labels.
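
Because of that, a `deployments` mapping can be built from any dict-shaped config. A minimal sketch, where the raw config layout is hypothetical (e.g. loaded with Hydra / OmegaConf or `tomllib`):

```python
from infermesh import DeploymentConfig, LMClient

raw = {
    "gpu-0": {"model": "hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
              "api_base": "http://host1:8000/v1"},
    "gpu-1": {"model": "hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct",
              "api_base": "http://host2:8000/v1"},
}

deployments = {name: DeploymentConfig(**cfg) for name, cfg in raw.items()}
client = LMClient(model="llama-3-8b", deployments=deployments)
```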

**CLI — repeated `--api-base` flags:**

```bash
infermesh generate \
  --model llama-3-8b \
  --api-base http://host1:8000/v1 \
  --api-base http://host2:8000/v1 \
  --api-base http://host3:8000/v1 \
  --prompt "Hello"
```

**CLI — TOML file for more control:**

```toml
# deployments.toml
[deployments.gpu-0]
model = "hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct"
api_base = "http://host1:8000/v1"

[deployments.gpu-1]
model = "hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct"
api_base = "http://host2:8000/v1"
```

```bash
infermesh generate \
  --model llama-3-8b \
  --deployments-toml deployments.toml \
  --prompt "Hello"
```

Keep API keys out of TOML files — use environment variables or `--env-file` instead.

</details>

<details>
<summary>Async API</summary>

All methods have async counterparts prefixed with `a`. The sync methods work in notebooks and scripts by running a background event loop thread — you don't need to manage the event loop yourself.

```python
import asyncio
from infermesh import LMClient

async def main():
    async with LMClient(model="openai/gpt-4.1-mini") as client:
        result = await client.agenerate("Hello")
        batch = await client.agenerate_batch(["prompt A", "prompt B", "prompt C"])
        emb = await client.aembed("The quick brown fox")
        embs = await client.aembed_batch(["text a", "text b"])

asyncio.run(main())
```

`async with` calls `close()` automatically. For sync code, use `with` or call `client.close()` when done.

</details>

<details>
<summary>Structured output</summary>

Pass a Pydantic model as `response_format` and the output is parsed automatically:

```python
from pydantic import BaseModel

class Answer(BaseModel):
    value: int
    confidence: float

result = client.generate(
    "What is 2 + 2? Respond in JSON.",
    response_format=Answer,
)

print(result.output_text)    # raw JSON string
print(result.output_parsed)  # Answer(value=4, confidence=0.99)
```

A plain `dict` (JSON schema) is also accepted in place of a Pydantic model; the output is returned as a plain Python object. Parse failures are logged as warnings; `output_parsed` is `None` if parsing fails.
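
A sketch of the dict form, assuming the JSON schema is passed directly as `response_format` (the exact schema envelope a given provider expects may differ):

```python
answer_schema = {
    "type": "object",
    "properties": {
        "value": {"type": "integer"},
        "confidence": {"type": "number"},
    },
    "required": ["value", "confidence"],
}

result = client.generate(
    "What is 2 + 2? Respond in JSON.",
    response_format=answer_schema,
)
print(result.output_parsed)  # plain Python object, e.g. {"value": 4, "confidence": 0.99}
```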

</details>

<details>
<summary>Automatic retries</summary>

By default, `LMClient` retries transient provider errors up to 3 times with exponential backoff. This covers 429 rate-limit spikes, 503 unavailability, 500 server errors, network failures, and timeouts.

```python
client = LMClient(
    model="openai/gpt-4.1-mini",
    max_retries=3,  # default; set to 0 to disable
)
```

Backoff formula: `min(2 ** attempt, 60)` seconds plus up to 0.5 s jitter. If the provider returns a `Retry-After` header its value is used instead (capped at 60 s). Non-transient errors (`BadRequestError`, `AuthenticationError`, etc.) are not retried.
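
To make the schedule concrete, this standalone snippet reproduces the documented formula; it is an illustration (with a 0-based `attempt` counter), not the library's internal implementation:

```python
import random

def backoff_seconds(attempt: int, retry_after: float | None = None) -> float:
    """Delay before the next retry, following the formula described above."""
    if retry_after is not None:
        return min(retry_after, 60.0)  # honour Retry-After, capped at 60 s
    return min(2 ** attempt, 60) + random.uniform(0.0, 0.5)

print([round(backoff_seconds(a), 1) for a in range(4)])  # roughly [1.x, 2.x, 4.x, 8.x]
```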

```python
result = client.generate("Hello")
print(result.metrics.retries)  # 0 on first-attempt success
```

CLI flag: `--max-retries`.

</details>

<details>
<summary>Timeout and per-request overrides</summary>

Set a default timeout for every request at construction time:

```python
client = LMClient(
    model="openai/gpt-4.1-mini",
    timeout=30.0,  # seconds
)
```

Any LiteLLM keyword argument passed to a `generate` / `embed` / `transcribe` call overrides the default for that request:

```python
result = client.generate("Hello", timeout=5.0, max_tokens=64)
```

Use `default_request_kwargs` to set persistent overrides for all requests:

```python
client = LMClient(
    model="openai/gpt-4.1-mini",
    default_request_kwargs={"max_tokens": 256, "temperature": 0.7},
)
```

</details>

<details>
<summary>Benchmarking</summary>

`infermesh bench` measures client-side throughput across a concurrency sweep. It is intentionally a **client** benchmark — it tells you the best `max_parallel_requests` setting for your workload, not the server's maximum capacity.

```bash
infermesh bench generate \
  --model openai/gpt-4.1-mini \
  --api-base https://api.openai.com/v1 \
  --prompt "Write a haiku." \
  --warmup 5 \
  --requests 50 \
  --output-json bench.json
```

Output:

```
c=1 rps=3.14 p50=0.401s p95=0.412s p99=0.420s svc_p95=0.410s q_p95=0.001s err=0/50 elapsed=15.9s
c=2 rps=5.81 p50=0.470s p95=0.487s p99=0.501s svc_p95=0.480s q_p95=0.002s err=0/50 elapsed=8.6s
recommended_max_parallel_requests=8
```

`c` is the concurrency level. `svc_p95` is the P95 of net provider response time (excluding queue wait). `q_p95` is the P95 time a request spent in the client queue. High `q_p95` relative to `svc_p95` means the client is the bottleneck, not the server.

Use `--input-jsonl` to benchmark with a real prompt distribution. An embedding benchmark is available as `infermesh bench embed`.

For server-centric metrics (TTFT, TPOT, ITL, request goodput), use a dedicated server benchmark:
[vLLM](https://docs.vllm.ai/en/latest/api/vllm/benchmarks/serve/) ·
[SGLang](https://docs.sglang.ai/developer_guide/benchmark_and_profiling.html) ·
[AIPerf](https://github.com/ai-dynamo/aiperf)

</details>

## Why Not Just Use LiteLLM?

Use LiteLLM directly if provider abstraction is the only missing piece.

`infermesh` is intentionally narrower:

- LiteLLM is the provider abstraction and request layer.
- `infermesh` adds notebook-safe sync APIs and concurrent batch helpers.
- `infermesh` preserves partial failures instead of turning a long run into one giant exception.
- `infermesh` adds client-side throttling and replica routing for experiment workloads.
- `infermesh` returns typed result objects so request metadata is easier to inspect programmatically.

## When Not To Use It

- You only make a few single requests.
- You already have a batching and throttling layer you trust.
- You want raw provider payloads with as little abstraction as possible.

## More Detail

- [User Guide](docs/guide.md) for the complete researcher workflow, embeddings, transcription, multimodal inputs, rate limiting, routing, async usage, structured output, and benchmarking
- [API Reference](docs/api/client.md) for method signatures and parameter docs