python-infrakit-dev 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infrakit/__init__.py +0 -0
- infrakit/cli/__init__.py +1 -0
- infrakit/cli/commands/__init__.py +1 -0
- infrakit/cli/commands/deps.py +530 -0
- infrakit/cli/commands/init.py +129 -0
- infrakit/cli/commands/llm.py +295 -0
- infrakit/cli/commands/logger.py +160 -0
- infrakit/cli/commands/module.py +342 -0
- infrakit/cli/commands/time.py +81 -0
- infrakit/cli/main.py +65 -0
- infrakit/core/__init__.py +0 -0
- infrakit/core/config/__init__.py +0 -0
- infrakit/core/config/converter.py +480 -0
- infrakit/core/config/exporter.py +304 -0
- infrakit/core/config/loader.py +713 -0
- infrakit/core/config/validator.py +389 -0
- infrakit/core/logger/__init__.py +21 -0
- infrakit/core/logger/formatters.py +143 -0
- infrakit/core/logger/handlers.py +322 -0
- infrakit/core/logger/retention.py +176 -0
- infrakit/core/logger/setup.py +314 -0
- infrakit/deps/__init__.py +239 -0
- infrakit/deps/clean.py +141 -0
- infrakit/deps/depfile.py +405 -0
- infrakit/deps/health.py +357 -0
- infrakit/deps/optimizer.py +642 -0
- infrakit/deps/scanner.py +550 -0
- infrakit/llm/__init__.py +35 -0
- infrakit/llm/batch.py +165 -0
- infrakit/llm/client.py +575 -0
- infrakit/llm/key_manager.py +728 -0
- infrakit/llm/llm_readme.md +306 -0
- infrakit/llm/models.py +148 -0
- infrakit/llm/providers/__init__.py +5 -0
- infrakit/llm/providers/base.py +112 -0
- infrakit/llm/providers/gemini.py +164 -0
- infrakit/llm/providers/openai.py +168 -0
- infrakit/llm/rate_limiter.py +54 -0
- infrakit/scaffolder/__init__.py +31 -0
- infrakit/scaffolder/ai.py +508 -0
- infrakit/scaffolder/backend.py +555 -0
- infrakit/scaffolder/cli_tool.py +386 -0
- infrakit/scaffolder/generator.py +338 -0
- infrakit/scaffolder/pipeline.py +562 -0
- infrakit/scaffolder/registry.py +121 -0
- infrakit/time/__init__.py +60 -0
- infrakit/time/profiler.py +511 -0
- python_infrakit_dev-0.1.0.dist-info/METADATA +124 -0
- python_infrakit_dev-0.1.0.dist-info/RECORD +51 -0
- python_infrakit_dev-0.1.0.dist-info/WHEEL +4 -0
- python_infrakit_dev-0.1.0.dist-info/entry_points.txt +3 -0
infrakit/llm/client.py
ADDED
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
"""
|
|
2
|
+
infrakit.llm.client
|
|
3
|
+
--------------------
|
|
4
|
+
LLMClient — the single entry point for all LLM interactions.
|
|
5
|
+
|
|
6
|
+
Quick start::
|
|
7
|
+
|
|
8
|
+
from infrakit.llm import LLMClient, Prompt
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
client = LLMClient(
|
|
12
|
+
keys={"openai_keys": ["sk-..."], "gemini_keys": ["AIza..."]},
|
|
13
|
+
storage_dir="./logs",
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# simple generate
|
|
17
|
+
response = client.generate(Prompt(user="What is 2+2?"), provider="openai")
|
|
18
|
+
print(response.content)
|
|
19
|
+
|
|
20
|
+
# structured output
|
|
21
|
+
class Answer(BaseModel):
|
|
22
|
+
value: int
|
|
23
|
+
explanation: str
|
|
24
|
+
|
|
25
|
+
response = client.generate(
|
|
26
|
+
Prompt(system="Be concise.", user="What is 2+2?"),
|
|
27
|
+
provider="openai",
|
|
28
|
+
response_model=Answer,
|
|
29
|
+
)
|
|
30
|
+
if response.schema_matched:
|
|
31
|
+
print(response.parsed.value)
|
|
32
|
+
|
|
33
|
+
# batch
|
|
34
|
+
prompts = [Prompt(user=f"Translate '{w}' to French") for w in ["cat", "dog", "bird"]]
|
|
35
|
+
batch = client.batch_generate(prompts, provider="gemini")
|
|
36
|
+
for r in batch.results:
|
|
37
|
+
print(r.content)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import asyncio
|
|
43
|
+
import time
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
from typing import Any, Literal, Optional, Type
|
|
46
|
+
|
|
47
|
+
from pydantic import BaseModel
|
|
48
|
+
|
|
49
|
+
from .batch import async_batch, threaded_batch
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _run_async(coro):
|
|
53
|
+
"""
|
|
54
|
+
Run an async coroutine from sync code, safely.
|
|
55
|
+
|
|
56
|
+
- If there is no running event loop (normal script / CLI):
|
|
57
|
+
use asyncio.run() directly.
|
|
58
|
+
- If there IS a running loop (Jupyter, FastAPI, nested asyncio):
|
|
59
|
+
spin up a new loop in a background thread and block until done.
|
|
60
|
+
This avoids the "asyncio.run() cannot be called from a running
|
|
61
|
+
event loop" error without requiring the caller to be async.
|
|
62
|
+
"""
|
|
63
|
+
import concurrent.futures
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
loop = asyncio.get_running_loop()
|
|
67
|
+
except RuntimeError:
|
|
68
|
+
loop = None
|
|
69
|
+
|
|
70
|
+
if loop is None:
|
|
71
|
+
return asyncio.run(coro)
|
|
72
|
+
|
|
73
|
+
# Running inside an existing loop — use a thread with its own loop
|
|
74
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
75
|
+
future = pool.submit(asyncio.run, coro)
|
|
76
|
+
return future.result()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
from .key_manager import KeyManager
|
|
80
|
+
from .models import BatchResult, LLMResponse, Prompt, Provider, QuotaConfig, RequestMeta
|
|
81
|
+
from .providers.base import BaseProvider
|
|
82
|
+
from .providers.gemini import GeminiProvider
|
|
83
|
+
from .providers.openai import OpenAIProvider
|
|
84
|
+
from .rate_limiter import RateLimiter
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ── defaults ───────────────────────────────────────────────────────────────

_DEFAULT_MAX_CONCURRENT = 3  # simultaneous in-flight requests during batch calls
_DEFAULT_KEY_RETRIES = 2  # retry same key N times before rotating
_DEFAULT_SCHEMA_RETRIES = 2  # attempts to parse structured output
_DEFAULT_META_WINDOW = 50  # recent request metadata records per key
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class LLMClient:
    """
    Unified client for OpenAI and Gemini.

    Wires together three collaborators:

    - ``KeyManager`` — key persistence, rotation, and quota tracking.
    - ``RateLimiter`` — RPM/TPM gating built on the key manager's state.
    - One provider adapter per backend (OpenAI, Gemini).

    Parameters
    ----------
    keys            API keys dict::

                        {
                            "openai_keys": ["sk-key1", "sk-key2"],
                            "gemini_keys": ["AIza-key1"],
                        }

    storage_dir     Path to a folder where key state is persisted.
                    Defaults to ``~/.infrakit/llm/`` if not given.
    quota_file      Optional path to a quota configuration file, forwarded
                    to the key manager.
    mode            ``"async"`` — asyncio + semaphore concurrency.
                    ``"threaded"`` — ThreadPoolExecutor concurrency.
                    Default: ``"async"``.
    max_concurrent  Max simultaneous in-flight requests for batch calls.
                    Default: 3.
    key_retries     How many times to retry the *same* key on a transient
                    error before rotating to the next one. Default: 2.
    schema_retries  How many times to retry JSON parsing/validation before
                    giving up and returning schema_matched=False. Default: 2.
    meta_window     How many recent request metadata records to keep per key.
                    Default: 50.
    openai_model    Default OpenAI model. Default: ``"gpt-4o-mini"``.
    gemini_model    Default Gemini model. Default: ``"gemini-1.5-flash"``.
    show_progress   Show tqdm progress bar during batch calls. Default: True.
    """

    def __init__(
        self,
        keys: dict[str, list[str]],
        storage_dir: Optional[str | Path] = None,
        quota_file: Optional[str | Path] = None,
        mode: Literal["async", "threaded"] = "async",
        max_concurrent: int = _DEFAULT_MAX_CONCURRENT,
        key_retries: int = _DEFAULT_KEY_RETRIES,
        schema_retries: int = _DEFAULT_SCHEMA_RETRIES,
        meta_window: int = _DEFAULT_META_WINDOW,
        openai_model: Optional[str] = None,
        gemini_model: Optional[str] = None,
        show_progress: bool = True,
    ) -> None:
        # Plain knobs read later by generate()/batch_generate().
        self._mode = mode
        self._max_concurrent = max_concurrent
        self._key_retries = key_retries
        self._schema_retries = schema_retries
        self._show_progress = show_progress

        # Key manager (persistence, rotation, quota)
        self._km = KeyManager(
            keys=keys,
            storage_dir=storage_dir,
            quota_file=quota_file,
            meta_window=meta_window,
        )

        # Rate limiter (RPM/TPM gating); consults the key manager's state.
        self._rl = RateLimiter(self._km)

        # Provider adapters, keyed by Provider values so that string
        # provider names passed to generate() resolve via _get_provider().
        # NOTE(review): a None model is presumably replaced by each
        # adapter's documented default — confirm inside the providers.
        self._providers: dict[str, BaseProvider] = {
            Provider.OPENAI: OpenAIProvider(model=openai_model),
            Provider.GEMINI: GeminiProvider(model=gemini_model),
        }
|
|
162
|
+
|
|
163
|
+
# ── public: single generate ────────────────────────────────────────────
|
|
164
|
+
|
|
165
|
+
def generate(
|
|
166
|
+
self,
|
|
167
|
+
prompt: Prompt,
|
|
168
|
+
provider: str,
|
|
169
|
+
response_model: Optional[Type[BaseModel]] = None,
|
|
170
|
+
**kwargs: Any,
|
|
171
|
+
) -> LLMResponse:
|
|
172
|
+
"""
|
|
173
|
+
Generate a response for a single prompt (blocking).
|
|
174
|
+
|
|
175
|
+
Always uses the sync code path so it is safe to call from any context:
|
|
176
|
+
scripts, threads, Jupyter, FastAPI handlers, Windows, etc.
|
|
177
|
+
|
|
178
|
+
If you are inside an async function use ``await async_generate()``
|
|
179
|
+
instead; that path uses the async SDK clients end-to-end.
|
|
180
|
+
|
|
181
|
+
Handles key rotation, RPM waiting, retries, and metadata recording.
|
|
182
|
+
Always returns an LLMResponse — check ``.error`` for failures.
|
|
183
|
+
"""
|
|
184
|
+
return self._sync_single_generate(
|
|
185
|
+
prompt=prompt,
|
|
186
|
+
response_model=response_model,
|
|
187
|
+
provider=provider,
|
|
188
|
+
**kwargs,
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
async def async_generate(
|
|
192
|
+
self,
|
|
193
|
+
prompt: Prompt,
|
|
194
|
+
provider: str,
|
|
195
|
+
response_model: Optional[Type[BaseModel]] = None,
|
|
196
|
+
**kwargs: Any,
|
|
197
|
+
) -> LLMResponse:
|
|
198
|
+
"""Async version of generate() — await this inside an async context."""
|
|
199
|
+
return await self._async_single_generate(
|
|
200
|
+
prompt=prompt,
|
|
201
|
+
response_model=response_model,
|
|
202
|
+
provider=provider,
|
|
203
|
+
**kwargs,
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# ── public: batch generate ─────────────────────────────────────────────
|
|
207
|
+
|
|
208
|
+
def batch_generate(
|
|
209
|
+
self,
|
|
210
|
+
prompts: list[Prompt],
|
|
211
|
+
provider: str,
|
|
212
|
+
response_model: Optional[Type[BaseModel]] = None,
|
|
213
|
+
max_concurrent: Optional[int] = None,
|
|
214
|
+
show_progress: Optional[bool] = None,
|
|
215
|
+
**kwargs: Any,
|
|
216
|
+
) -> BatchResult:
|
|
217
|
+
"""
|
|
218
|
+
Generate responses for a list of prompts.
|
|
219
|
+
|
|
220
|
+
Results are in the same order as *prompts*.
|
|
221
|
+
Uses async or threaded mode based on client ``mode`` setting.
|
|
222
|
+
|
|
223
|
+
Parameters
|
|
224
|
+
----------
|
|
225
|
+
prompts List of Prompt objects.
|
|
226
|
+
provider ``"openai"`` or ``"gemini"``.
|
|
227
|
+
response_model Optional Pydantic model for structured output.
|
|
228
|
+
max_concurrent Override per-call concurrency limit.
|
|
229
|
+
show_progress Override per-call progress bar setting.
|
|
230
|
+
"""
|
|
231
|
+
concurrency = max_concurrent or self._max_concurrent
|
|
232
|
+
progress = show_progress if show_progress is not None else self._show_progress
|
|
233
|
+
|
|
234
|
+
if self._mode == "async":
|
|
235
|
+
return _run_async(
|
|
236
|
+
async_batch(
|
|
237
|
+
generate_fn=self._async_single_generate,
|
|
238
|
+
prompts=prompts,
|
|
239
|
+
response_model=response_model,
|
|
240
|
+
schema_retries=self._schema_retries,
|
|
241
|
+
provider=provider,
|
|
242
|
+
max_concurrent=concurrency,
|
|
243
|
+
show_progress=progress,
|
|
244
|
+
extra_kwargs=kwargs,
|
|
245
|
+
)
|
|
246
|
+
)
|
|
247
|
+
else:
|
|
248
|
+
return threaded_batch(
|
|
249
|
+
generate_fn=self._sync_single_generate,
|
|
250
|
+
prompts=prompts,
|
|
251
|
+
response_model=response_model,
|
|
252
|
+
schema_retries=self._schema_retries,
|
|
253
|
+
provider=provider,
|
|
254
|
+
max_concurrent=concurrency,
|
|
255
|
+
show_progress=progress,
|
|
256
|
+
extra_kwargs=kwargs,
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
async def async_batch_generate(
|
|
260
|
+
self,
|
|
261
|
+
prompts: list[Prompt],
|
|
262
|
+
provider: str,
|
|
263
|
+
response_model: Optional[Type[BaseModel]] = None,
|
|
264
|
+
max_concurrent: Optional[int] = None,
|
|
265
|
+
show_progress: Optional[bool] = None,
|
|
266
|
+
**kwargs: Any,
|
|
267
|
+
) -> BatchResult:
|
|
268
|
+
"""Async version of batch_generate() for use inside async contexts."""
|
|
269
|
+
concurrency = max_concurrent or self._max_concurrent
|
|
270
|
+
progress = show_progress if show_progress is not None else self._show_progress
|
|
271
|
+
|
|
272
|
+
return await async_batch(
|
|
273
|
+
generate_fn=self._async_single_generate,
|
|
274
|
+
prompts=prompts,
|
|
275
|
+
response_model=response_model,
|
|
276
|
+
schema_retries=self._schema_retries,
|
|
277
|
+
provider=provider,
|
|
278
|
+
max_concurrent=concurrency,
|
|
279
|
+
show_progress=progress,
|
|
280
|
+
extra_kwargs=kwargs,
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
# ── public: quota management ───────────────────────────────────────────
|
|
284
|
+
|
|
285
|
+
def set_quota(self, provider: str, key_id: str, quota: QuotaConfig) -> None:
|
|
286
|
+
"""
|
|
287
|
+
Set or update quota limits for a specific key.
|
|
288
|
+
|
|
289
|
+
``quota.model`` controls scope:
|
|
290
|
+
- ``None`` (default) — applies to all models on this key that
|
|
291
|
+
don't have their own explicit entry.
|
|
292
|
+
- A model string — applies only to that model.
|
|
293
|
+
|
|
294
|
+
Examples::
|
|
295
|
+
|
|
296
|
+
# key-level RPM + default daily limit for all models
|
|
297
|
+
client.set_quota(
|
|
298
|
+
provider="gemini",
|
|
299
|
+
key_id="AIza-abc1",
|
|
300
|
+
quota=QuotaConfig(rpm_limit=15, daily_token_limit=1_500_000),
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
# tighter limit for one expensive model only
|
|
304
|
+
client.set_quota(
|
|
305
|
+
provider="gemini",
|
|
306
|
+
key_id="AIza-abc1",
|
|
307
|
+
quota=QuotaConfig(model="gemini-2.5-pro", daily_token_limit=250_000),
|
|
308
|
+
)
|
|
309
|
+
"""
|
|
310
|
+
self._km.set_quota(provider, key_id, quota)
|
|
311
|
+
|
|
312
|
+
def status(
|
|
313
|
+
self,
|
|
314
|
+
provider: Optional[str] = None,
|
|
315
|
+
key_id: Optional[str] = None,
|
|
316
|
+
) -> list[dict]:
|
|
317
|
+
"""
|
|
318
|
+
Return key status dicts.
|
|
319
|
+
|
|
320
|
+
Parameters
|
|
321
|
+
----------
|
|
322
|
+
provider Filter to a specific provider (``"openai"`` / ``"gemini"``).
|
|
323
|
+
None returns all providers.
|
|
324
|
+
key_id Filter to a specific key (first 8 chars of the key).
|
|
325
|
+
None returns all keys for the provider.
|
|
326
|
+
"""
|
|
327
|
+
return self._km.status_report(provider=provider, key_id=key_id)
|
|
328
|
+
|
|
329
|
+
def print_status(
|
|
330
|
+
self,
|
|
331
|
+
provider: Optional[str] = None,
|
|
332
|
+
key_id: Optional[str] = None,
|
|
333
|
+
) -> None:
|
|
334
|
+
"""Pretty-print key status to stdout (model-aware)."""
|
|
335
|
+
import datetime
|
|
336
|
+
rows = self._km.status_report(provider=provider, key_id=key_id)
|
|
337
|
+
if not rows:
|
|
338
|
+
print("No keys found.")
|
|
339
|
+
return
|
|
340
|
+
|
|
341
|
+
sep = "-" * 72
|
|
342
|
+
for r in rows:
|
|
343
|
+
print(sep)
|
|
344
|
+
print(f" Provider : {r['provider']}")
|
|
345
|
+
print(f" Key ID : {r['key_id']}...")
|
|
346
|
+
print(f" Status : {r['status']}")
|
|
347
|
+
print(f" RPM limit: {r['rpm_limit'] or 'not set'} | "
|
|
348
|
+
f"Current RPM: {r['current_rpm']}")
|
|
349
|
+
print()
|
|
350
|
+
|
|
351
|
+
models = r.get("models", [])
|
|
352
|
+
if models:
|
|
353
|
+
print(f" Models ({len(models)} tracked)")
|
|
354
|
+
for mr in models:
|
|
355
|
+
status_flag = "\u2713" if mr["status"] == "active" else "\u2717"
|
|
356
|
+
print(f" [{status_flag}] {mr['model']}")
|
|
357
|
+
if mr["deactivated_at"]:
|
|
358
|
+
dt = datetime.datetime.utcfromtimestamp(mr["deactivated_at"])
|
|
359
|
+
print(f" Deactivated : {dt.strftime('%Y-%m-%d %H:%M:%S')} UTC")
|
|
360
|
+
print(f" TPM limit : {mr['tpm_limit'] or 'not set'} "
|
|
361
|
+
f"Current TPM: {mr['current_tpm']}")
|
|
362
|
+
daily_rem = mr["daily_remaining"]
|
|
363
|
+
print(f" Daily limit : {mr['daily_token_limit'] or 'not set'} "
|
|
364
|
+
f"Used: {mr['day_token_total']} "
|
|
365
|
+
f"Remaining: {daily_rem if daily_rem is not None else 'unlimited'}")
|
|
366
|
+
print(f" Reset hour : {mr['reset_hour_utc']:02d}:00 UTC")
|
|
367
|
+
print(f" Totals : {mr['total_requests']} req "
|
|
368
|
+
f"{mr['total_tokens']} tok {mr['total_errors']} err")
|
|
369
|
+
else:
|
|
370
|
+
print(" No model usage recorded yet.")
|
|
371
|
+
|
|
372
|
+
if r["recent_meta"]:
|
|
373
|
+
print()
|
|
374
|
+
print(f" Last {len(r['recent_meta'])} requests")
|
|
375
|
+
for m in r["recent_meta"]:
|
|
376
|
+
ts = datetime.datetime.utcfromtimestamp(m["timestamp"])
|
|
377
|
+
status_str = "ok" if m["success"] else f"ERR: {m.get('error', '')[:60]}"
|
|
378
|
+
print(
|
|
379
|
+
f" {ts.strftime('%H:%M:%S')} UTC | "
|
|
380
|
+
f"{m['model']:<28} | "
|
|
381
|
+
f"in={m['input_tokens']} out={m['output_tokens']} "
|
|
382
|
+
f"total={m['total_tokens']} | "
|
|
383
|
+
f"{m['latency_ms']:.0f}ms | {status_str}"
|
|
384
|
+
)
|
|
385
|
+
print(sep)
|
|
386
|
+
|
|
387
|
+
# ── internal: async single generate ───────────────────────────────────
|
|
388
|
+
|
|
389
|
+
    async def _async_single_generate(
        self,
        prompt: Prompt,
        provider: str,
        response_model: Optional[Type[BaseModel]] = None,
        schema_retries: Optional[int] = None,
        **kwargs: Any,
    ) -> LLMResponse:
        """
        Core async request loop: key acquisition, rate limiting, retries.

        The outer ``while`` rotates over (key, model) pairs handed out by the
        key manager; the inner ``for`` retries the *same* key on transient
        errors, up to ``self._key_retries`` extra attempts. Every attempt —
        success or failure — is recorded as a RequestMeta on the key.
        Quota errors deactivate only the current model on the current key,
        then rotate. Never raises for generation failures: always returns an
        LLMResponse, with ``.error`` set when all keys are exhausted.
        """
        schema_retries = schema_retries if schema_retries is not None else self._schema_retries
        prov_impl = self._get_provider(provider)

        last_error: Optional[str] = None
        keys_tried: set[tuple] = set()

        while True:
            # acquire a key that has this model active
            try:
                raw_key, ks = self._km.get_key(provider, model=prov_impl.model)
            except RuntimeError as exc:
                # key manager has nothing usable left — surface as an error response
                return self._error_response(provider, str(exc))

            # avoid re-trying the same (key, model) combination
            if (ks.key_hash, prov_impl.model) in keys_tried:
                break

            # wait for RPM slot (key-level) and TPM slot (model-level)
            await self._rl.async_wait_for_slot(ks, prov_impl.model)

            # attempt with retries on same key
            for attempt in range(self._key_retries + 1):
                t0 = time.perf_counter()
                try:
                    response = await prov_impl.async_generate(
                        prompt=prompt,
                        api_key=raw_key,
                        response_model=response_model,
                        schema_retries=schema_retries,
                        **kwargs,
                    )
                    # success — record metadata
                    meta = RequestMeta(
                        provider=provider,
                        key_id=ks.key_id,
                        model=prov_impl.model,
                        input_tokens=response.input_tokens,
                        output_tokens=response.output_tokens,
                        total_tokens=response.total_tokens,
                        latency_ms=response.latency_ms,
                        success=True,
                    )
                    self._km.record_request(ks, meta)
                    return response

                except Exception as exc:
                    latency_ms = (time.perf_counter() - t0) * 1000
                    last_error = str(exc)
                    is_quota = prov_impl._is_quota_error(exc)

                    # record failed request (error text truncated to 200 chars)
                    meta = RequestMeta(
                        provider=provider,
                        key_id=ks.key_id,
                        model=prov_impl.model,
                        latency_ms=latency_ms,
                        success=False,
                        error=last_error[:200],
                    )
                    self._km.record_request(ks, meta)

                    if is_quota:
                        # deactivate only this model on this key, not the whole key
                        self._km.deactivate_model(
                            ks, model=prov_impl.model, reason=last_error[:100]
                        )
                        break  # rotate to next key/model
                    if attempt < self._key_retries:
                        # small backoff before same-key retry
                        await asyncio.sleep(1.0 * (attempt + 1))
                    # else: fall through and rotate key

            keys_tried.add((ks.key_hash, prov_impl.model))

        return self._error_response(provider, last_error or "All keys exhausted.")
|
|
472
|
+
|
|
473
|
+
# ── internal: sync single generate ────────────────────────────────────
|
|
474
|
+
|
|
475
|
+
    def _sync_single_generate(
        self,
        prompt: Prompt,
        provider: str,
        response_model: Optional[Type[BaseModel]] = None,
        schema_retries: Optional[int] = None,
        **kwargs: Any,
    ) -> LLMResponse:
        """
        Synchronous twin of ``_async_single_generate``: the same key
        rotation, rate limiting, retry, and metadata-recording flow, built
        on blocking provider calls and ``time.sleep()`` backoff.
        Never raises for generation failures: always returns an
        LLMResponse, with ``.error`` set when all keys are exhausted.
        """
        schema_retries = schema_retries if schema_retries is not None else self._schema_retries
        prov_impl = self._get_provider(provider)

        last_error: Optional[str] = None
        keys_tried: set[tuple] = set()

        while True:
            # acquire a key that still has this model active
            try:
                raw_key, ks = self._km.get_key(provider, model=prov_impl.model)
            except RuntimeError as exc:
                # key manager has nothing usable left — surface as an error response
                return self._error_response(provider, str(exc))

            # stop once rotation hands back a (key, model) pair already seen
            if (ks.key_hash, prov_impl.model) in keys_tried:
                break

            # block until an RPM (key-level) / TPM (model-level) slot frees up
            self._rl.sync_wait_for_slot(ks, prov_impl.model)

            # retry the same key a bounded number of times
            for attempt in range(self._key_retries + 1):
                t0 = time.perf_counter()
                try:
                    response = prov_impl.sync_generate(
                        prompt=prompt,
                        api_key=raw_key,
                        response_model=response_model,
                        schema_retries=schema_retries,
                        **kwargs,
                    )
                    # success — record per-key usage metadata
                    meta = RequestMeta(
                        provider=provider,
                        key_id=ks.key_id,
                        model=prov_impl.model,
                        input_tokens=response.input_tokens,
                        output_tokens=response.output_tokens,
                        total_tokens=response.total_tokens,
                        latency_ms=response.latency_ms,
                        success=True,
                    )
                    self._km.record_request(ks, meta)
                    return response

                except Exception as exc:
                    latency_ms = (time.perf_counter() - t0) * 1000
                    last_error = str(exc)
                    is_quota = prov_impl._is_quota_error(exc)

                    # record failed request (error text truncated to 200 chars)
                    meta = RequestMeta(
                        provider=provider,
                        key_id=ks.key_id,
                        model=prov_impl.model,
                        latency_ms=latency_ms,
                        success=False,
                        error=last_error[:200],
                    )
                    self._km.record_request(ks, meta)

                    if is_quota:
                        # quota exhausted: disable only this model on this key
                        self._km.deactivate_model(
                            ks, model=prov_impl.model, reason=last_error[:100]
                        )
                        break
                    if attempt < self._key_retries:
                        # linear backoff before retrying the same key
                        time.sleep(1.0 * (attempt + 1))

            keys_tried.add((ks.key_hash, prov_impl.model))

        return self._error_response(provider, last_error or "All keys exhausted.")
|
|
549
|
+
|
|
550
|
+
# ── internal helpers ───────────────────────────────────────────────────
|
|
551
|
+
|
|
552
|
+
def _get_provider(self, provider: str) -> BaseProvider:
|
|
553
|
+
impl = self._providers.get(provider)
|
|
554
|
+
if impl is None:
|
|
555
|
+
raise ValueError(
|
|
556
|
+
f"Unknown provider '{provider}'. "
|
|
557
|
+
f"Valid options: {list(self._providers.keys())}"
|
|
558
|
+
)
|
|
559
|
+
return impl
|
|
560
|
+
|
|
561
|
+
@staticmethod
|
|
562
|
+
def _error_response(provider: str, error: str) -> LLMResponse:
|
|
563
|
+
return LLMResponse(
|
|
564
|
+
content="",
|
|
565
|
+
parsed=None,
|
|
566
|
+
schema_matched=False,
|
|
567
|
+
provider=provider,
|
|
568
|
+
model="",
|
|
569
|
+
key_id="",
|
|
570
|
+
input_tokens=0,
|
|
571
|
+
output_tokens=0,
|
|
572
|
+
total_tokens=0,
|
|
573
|
+
latency_ms=0.0,
|
|
574
|
+
error=error,
|
|
575
|
+
)
|