lm-deluge 0.0.56__py3-none-any.whl → 0.0.69__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- lm_deluge/__init__.py +12 -1
- lm_deluge/api_requests/anthropic.py +12 -1
- lm_deluge/api_requests/base.py +87 -5
- lm_deluge/api_requests/bedrock.py +3 -4
- lm_deluge/api_requests/chat_reasoning.py +4 -0
- lm_deluge/api_requests/gemini.py +7 -6
- lm_deluge/api_requests/mistral.py +8 -9
- lm_deluge/api_requests/openai.py +179 -124
- lm_deluge/batches.py +25 -9
- lm_deluge/client.py +280 -67
- lm_deluge/config.py +1 -1
- lm_deluge/file.py +382 -13
- lm_deluge/mock_openai.py +482 -0
- lm_deluge/models/__init__.py +12 -8
- lm_deluge/models/anthropic.py +12 -20
- lm_deluge/models/bedrock.py +0 -14
- lm_deluge/models/cohere.py +0 -16
- lm_deluge/models/google.py +0 -20
- lm_deluge/models/grok.py +48 -4
- lm_deluge/models/groq.py +2 -2
- lm_deluge/models/kimi.py +34 -0
- lm_deluge/models/meta.py +0 -8
- lm_deluge/models/minimax.py +10 -0
- lm_deluge/models/openai.py +28 -34
- lm_deluge/models/openrouter.py +64 -1
- lm_deluge/models/together.py +0 -16
- lm_deluge/prompt.py +138 -29
- lm_deluge/request_context.py +9 -11
- lm_deluge/tool.py +395 -19
- lm_deluge/tracker.py +11 -5
- lm_deluge/warnings.py +46 -0
- {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/METADATA +3 -1
- {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/RECORD +36 -33
- lm_deluge/agent.py +0 -0
- lm_deluge/gemini_limits.py +0 -65
- {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/top_level.txt +0 -0
lm_deluge/client.py
CHANGED
@@ -1,5 +1,15 @@
 import asyncio
-from typing import
+from typing import (
+    Any,
+    AsyncGenerator,
+    Callable,
+    ClassVar,
+    Literal,
+    Self,
+    Sequence,
+    cast,
+    overload,
+)

 import numpy as np
 import yaml
@@ -12,12 +22,17 @@ from lm_deluge.batches import (
     submit_batches_oa,
     wait_for_batch_completion_async,
 )
-from lm_deluge.prompt import
+from lm_deluge.prompt import (
+    CachePattern,
+    Conversation,
+    Prompt,
+    prompts_to_conversations,
+)
 from lm_deluge.tool import MCPServer, Tool

 from .api_requests.base import APIResponse
 from .config import SamplingParams
-from .models import APIModel, registry
+from .models import APIModel, register_model, registry
 from .request_context import RequestContext
 from .tracker import StatusTracker

@@ -29,6 +44,12 @@ class _LLMClient(BaseModel):
     Keeps all validation, serialization, and existing functionality.
     """

+    _REASONING_SUFFIXES: ClassVar[dict[str, Literal["low", "medium", "high"]]] = {
+        "-low": "low",
+        "-medium": "medium",
+        "-high": "high",
+    }
+
     model_names: str | list[str] = ["gpt-4.1-mini"]
     name: str | None = None
     max_requests_per_minute: int = 1_000
@@ -40,13 +61,16 @@ class _LLMClient(BaseModel):
     request_timeout: int = 30
     cache: Any = None
     extra_headers: dict[str, str] | None = None
+    extra_body: dict[str, str] | None = None
+    use_responses_api: bool = False
+    background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
     temperature: float = 0.75
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", None] = None
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
     logprobs: bool = False
     top_logprobs: int | None = None
     force_local_mcp: bool = False
@@ -100,13 +124,112 @@ class _LLMClient(BaseModel):

     # NEW! Builder methods
     def with_model(self, model: str):
-        self.
+        self._update_models([model])
         return self

     def with_models(self, models: list[str]):
-        self.
+        self._update_models(models)
         return self

+    def _update_models(self, models: list[str]) -> None:
+        normalized, per_model_efforts = self._normalize_model_names(models)
+        if self.reasoning_effort is None:
+            unique_efforts = {eff for eff in per_model_efforts if eff is not None}
+            if len(normalized) == 1 and per_model_efforts[0] is not None:
+                self.reasoning_effort = per_model_efforts[0]
+            elif (
+                len(unique_efforts) == 1
+                and len(unique_efforts) != 0
+                and None not in per_model_efforts
+            ):
+                self.reasoning_effort = next(iter(unique_efforts))  # type: ignore
+        self.model_names = normalized
+        self._align_sampling_params(per_model_efforts)
+        self._reset_model_weights()
+
+    def _normalize_model_names(
+        self, models: list[str]
+    ) -> tuple[list[str], list[Literal["low", "medium", "high"] | None]]:
+        normalized: list[str] = []
+        efforts: list[Literal["low", "medium", "high"] | None] = []
+
+        for name in models:
+            base_name = self._preprocess_openrouter_model(name)
+            trimmed_name, effort = self.__class__._strip_reasoning_suffix_if_registered(
+                base_name
+            )
+            normalized.append(trimmed_name)
+            efforts.append(effort)
+
+        return normalized, efforts
+
+    def _align_sampling_params(
+        self, per_model_efforts: list[Literal["low", "medium", "high"] | None]
+    ) -> None:
+        if len(per_model_efforts) < len(self.model_names):
+            per_model_efforts = per_model_efforts + [None] * (
+                len(self.model_names) - len(per_model_efforts)
+            )
+
+        if not self.model_names:
+            self.sampling_params = []
+            return
+
+        if not self.sampling_params:
+            self.sampling_params = []
+
+        if len(self.sampling_params) == 0:
+            for _ in self.model_names:
+                self.sampling_params.append(
+                    SamplingParams(
+                        temperature=self.temperature,
+                        top_p=self.top_p,
+                        json_mode=self.json_mode,
+                        max_new_tokens=self.max_new_tokens,
+                        reasoning_effort=self.reasoning_effort,
+                        logprobs=self.logprobs,
+                        top_logprobs=self.top_logprobs,
+                    )
+                )
+        elif len(self.sampling_params) == 1 and len(self.model_names) > 1:
+            base_param = self.sampling_params[0]
+            self.sampling_params = [
+                base_param.model_copy(deep=True) for _ in self.model_names
+            ]
+        elif len(self.sampling_params) != len(self.model_names):
+            base_param = self.sampling_params[0]
+            self.sampling_params = [
+                base_param.model_copy(deep=True) for _ in self.model_names
+            ]
+
+        if self.reasoning_effort is not None:
+            for sp in self.sampling_params:
+                sp.reasoning_effort = self.reasoning_effort
+        else:
+            for sp, effort in zip(self.sampling_params, per_model_efforts):
+                if effort is not None:
+                    sp.reasoning_effort = effort
+
+    def _reset_model_weights(self) -> None:
+        if not self.model_names:
+            self.model_weights = []
+            return
+
+        if isinstance(self.model_weights, list):
+            if len(self.model_weights) == len(self.model_names) and any(
+                self.model_weights
+            ):
+                total = sum(self.model_weights)
+                if total == 0:
+                    self.model_weights = [
+                        1 / len(self.model_names) for _ in self.model_names
+                    ]
+                else:
+                    self.model_weights = [w / total for w in self.model_weights]
+                return
+        # Fallback to uniform distribution
+        self.model_weights = [1 / len(self.model_names) for _ in self.model_names]
+
     def with_limits(
         self,
         max_requests_per_minute: int | None = None,
@@ -130,11 +253,64 @@ class _LLMClient(BaseModel):
     def models(self):
         return self.model_names  # why? idk

+    @staticmethod
+    def _preprocess_openrouter_model(model_name: str) -> str:
+        """Process openrouter: prefix and register model if needed."""
+        if model_name.startswith("openrouter:"):
+            slug = model_name.split(":", 1)[1]  # Everything after "openrouter:"
+            # Create a unique id by replacing slashes with hyphens
+            model_id = f"openrouter-{slug.replace('/', '-')}"
+
+            # Register the model if not already in registry
+            if model_id not in registry:
+                register_model(
+                    id=model_id,
+                    name=slug,  # The full slug sent to OpenRouter API (e.g., "openrouter/andromeda-alpha")
+                    api_base="https://openrouter.ai/api/v1",
+                    api_key_env_var="OPENROUTER_API_KEY",
+                    api_spec="openai",
+                    supports_json=True,
+                    supports_logprobs=False,
+                    supports_responses=False,
+                    input_cost=0,  # Unknown costs for generic models
+                    cached_input_cost=0,
+                    cache_write_cost=0,
+                    output_cost=0,
+                )
+
+            return model_id
+        return model_name
+
     @model_validator(mode="before")
     @classmethod
     def fix_lists(cls, data) -> "_LLMClient":
-
-
+        # Process model_names - handle both strings and lists
+        model_names = data.get("model_names")
+
+        if isinstance(model_names, str):
+            # Single model as string
+            # First, handle OpenRouter prefix
+            model_name = cls._preprocess_openrouter_model(model_names)
+
+            # Then handle reasoning effort suffix (e.g., "gpt-5-high")
+            model_name, effort = cls._strip_reasoning_suffix_if_registered(model_name)
+            if effort and data.get("reasoning_effort") is None:
+                data["reasoning_effort"] = effort
+
+            data["model_names"] = [model_name]
+
+        elif isinstance(model_names, list):
+            # List of models - process each one
+            processed_models = []
+            for model_name in model_names:
+                # Handle OpenRouter prefix for each model
+                processed_model = cls._preprocess_openrouter_model(model_name)
+                processed_model, _ = cls._strip_reasoning_suffix_if_registered(
+                    processed_model
+                )
+                processed_models.append(processed_model)
+            data["model_names"] = processed_models
+
         if not isinstance(data.get("sampling_params", []), list):
             data["sampling_params"] = [data["sampling_params"]]
         if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
@@ -153,6 +329,18 @@ class _LLMClient(BaseModel):
             data["sampling_params"] = data["sampling_params"] * len(data["model_names"])
         return data

+    @classmethod
+    def _strip_reasoning_suffix_if_registered(
+        cls, model_name: str
+    ) -> tuple[str, Literal["low", "medium", "high"] | None]:
+        """Remove reasoning suffix only when the trimmed model already exists."""
+        for suffix, effort in cls._REASONING_SUFFIXES.items():
+            if model_name.endswith(suffix) and len(model_name) > len(suffix):
+                candidate = model_name[: -len(suffix)]
+                if candidate in registry:
+                    return candidate, effort
+        return model_name, None
+
     @model_validator(mode="after")
     def validate_client(self) -> Self:
         if isinstance(self.model_names, str):
@@ -171,6 +359,11 @@ class _LLMClient(BaseModel):
         # normalize weights
         self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+        # background mode only allowed for responses api
+        if self.background:
+            assert (
+                self.use_responses_api
+            ), "background mode only allowed for responses api"
         # Auto-generate name if not provided
         if self.name is None:
             if len(self.model_names) == 1:
@@ -256,13 +449,6 @@ class _LLMClient(BaseModel):
             # Idle wait before next capacity check. Aim for ~RPM spacing.
             await asyncio.sleep(max(60.0 / self.max_requests_per_minute, 0.01))

-    async def _execute_request(self, context: RequestContext) -> APIResponse:
-        """Create and send a single API request using the provided context."""
-        model_obj = APIModel.from_registry(context.model_name)
-        request = model_obj.make_request(context)
-        response = await request.execute_once()
-        return response
-
     async def process_single_request(
         self, context: RequestContext, retry_queue: asyncio.Queue | None = None
     ) -> APIResponse:
@@ -290,7 +476,9 @@ class _LLMClient(BaseModel):
         # Execute single request
         assert context.status_tracker
         context.status_tracker.update_pbar()
-
+        model_obj = APIModel.from_registry(context.model_name)
+        request = model_obj.make_request(context)
+        response = await request.execute_once()

         # Handle successful response
         if not response.is_error:
@@ -350,44 +538,46 @@ class _LLMClient(BaseModel):
     @overload
     async def process_prompts_async(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[True],
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[str | None]: ...

     @overload
     async def process_prompts_async(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[False] = ...,
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-
-    ) -> list[APIResponse
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
+    ) -> list[APIResponse]: ...

     async def process_prompts_async(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress: bool = True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-
-    ) -> list[APIResponse
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> list[APIResponse] | list[str | None] | dict[str, int]:
         """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.

         This implementation creates all tasks upfront and waits for them to complete,
         avoiding issues with tracker state accumulating across multiple calls.
         """
         # Convert prompts to Conversations
-
+        if not isinstance(prompts, list):
+            prompts = prompts = cast(Sequence[Prompt], [prompts])
+        prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))

         # Ensure tracker exists (start_nowait will call add_to_total for each task)
         if self._tracker is None:
@@ -398,13 +588,14 @@ class _LLMClient(BaseModel):

         # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
         task_ids = []
+        assert isinstance(prompts, Sequence)
         for prompt in prompts:
             assert isinstance(prompt, Conversation)
             task_id = self.start_nowait(
                 prompt,
                 tools=tools,
                 cache=cache,
-
+                service_tier=service_tier,
             )
             task_ids.append(task_id)

@@ -443,13 +634,12 @@ class _LLMClient(BaseModel):

     def process_prompts_sync(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress=True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
     ):
         return asyncio.run(
             self.process_prompts_async(
@@ -458,7 +648,6 @@ class _LLMClient(BaseModel):
                 show_progress=show_progress,
                 tools=tools,
                 cache=cache,
-                use_responses_api=use_responses_api,
             )
         )

@@ -478,18 +667,18 @@ class _LLMClient(BaseModel):

     def start_nowait(
         self,
-        prompt:
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> int:
         tracker = self._get_tracker()
         task_id = self._next_task_id
         self._next_task_id += 1
         model, sampling_params = self._select_model()
-
-
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         context = RequestContext(
             task_id=task_id,
             model_name=model,
@@ -500,7 +689,9 @@ class _LLMClient(BaseModel):
             status_tracker=tracker,
             tools=tools,
             cache=cache,
-            use_responses_api=use_responses_api,
+            use_responses_api=self.use_responses_api,
+            background=self.background,
+            service_tier=service_tier,
             extra_headers=self.extra_headers,
             force_local_mcp=self.force_local_mcp,
         )
@@ -511,33 +702,45 @@ class _LLMClient(BaseModel):

     async def start(
         self,
-        prompt:
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-
-    ) -> APIResponse
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> APIResponse:
         task_id = self.start_nowait(
-            prompt, tools=tools, cache=cache,
+            prompt, tools=tools, cache=cache, service_tier=service_tier
         )
         return await self.wait_for(task_id)

-    async def wait_for(self, task_id: int) -> APIResponse
+    async def wait_for(self, task_id: int) -> APIResponse:
         task = self._tasks.get(task_id)
         if task:
             return await task
-
+        res = self._results.get(task_id)
+        if res:
+            return res
+        else:
+            return APIResponse(
+                id=-1,
+                model_internal="",
+                prompt=Conversation([]),
+                sampling_params=SamplingParams(),
+                status_code=500,
+                is_error=True,
+                error_message="Task not found",
+            )

     async def wait_for_all(
         self, task_ids: Sequence[int] | None = None
-    ) -> list[APIResponse
+    ) -> list[APIResponse]:
         if task_ids is None:
             task_ids = list(self._tasks.keys())
         return [await self.wait_for(tid) for tid in task_ids]

     async def as_completed(
         self, task_ids: Sequence[int] | None = None
-    ) -> AsyncGenerator[tuple[int, APIResponse
+    ) -> AsyncGenerator[tuple[int, APIResponse], None]:
         """Yield ``(task_id, result)`` pairs as tasks complete.

         Args:
@@ -561,7 +764,9 @@ class _LLMClient(BaseModel):
         for task in list(tasks_map.keys()):
             if task.done():
                 tid = tasks_map.pop(task)
-
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

         while tasks_map:
             done, _ = await asyncio.wait(
@@ -569,16 +774,18 @@ class _LLMClient(BaseModel):
             )
             for task in done:
                 tid = tasks_map.pop(task)
-
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

     async def stream(
         self,
-        prompt:
+        prompt: Prompt,
         tools: list[Tool | dict | MCPServer] | None = None,
     ):
         model, sampling_params = self._select_model()
-
-
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         async for item in stream_chat(
             model, prompt, sampling_params, tools, None, self.extra_headers
         ):
@@ -592,7 +799,7 @@ class _LLMClient(BaseModel):

     async def run_agent_loop(
         self,
-        conversation:
+        conversation: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
@@ -605,8 +812,9 @@ class _LLMClient(BaseModel):
        instances or built‑in tool dictionaries.
        """

-        if isinstance(conversation,
-            conversation =
+        if not isinstance(conversation, Conversation):
+            conversation = prompts_to_conversations([conversation])[0]
+        assert isinstance(conversation, Conversation)

         # Expand MCPServer objects to their constituent tools for tool execution
         expanded_tools: list[Tool] = []
@@ -618,23 +826,20 @@ class _LLMClient(BaseModel):
                 mcp_tools = await tool.to_tools()
                 expanded_tools.extend(mcp_tools)

-
+        response: APIResponse | None = None

         for _ in range(max_rounds):
-
-
+            response = await self.start(
+                conversation,
                 tools=tools,  # type: ignore
-                return_completions_only=False,
-                show_progress=show_progress,
             )

-
-            if last_response is None or last_response.content is None:
+            if response is None or response.content is None:
                 break

-            conversation = conversation.with_message(
+            conversation = conversation.with_message(response.content)

-            tool_calls =
+            tool_calls = response.content.tool_calls
             if not tool_calls:
                 break

@@ -657,16 +862,16 @@ class _LLMClient(BaseModel):
             if not isinstance(result, (str, dict, list)):
                 result = str(result)

-            conversation.
+            conversation.with_tool_result(call.id, result)  # type: ignore

-        if
+        if response is None:
             raise RuntimeError("model did not return a response")

-        return conversation,
+        return conversation, response

     def run_agent_loop_sync(
         self,
-        conversation:
+        conversation: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
@@ -685,7 +890,7 @@ class _LLMClient(BaseModel):

     async def submit_batch_job(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         tools: list[Tool] | None = None,
         cache: CachePattern | None = None,
@@ -747,11 +952,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -774,11 +981,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -800,11 +1009,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -838,6 +1049,8 @@ def LLMClient(
         request_timeout=request_timeout,
         cache=cache,
         extra_headers=extra_headers,
+        use_responses_api=use_responses_api,
+        background=background,
         temperature=temperature,
         top_p=top_p,
         json_mode=json_mode,
lm_deluge/config.py
CHANGED
@@ -8,7 +8,7 @@ class SamplingParams(BaseModel):
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", "none", None] = None
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
     logprobs: bool = False
     top_logprobs: int | None = None

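
The config change only widens the reasoning_effort literal to accept "minimal". A small sketch of building SamplingParams with the new value; the field values are illustrative, and the import path follows client.py's `from .config import SamplingParams`.

from lm_deluge.config import SamplingParams

# "minimal" now validates alongside "low", "medium", "high", and "none".
params = SamplingParams(
    temperature=0.2,
    top_p=1.0,
    max_new_tokens=1024,
    reasoning_effort="minimal",
)
print(params.model_dump())  # SamplingParams is a pydantic BaseModel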