lm-deluge 0.0.67__py3-none-any.whl → 0.0.90__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.
- lm_deluge/__init__.py +1 -2
- lm_deluge/api_requests/anthropic.py +117 -22
- lm_deluge/api_requests/base.py +84 -11
- lm_deluge/api_requests/bedrock.py +30 -6
- lm_deluge/api_requests/chat_reasoning.py +4 -0
- lm_deluge/api_requests/gemini.py +166 -20
- lm_deluge/api_requests/openai.py +145 -25
- lm_deluge/batches.py +15 -45
- lm_deluge/client.py +309 -50
- lm_deluge/config.py +15 -3
- lm_deluge/models/__init__.py +14 -1
- lm_deluge/models/anthropic.py +29 -14
- lm_deluge/models/arcee.py +16 -0
- lm_deluge/models/deepseek.py +36 -4
- lm_deluge/models/google.py +42 -0
- lm_deluge/models/grok.py +24 -0
- lm_deluge/models/kimi.py +36 -0
- lm_deluge/models/minimax.py +18 -0
- lm_deluge/models/openai.py +100 -0
- lm_deluge/models/openrouter.py +133 -7
- lm_deluge/models/together.py +11 -0
- lm_deluge/models/zai.py +50 -0
- lm_deluge/pipelines/gepa/__init__.py +95 -0
- lm_deluge/pipelines/gepa/core.py +354 -0
- lm_deluge/pipelines/gepa/docs/samples.py +705 -0
- lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
- lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
- lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
- lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
- lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
- lm_deluge/pipelines/gepa/optimizer.py +435 -0
- lm_deluge/pipelines/gepa/proposer.py +235 -0
- lm_deluge/pipelines/gepa/util.py +165 -0
- lm_deluge/{llm_tools → pipelines}/score.py +2 -2
- lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
- lm_deluge/prompt.py +537 -88
- lm_deluge/request_context.py +7 -2
- lm_deluge/server/__init__.py +24 -0
- lm_deluge/server/__main__.py +144 -0
- lm_deluge/server/adapters.py +369 -0
- lm_deluge/server/app.py +388 -0
- lm_deluge/server/auth.py +71 -0
- lm_deluge/server/model_policy.py +215 -0
- lm_deluge/server/models_anthropic.py +172 -0
- lm_deluge/server/models_openai.py +175 -0
- lm_deluge/tool/__init__.py +1130 -0
- lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
- lm_deluge/tool/builtin/anthropic/bash.py +0 -0
- lm_deluge/tool/builtin/anthropic/computer_use.py +0 -0
- lm_deluge/tool/builtin/gemini.py +59 -0
- lm_deluge/tool/builtin/openai.py +74 -0
- lm_deluge/tool/cua/__init__.py +173 -0
- lm_deluge/tool/cua/actions.py +148 -0
- lm_deluge/tool/cua/base.py +27 -0
- lm_deluge/tool/cua/batch.py +215 -0
- lm_deluge/tool/cua/converters.py +466 -0
- lm_deluge/tool/cua/kernel.py +702 -0
- lm_deluge/tool/cua/trycua.py +989 -0
- lm_deluge/tool/prefab/__init__.py +45 -0
- lm_deluge/tool/prefab/batch_tool.py +156 -0
- lm_deluge/tool/prefab/docs.py +1119 -0
- lm_deluge/tool/prefab/email.py +294 -0
- lm_deluge/tool/prefab/filesystem.py +1711 -0
- lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
- lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
- lm_deluge/tool/prefab/memory.py +458 -0
- lm_deluge/tool/prefab/otc/__init__.py +165 -0
- lm_deluge/tool/prefab/otc/executor.py +281 -0
- lm_deluge/tool/prefab/otc/parse.py +188 -0
- lm_deluge/tool/prefab/random.py +212 -0
- lm_deluge/tool/prefab/rlm/__init__.py +296 -0
- lm_deluge/tool/prefab/rlm/executor.py +349 -0
- lm_deluge/tool/prefab/rlm/parse.py +144 -0
- lm_deluge/tool/prefab/sandbox/__init__.py +19 -0
- lm_deluge/tool/prefab/sandbox/daytona_sandbox.py +483 -0
- lm_deluge/tool/prefab/sandbox/docker_sandbox.py +609 -0
- lm_deluge/tool/prefab/sandbox/fargate_sandbox.py +546 -0
- lm_deluge/tool/prefab/sandbox/modal_sandbox.py +469 -0
- lm_deluge/tool/prefab/sandbox/seatbelt_sandbox.py +827 -0
- lm_deluge/tool/prefab/sheets.py +385 -0
- lm_deluge/tool/prefab/skills.py +0 -0
- lm_deluge/tool/prefab/subagents.py +233 -0
- lm_deluge/tool/prefab/todos.py +342 -0
- lm_deluge/tool/prefab/tool_search.py +169 -0
- lm_deluge/tool/prefab/web_search.py +199 -0
- lm_deluge/tracker.py +16 -13
- lm_deluge/util/schema.py +412 -0
- lm_deluge/warnings.py +8 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/METADATA +23 -9
- lm_deluge-0.0.90.dist-info/RECORD +132 -0
- lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
- lm_deluge/built_in_tools/openai.py +0 -28
- lm_deluge/presets/cerebras.py +0 -17
- lm_deluge/presets/meta.py +0 -13
- lm_deluge/tool.py +0 -849
- lm_deluge-0.0.67.dist-info/RECORD +0 -72
- lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
- /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
- /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
- /lm_deluge/{built_in_tools/anthropic/bash.py → skills/anthropic.py} +0 -0
- /lm_deluge/{built_in_tools/anthropic/computer_use.py → skills/compat.py} +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/top_level.txt +0 -0
lm_deluge/client.py
CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+from dataclasses import dataclass
 from typing import (
     Any,
     AsyncGenerator,
@@ -37,6 +38,14 @@ from .request_context import RequestContext
 from .tracker import StatusTracker


+@dataclass
+class AgentLoopResponse:
+    """Wrapper for agent loop results to distinguish from single request results."""
+
+    conversation: Conversation
+    final_response: APIResponse
+
+
 # TODO: add optional max_input_tokens to client so we can reject long prompts to prevent abuse
 class _LLMClient(BaseModel):
     """
@@ -44,10 +53,15 @@ class _LLMClient(BaseModel):
     Keeps all validation, serialization, and existing functionality.
     """

-    _REASONING_SUFFIXES: ClassVar[
+    _REASONING_SUFFIXES: ClassVar[
+        dict[str, Literal["low", "medium", "high", "xhigh", "minimal", "none"]]
+    ] = {
         "-low": "low",
         "-medium": "medium",
         "-high": "high",
+        "-xhigh": "xhigh",
+        "-minimal": "minimal",
+        "-none": "none",
     }

     model_names: str | list[str] = ["gpt-4.1-mini"]
@@ -66,11 +80,15 @@ class _LLMClient(BaseModel):
     background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
-    temperature: float = 0
+    temperature: float = 1.0
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal[
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None
+    global_effort: Literal["low", "medium", "high"] | None = None
+    thinking_budget: int | None = None
     logprobs: bool = False
     top_logprobs: int | None = None
     force_local_mcp: bool = False
@@ -84,10 +102,17 @@ class _LLMClient(BaseModel):
     # Internal state for async task handling
     _next_task_id: int = PrivateAttr(default=0)
     _tasks: dict[int, asyncio.Task] = PrivateAttr(default_factory=dict)
-    _results: dict[int, APIResponse] = PrivateAttr(
+    _results: dict[int, APIResponse | AgentLoopResponse] = PrivateAttr(
+        default_factory=dict
+    )
     _tracker: StatusTracker | None = PrivateAttr(default=None)
     _capacity_lock: asyncio.Lock = PrivateAttr(default_factory=asyncio.Lock)

+    # usage
+    def print_usage(self):
+        if self._tracker:
+            self._tracker.log_usage()
+
     # Progress management for queueing API
     def open(self, total: int | None = None, show_progress: bool = True):
         self._tracker = StatusTracker(
@@ -149,9 +174,14 @@ class _LLMClient(BaseModel):

     def _normalize_model_names(
         self, models: list[str]
-    ) -> tuple[
+    ) -> tuple[
+        list[str],
+        list[Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None],
+    ]:
         normalized: list[str] = []
-        efforts: list[
+        efforts: list[
+            Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None
+        ] = []

         for name in models:
             base_name = self._preprocess_openrouter_model(name)
@@ -164,7 +194,10 @@ class _LLMClient(BaseModel):
         return normalized, efforts

     def _align_sampling_params(
-        self,
+        self,
+        per_model_efforts: list[
+            Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None
+        ],
     ) -> None:
         if len(per_model_efforts) < len(self.model_names):
             per_model_efforts = per_model_efforts + [None] * (
@@ -187,6 +220,8 @@ class _LLMClient(BaseModel):
                 json_mode=self.json_mode,
                 max_new_tokens=self.max_new_tokens,
                 reasoning_effort=self.reasoning_effort,
+                global_effort=self.global_effort or "high",
+                thinking_budget=self.thinking_budget,
                 logprobs=self.logprobs,
                 top_logprobs=self.top_logprobs,
             )
@@ -242,6 +277,7 @@ class _LLMClient(BaseModel):
             self.max_tokens_per_minute = max_tokens_per_minute
         if max_concurrent_requests:
             self.max_concurrent_requests = max_concurrent_requests
+        return self

     def _get_tracker(self) -> StatusTracker:
         if self._tracker is None:
@@ -253,6 +289,28 @@ class _LLMClient(BaseModel):
     def models(self):
         return self.model_names  # why? idk

+    @staticmethod
+    def _preprocess_tinker_model(model_name: str) -> str:
+        if model_name.startswith("tinker://"):
+            model_id = model_name
+            if model_id not in registry:
+                register_model(
+                    id=model_name,
+                    name=model_name,
+                    api_base="https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1",
+                    api_key_env_var="TINKER_API_KEY",
+                    api_spec="openai",
+                    supports_json=True,
+                    supports_logprobs=False,
+                    supports_responses=False,
+                    input_cost=0,  # Unknown costs for arbitrary tinker models
+                    cached_input_cost=0,
+                    cache_write_cost=0,
+                    output_cost=0,
+                )
+
+        return model_name
+
     @staticmethod
     def _preprocess_openrouter_model(model_name: str) -> str:
         """Process openrouter: prefix and register model if needed."""
@@ -279,7 +337,8 @@ class _LLMClient(BaseModel):
             )

             return model_id
-
+        else:
+            return model_name

     @model_validator(mode="before")
     @classmethod
@@ -292,6 +351,9 @@ class _LLMClient(BaseModel):
             # First, handle OpenRouter prefix
             model_name = cls._preprocess_openrouter_model(model_names)

+            # next handle tinker prefix
+            model_name = cls._preprocess_tinker_model(model_name)
+
             # Then handle reasoning effort suffix (e.g., "gpt-5-high")
             model_name, effort = cls._strip_reasoning_suffix_if_registered(model_name)
             if effort and data.get("reasoning_effort") is None:
@@ -316,11 +378,13 @@ class _LLMClient(BaseModel):
         if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
             data["sampling_params"] = [
                 SamplingParams(
-                    temperature=data.get("temperature", 0
+                    temperature=data.get("temperature", 1.0),
                     top_p=data.get("top_p", 1.0),
                    json_mode=data.get("json_mode", False),
                     max_new_tokens=data.get("max_new_tokens", 512),
                     reasoning_effort=data.get("reasoning_effort", None),
+                    global_effort=data.get("global_effort") or "high",
+                    thinking_budget=data.get("thinking_budget", None),
                     logprobs=data.get("logprobs", False),
                     top_logprobs=data.get("top_logprobs", None),
                 )
@@ -332,7 +396,9 @@ class _LLMClient(BaseModel):
     @classmethod
     def _strip_reasoning_suffix_if_registered(
         cls, model_name: str
-    ) -> tuple[
+    ) -> tuple[
+        str, Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None
+    ]:
         """Remove reasoning suffix only when the trimmed model already exists."""
         for suffix, effort in cls._REASONING_SUFFIXES.items():
             if model_name.endswith(suffix) and len(model_name) > len(suffix):
@@ -364,6 +430,15 @@ class _LLMClient(BaseModel):
             assert (
                 self.use_responses_api
             ), "background mode only allowed for responses api"
+
+        # codex models require responses api
+        for model_name in self.model_names:
+            if "codex" in model_name.lower() and not self.use_responses_api:
+                raise ValueError(
+                    f"Model '{model_name}' requires use_responses_api=True. "
+                    "Codex models are only available via the Responses API."
+                )
+
         # Auto-generate name if not provided
         if self.name is None:
             if len(self.model_names) == 1:
@@ -542,7 +617,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: Literal[True],
         show_progress: bool = ...,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = ...,
+        output_schema: type[BaseModel] | dict | None = ...,
         cache: CachePattern | None = ...,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[str | None]: ...
@@ -554,7 +630,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: Literal[False] = ...,
         show_progress: bool = ...,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = ...,
+        output_schema: type[BaseModel] | dict | None = ...,
         cache: CachePattern | None = ...,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[APIResponse]: ...
@@ -565,7 +642,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: bool = False,
         show_progress: bool = True,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> list[APIResponse] | list[str | None] | dict[str, int]:
@@ -594,6 +672,7 @@ class _LLMClient(BaseModel):
             task_id = self.start_nowait(
                 prompt,
                 tools=tools,
+                output_schema=output_schema,
                 cache=cache,
                 service_tier=service_tier,
             )
@@ -638,7 +717,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: bool = False,
         show_progress=True,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
     ):
         return asyncio.run(
@@ -647,6 +727,7 @@ class _LLMClient(BaseModel):
                 return_completions_only=return_completions_only,
                 show_progress=show_progress,
                 tools=tools,
+                output_schema=output_schema,
                 cache=cache,
             )
         )
@@ -669,7 +750,8 @@ class _LLMClient(BaseModel):
         self,
         prompt: Prompt,
         *,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> int:
@@ -688,6 +770,7 @@ class _LLMClient(BaseModel):
             request_timeout=self.request_timeout,
             status_tracker=tracker,
             tools=tools,
+            output_schema=output_schema,
             cache=cache,
             use_responses_api=self.use_responses_api,
             background=self.background,
@@ -702,25 +785,30 @@ class _LLMClient(BaseModel):

     async def start(
         self,
-        prompt:
+        prompt: Prompt,
         *,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> APIResponse:
         task_id = self.start_nowait(
-            prompt,
+            prompt,
+            tools=tools,
+            output_schema=output_schema,
+            cache=cache,
+            service_tier=service_tier,
         )
         return await self.wait_for(task_id)

     async def wait_for(self, task_id: int) -> APIResponse:
         task = self._tasks.get(task_id)
         if task:
-
-            res = self._results.get(task_id)
-            if res:
-                return res
+            result = await task
         else:
+            result = self._results.get(task_id)
+
+        if result is None:
             return APIResponse(
                 id=-1,
                 model_internal="",
@@ -731,6 +819,11 @@ class _LLMClient(BaseModel):
                 error_message="Task not found",
             )

+        assert isinstance(
+            result, APIResponse
+        ), f"Expected APIResponse, got {type(result)}. Use wait_for_agent_loop for agent loop tasks."
+        return result
+
     async def wait_for_all(
         self, task_ids: Sequence[int] | None = None
     ) -> list[APIResponse]:
@@ -766,6 +859,9 @@ class _LLMClient(BaseModel):
                 tid = tasks_map.pop(task)
                 task_result = self._results.get(tid, await task)
                 assert task_result
+                assert isinstance(
+                    task_result, APIResponse
+                ), f"Expected APIResponse, got {type(task_result)}. as_completed() only works with single requests, not agent loops."
                 yield tid, task_result

         while tasks_map:
@@ -776,16 +872,19 @@ class _LLMClient(BaseModel):
                 tid = tasks_map.pop(task)
                 task_result = self._results.get(tid, await task)
                 assert task_result
+                assert isinstance(
+                    task_result, APIResponse
+                ), f"Expected APIResponse, got {type(task_result)}. as_completed() only works with single requests, not agent loops."
                 yield tid, task_result

     async def stream(
         self,
-        prompt:
-        tools:
+        prompt: Prompt,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
     ):
         model, sampling_params = self._select_model()
-
-
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         async for item in stream_chat(
             model, prompt, sampling_params, tools, None, self.extra_headers
         ):
@@ -797,23 +896,15 @@ class _LLMClient(BaseModel):
             return self.postprocess(item)
         return item

-    async def
+    async def _run_agent_loop_internal(
         self,
-
+        task_id: int,
+        conversation: Conversation,
         *,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
-
-
-        """Run a simple agent loop until no more tool calls are returned.
-
-        The provided ``conversation`` will be mutated and returned alongside the
-        final ``APIResponse`` from the model. ``tools`` may include ``Tool``
-        instances or built‑in tool dictionaries.
-        """
-
-        if isinstance(conversation, str):
-            conversation = Conversation.user(conversation)
+    ) -> AgentLoopResponse:
+        """Internal method to run agent loop and return wrapped result."""

         # Expand MCPServer objects to their constituent tools for tool execution
         expanded_tools: list[Tool] = []
@@ -861,18 +952,86 @@ class _LLMClient(BaseModel):
                     if not isinstance(result, (str, dict, list)):
                         result = str(result)

-                conversation.with_tool_result(call.id, result)  # type: ignore
+                conversation = conversation.with_tool_result(call.id, result)  # type: ignore

         if response is None:
             raise RuntimeError("model did not return a response")

-
+        result = AgentLoopResponse(conversation=conversation, final_response=response)
+        self._results[task_id] = result
+        return result
+
+    def start_agent_loop_nowait(
+        self,
+        conversation: Prompt,
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+    ) -> int:
+        """Start an agent loop without waiting for it to complete.
+
+        Returns a task_id that can be used with wait_for_agent_loop().
+        """
+        if not isinstance(conversation, Conversation):
+            conversation = prompts_to_conversations([conversation])[0]
+        assert isinstance(conversation, Conversation)
+
+        task_id = self._next_task_id
+        self._next_task_id += 1
+
+        task = asyncio.create_task(
+            self._run_agent_loop_internal(
+                task_id, conversation, tools=tools, max_rounds=max_rounds
+            )
+        )
+        self._tasks[task_id] = task
+        return task_id
+
+    async def wait_for_agent_loop(
+        self, task_id: int
+    ) -> tuple[Conversation, APIResponse]:
+        """Wait for an agent loop task to complete.
+
+        Returns the conversation and final response from the agent loop.
+        """
+        task = self._tasks.get(task_id)
+        if task:
+            result = await task
+        else:
+            result = self._results.get(task_id)
+
+        if result is None:
+            raise RuntimeError(f"Agent loop task {task_id} not found")
+
+        assert isinstance(
+            result, AgentLoopResponse
+        ), f"Expected AgentLoopResponse, got {type(result)}"
+        return result.conversation, result.final_response
+
+    async def run_agent_loop(
+        self,
+        conversation: Prompt,
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+        show_progress: bool = False,
+    ) -> tuple[Conversation, APIResponse]:
+        """Run a simple agent loop until no more tool calls are returned.
+
+        The provided ``conversation`` will be mutated and returned alongside the
+        final ``APIResponse`` from the model. ``tools`` may include ``Tool``
+        instances or built‑in tool dictionaries.
+        """
+        task_id = self.start_agent_loop_nowait(
+            conversation, tools=tools, max_rounds=max_rounds
+        )
+        return await self.wait_for_agent_loop(task_id)

     def run_agent_loop_sync(
         self,
-        conversation:
+        conversation: Prompt,
         *,
-        tools:
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
         show_progress: bool = False,
     ) -> tuple[Conversation, APIResponse]:
@@ -887,6 +1046,92 @@ class _LLMClient(BaseModel):
             )
         )

+    async def process_agent_loops_async(
+        self,
+        prompts: Sequence[Prompt],
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+        max_concurrent_agents: int = 10,
+        show_progress: bool = True,
+    ) -> list[tuple[Conversation, APIResponse]]:
+        """Process multiple agent loops concurrently.
+
+        Each prompt becomes an independent agent loop that can make multiple LLM
+        calls and execute tools until completion. The agent loops run concurrently,
+        limited by ``max_concurrent_agents``, while the underlying LLM requests
+        are still governed by ``max_concurrent_requests``.
+
+        Args:
+            prompts: Sequence of prompts, each becoming a separate agent loop.
+            tools: Tools available to all agent loops.
+            max_rounds: Maximum rounds per agent loop (default 5).
+            max_concurrent_agents: Maximum number of agent loops running
+                concurrently (default 10). This is separate from the LLM request
+                concurrency limit.
+            show_progress: Whether to show progress bar for LLM requests.
+
+        Returns:
+            List of (Conversation, APIResponse) tuples in the same order as
+            the input prompts.
+        """
+        # Convert prompts to Conversations
+        conversations = prompts_to_conversations(list(prompts))
+
+        # Ensure tracker exists for underlying LLM requests
+        if self._tracker is None:
+            self.open(total=0, show_progress=show_progress)
+            tracker_preopened = False
+        else:
+            tracker_preopened = True
+
+        # Semaphore to limit concurrent agent loops
+        agent_semaphore = asyncio.Semaphore(max_concurrent_agents)
+
+        async def run_single_loop(
+            idx: int, conv: Conversation
+        ) -> tuple[int, Conversation, APIResponse]:
+            """Run a single agent loop with semaphore protection."""
+            async with agent_semaphore:
+                task_id = self._next_task_id
+                self._next_task_id += 1
+                result = await self._run_agent_loop_internal(
+                    task_id, conv, tools=tools, max_rounds=max_rounds
+                )
+                return idx, result.conversation, result.final_response
+
+        # Launch all agent loops concurrently (semaphore limits actual concurrency)
+        tasks = [run_single_loop(idx, conv) for idx, conv in enumerate(conversations)]
+        completed = await asyncio.gather(*tasks)
+
+        # Close tracker if we opened it
+        if not tracker_preopened:
+            self.close()
+
+        # Sort by original index and extract results
+        completed_sorted = sorted(completed, key=lambda x: x[0])
+        return [(conv, resp) for _, conv, resp in completed_sorted]
+
+    def process_agent_loops_sync(
+        self,
+        prompts: Sequence[Prompt],
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+        max_concurrent_agents: int = 10,
+        show_progress: bool = True,
+    ) -> list[tuple[Conversation, APIResponse]]:
+        """Synchronous wrapper for :meth:`process_agent_loops_async`."""
+        return asyncio.run(
+            self.process_agent_loops_async(
+                prompts,
+                tools=tools,
+                max_rounds=max_rounds,
+                max_concurrent_agents=max_concurrent_agents,
+                show_progress=show_progress,
+            )
+        )
+
     async def submit_batch_job(
         self,
         prompts: Prompt | Sequence[Prompt],
@@ -953,11 +1198,15 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal[
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None,
+    global_effort: Literal["low", "medium", "high"] | None = None,
+    thinking_budget: int | None = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -982,11 +1231,15 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal[
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None,
+    global_effort: Literal["low", "medium", "high"] | None = None,
+    thinking_budget: int | None = None,
    logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -1010,11 +1263,15 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal[
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None,
+    global_effort: Literal["low", "medium", "high"] | None = None,
+    thinking_budget: int | None = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -1055,6 +1312,8 @@ def LLMClient(
         json_mode=json_mode,
         max_new_tokens=max_new_tokens,
         reasoning_effort=reasoning_effort,
+        global_effort=global_effort,
+        thinking_budget=thinking_budget,
         logprobs=logprobs,
         top_logprobs=top_logprobs,
         force_local_mcp=force_local_mcp,
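Taken together, the client.py changes raise the default temperature to 1.0, add `global_effort` and `thinking_budget` knobs, recognize the new reasoning-effort suffixes (`-xhigh`, `-minimal`, `-none`), thread an `output_schema` parameter through the request path, and introduce a queueable agent-loop API (`start_agent_loop_nowait`, `wait_for_agent_loop`, `run_agent_loop`, `process_agent_loops_async/sync`). The sketch below is inferred only from the signatures visible in this diff; the package-root import, the model id, the placeholder prompts, and the `tools=None` shortcut are illustrative assumptions rather than documented usage.

```python
# Sketch only, based on the method signatures added in this diff.
# "gpt-5-high" follows the diff's own example comment: the "-high" suffix
# sets reasoning_effort when the trimmed model name is already registered.
from lm_deluge import LLMClient  # assumed import path

client = LLMClient("gpt-5-high", thinking_budget=4_096)

# Single agent loop: runs tool calls until the model stops requesting them
# (or max_rounds is hit) and returns the Conversation plus the final APIResponse.
conversation, final_response = client.run_agent_loop_sync(
    "Look up the weather and summarize it.",  # placeholder prompt
    tools=None,   # Tool / dict / MCPServer instances would go here
    max_rounds=5,
)

# Many agent loops, run concurrently; results come back in input order.
results = client.process_agent_loops_sync(
    ["task A", "task B", "task C"],  # placeholder prompts
    max_rounds=5,
    max_concurrent_agents=10,
)
for conv, resp in results:
    print(resp)

client.print_usage()  # new helper: logs usage from the internal tracker
```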
lm_deluge/config.py
CHANGED
@@ -4,13 +4,25 @@ from pydantic import BaseModel


 class SamplingParams(BaseModel):
-    temperature: float =
+    temperature: float = 1.0  # more typical for new models
     top_p: float = 1.0
     json_mode: bool = False
-    max_new_tokens: int =
-
+    max_new_tokens: int = 2_048
+    global_effort: Literal["low", "medium", "high"] = "high"  # for opus-4.5
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None
+    thinking_budget: int | None = None
     logprobs: bool = False
     top_logprobs: int | None = None
+    strict_tools: bool = True
+    # Gemini 3 only - controls multimodal vision processing fidelity
+    media_resolution: (
+        Literal[
+            "media_resolution_low", "media_resolution_medium", "media_resolution_high"
+        ]
+        | None
+    ) = None

     def to_vllm(self):
         try:
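For reference, a sketch of constructing the expanded SamplingParams from this diff. The import path is inferred from the file location, and the specific values are illustrative, not recommended settings.

```python
# Sketch only: the expanded SamplingParams fields shown in this diff.
from lm_deluge.config import SamplingParams  # assumed import path

params = SamplingParams(
    temperature=1.0,            # new default, noted as "more typical for new models"
    max_new_tokens=2_048,       # new default
    reasoning_effort="xhigh",   # "xhigh", "minimal", "none" are newly allowed values
    global_effort="high",       # new field (the diff notes it is for opus-4.5)
    thinking_budget=8_192,      # new field: numeric thinking budget (illustrative value)
    strict_tools=True,          # new field
    media_resolution="media_resolution_high",  # new field, Gemini 3 only
)
print(params.model_dump())
```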