lm-deluge 0.0.67__py3-none-any.whl → 0.0.90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (108)
  1. lm_deluge/__init__.py +1 -2
  2. lm_deluge/api_requests/anthropic.py +117 -22
  3. lm_deluge/api_requests/base.py +84 -11
  4. lm_deluge/api_requests/bedrock.py +30 -6
  5. lm_deluge/api_requests/chat_reasoning.py +4 -0
  6. lm_deluge/api_requests/gemini.py +166 -20
  7. lm_deluge/api_requests/openai.py +145 -25
  8. lm_deluge/batches.py +15 -45
  9. lm_deluge/client.py +309 -50
  10. lm_deluge/config.py +15 -3
  11. lm_deluge/models/__init__.py +14 -1
  12. lm_deluge/models/anthropic.py +29 -14
  13. lm_deluge/models/arcee.py +16 -0
  14. lm_deluge/models/deepseek.py +36 -4
  15. lm_deluge/models/google.py +42 -0
  16. lm_deluge/models/grok.py +24 -0
  17. lm_deluge/models/kimi.py +36 -0
  18. lm_deluge/models/minimax.py +18 -0
  19. lm_deluge/models/openai.py +100 -0
  20. lm_deluge/models/openrouter.py +133 -7
  21. lm_deluge/models/together.py +11 -0
  22. lm_deluge/models/zai.py +50 -0
  23. lm_deluge/pipelines/gepa/__init__.py +95 -0
  24. lm_deluge/pipelines/gepa/core.py +354 -0
  25. lm_deluge/pipelines/gepa/docs/samples.py +705 -0
  26. lm_deluge/pipelines/gepa/examples/01_synthetic_keywords.py +140 -0
  27. lm_deluge/pipelines/gepa/examples/02_gsm8k_math.py +261 -0
  28. lm_deluge/pipelines/gepa/examples/03_hotpotqa_multihop.py +300 -0
  29. lm_deluge/pipelines/gepa/examples/04_batch_classification.py +271 -0
  30. lm_deluge/pipelines/gepa/examples/simple_qa.py +129 -0
  31. lm_deluge/pipelines/gepa/optimizer.py +435 -0
  32. lm_deluge/pipelines/gepa/proposer.py +235 -0
  33. lm_deluge/pipelines/gepa/util.py +165 -0
  34. lm_deluge/{llm_tools → pipelines}/score.py +2 -2
  35. lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
  36. lm_deluge/prompt.py +537 -88
  37. lm_deluge/request_context.py +7 -2
  38. lm_deluge/server/__init__.py +24 -0
  39. lm_deluge/server/__main__.py +144 -0
  40. lm_deluge/server/adapters.py +369 -0
  41. lm_deluge/server/app.py +388 -0
  42. lm_deluge/server/auth.py +71 -0
  43. lm_deluge/server/model_policy.py +215 -0
  44. lm_deluge/server/models_anthropic.py +172 -0
  45. lm_deluge/server/models_openai.py +175 -0
  46. lm_deluge/tool/__init__.py +1130 -0
  47. lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
  48. lm_deluge/tool/builtin/anthropic/bash.py +0 -0
  49. lm_deluge/tool/builtin/anthropic/computer_use.py +0 -0
  50. lm_deluge/tool/builtin/gemini.py +59 -0
  51. lm_deluge/tool/builtin/openai.py +74 -0
  52. lm_deluge/tool/cua/__init__.py +173 -0
  53. lm_deluge/tool/cua/actions.py +148 -0
  54. lm_deluge/tool/cua/base.py +27 -0
  55. lm_deluge/tool/cua/batch.py +215 -0
  56. lm_deluge/tool/cua/converters.py +466 -0
  57. lm_deluge/tool/cua/kernel.py +702 -0
  58. lm_deluge/tool/cua/trycua.py +989 -0
  59. lm_deluge/tool/prefab/__init__.py +45 -0
  60. lm_deluge/tool/prefab/batch_tool.py +156 -0
  61. lm_deluge/tool/prefab/docs.py +1119 -0
  62. lm_deluge/tool/prefab/email.py +294 -0
  63. lm_deluge/tool/prefab/filesystem.py +1711 -0
  64. lm_deluge/tool/prefab/full_text_search/__init__.py +285 -0
  65. lm_deluge/tool/prefab/full_text_search/tantivy_index.py +396 -0
  66. lm_deluge/tool/prefab/memory.py +458 -0
  67. lm_deluge/tool/prefab/otc/__init__.py +165 -0
  68. lm_deluge/tool/prefab/otc/executor.py +281 -0
  69. lm_deluge/tool/prefab/otc/parse.py +188 -0
  70. lm_deluge/tool/prefab/random.py +212 -0
  71. lm_deluge/tool/prefab/rlm/__init__.py +296 -0
  72. lm_deluge/tool/prefab/rlm/executor.py +349 -0
  73. lm_deluge/tool/prefab/rlm/parse.py +144 -0
  74. lm_deluge/tool/prefab/sandbox/__init__.py +19 -0
  75. lm_deluge/tool/prefab/sandbox/daytona_sandbox.py +483 -0
  76. lm_deluge/tool/prefab/sandbox/docker_sandbox.py +609 -0
  77. lm_deluge/tool/prefab/sandbox/fargate_sandbox.py +546 -0
  78. lm_deluge/tool/prefab/sandbox/modal_sandbox.py +469 -0
  79. lm_deluge/tool/prefab/sandbox/seatbelt_sandbox.py +827 -0
  80. lm_deluge/tool/prefab/sheets.py +385 -0
  81. lm_deluge/tool/prefab/skills.py +0 -0
  82. lm_deluge/tool/prefab/subagents.py +233 -0
  83. lm_deluge/tool/prefab/todos.py +342 -0
  84. lm_deluge/tool/prefab/tool_search.py +169 -0
  85. lm_deluge/tool/prefab/web_search.py +199 -0
  86. lm_deluge/tracker.py +16 -13
  87. lm_deluge/util/schema.py +412 -0
  88. lm_deluge/warnings.py +8 -0
  89. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/METADATA +23 -9
  90. lm_deluge-0.0.90.dist-info/RECORD +132 -0
  91. lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
  92. lm_deluge/built_in_tools/openai.py +0 -28
  93. lm_deluge/presets/cerebras.py +0 -17
  94. lm_deluge/presets/meta.py +0 -13
  95. lm_deluge/tool.py +0 -849
  96. lm_deluge-0.0.67.dist-info/RECORD +0 -72
  97. lm_deluge/{llm_tools → pipelines}/__init__.py +1 -1
  98. /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
  99. /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
  100. /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
  101. /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
  102. /lm_deluge/{built_in_tools/anthropic/bash.py → skills/anthropic.py} +0 -0
  103. /lm_deluge/{built_in_tools/anthropic/computer_use.py → skills/compat.py} +0 -0
  104. /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
  105. /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
  106. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/WHEEL +0 -0
  107. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/licenses/LICENSE +0 -0
  108. {lm_deluge-0.0.67.dist-info → lm_deluge-0.0.90.dist-info}/top_level.txt +0 -0
lm_deluge/client.py CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+from dataclasses import dataclass
 from typing import (
     Any,
     AsyncGenerator,
@@ -37,6 +38,14 @@ from .request_context import RequestContext
 from .tracker import StatusTracker


+@dataclass
+class AgentLoopResponse:
+    """Wrapper for agent loop results to distinguish from single request results."""
+
+    conversation: Conversation
+    final_response: APIResponse
+
+
 # TODO: add optional max_input_tokens to client so we can reject long prompts to prevent abuse
 class _LLMClient(BaseModel):
     """
@@ -44,10 +53,15 @@ class _LLMClient(BaseModel):
     Keeps all validation, serialization, and existing functionality.
     """

-    _REASONING_SUFFIXES: ClassVar[dict[str, Literal["low", "medium", "high"]]] = {
+    _REASONING_SUFFIXES: ClassVar[
+        dict[str, Literal["low", "medium", "high", "xhigh", "minimal", "none"]]
+    ] = {
         "-low": "low",
         "-medium": "medium",
         "-high": "high",
+        "-xhigh": "xhigh",
+        "-minimal": "minimal",
+        "-none": "none",
     }

     model_names: str | list[str] = ["gpt-4.1-mini"]
@@ -66,11 +80,15 @@ class _LLMClient(BaseModel):
     background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
-    temperature: float = 0.75
+    temperature: float = 1.0
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None
+    global_effort: Literal["low", "medium", "high"] | None = None
+    thinking_budget: int | None = None
     logprobs: bool = False
     top_logprobs: int | None = None
     force_local_mcp: bool = False
@@ -84,10 +102,17 @@ class _LLMClient(BaseModel):
     # Internal state for async task handling
     _next_task_id: int = PrivateAttr(default=0)
     _tasks: dict[int, asyncio.Task] = PrivateAttr(default_factory=dict)
-    _results: dict[int, APIResponse] = PrivateAttr(default_factory=dict)
+    _results: dict[int, APIResponse | AgentLoopResponse] = PrivateAttr(
+        default_factory=dict
+    )
     _tracker: StatusTracker | None = PrivateAttr(default=None)
     _capacity_lock: asyncio.Lock = PrivateAttr(default_factory=asyncio.Lock)

+    # usage
+    def print_usage(self):
+        if self._tracker:
+            self._tracker.log_usage()
+
     # Progress management for queueing API
     def open(self, total: int | None = None, show_progress: bool = True):
         self._tracker = StatusTracker(
@@ -149,9 +174,14 @@ class _LLMClient(BaseModel):

     def _normalize_model_names(
         self, models: list[str]
-    ) -> tuple[list[str], list[Literal["low", "medium", "high"] | None]]:
+    ) -> tuple[
+        list[str],
+        list[Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None],
+    ]:
         normalized: list[str] = []
-        efforts: list[Literal["low", "medium", "high"] | None] = []
+        efforts: list[
+            Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None
+        ] = []

         for name in models:
             base_name = self._preprocess_openrouter_model(name)
@@ -164,7 +194,10 @@ class _LLMClient(BaseModel):
         return normalized, efforts

     def _align_sampling_params(
-        self, per_model_efforts: list[Literal["low", "medium", "high"] | None]
+        self,
+        per_model_efforts: list[
+            Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None
+        ],
     ) -> None:
         if len(per_model_efforts) < len(self.model_names):
             per_model_efforts = per_model_efforts + [None] * (
@@ -187,6 +220,8 @@ class _LLMClient(BaseModel):
                json_mode=self.json_mode,
                max_new_tokens=self.max_new_tokens,
                reasoning_effort=self.reasoning_effort,
+                global_effort=self.global_effort or "high",
+                thinking_budget=self.thinking_budget,
                logprobs=self.logprobs,
                top_logprobs=self.top_logprobs,
            )
@@ -242,6 +277,7 @@ class _LLMClient(BaseModel):
             self.max_tokens_per_minute = max_tokens_per_minute
         if max_concurrent_requests:
             self.max_concurrent_requests = max_concurrent_requests
+        return self

     def _get_tracker(self) -> StatusTracker:
         if self._tracker is None:
@@ -253,6 +289,28 @@ class _LLMClient(BaseModel):
     def models(self):
         return self.model_names  # why? idk

+    @staticmethod
+    def _preprocess_tinker_model(model_name: str) -> str:
+        if model_name.startswith("tinker://"):
+            model_id = model_name
+            if model_id not in registry:
+                register_model(
+                    id=model_name,
+                    name=model_name,
+                    api_base="https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1",
+                    api_key_env_var="TINKER_API_KEY",
+                    api_spec="openai",
+                    supports_json=True,
+                    supports_logprobs=False,
+                    supports_responses=False,
+                    input_cost=0,  # Unknown costs for arbitrary tinker models
+                    cached_input_cost=0,
+                    cache_write_cost=0,
+                    output_cost=0,
+                )
+
+        return model_name
+
     @staticmethod
     def _preprocess_openrouter_model(model_name: str) -> str:
         """Process openrouter: prefix and register model if needed."""
@@ -279,7 +337,8 @@ class _LLMClient(BaseModel):
            )

            return model_id
-        return model_name
+        else:
+            return model_name

     @model_validator(mode="before")
     @classmethod
@@ -292,6 +351,9 @@ class _LLMClient(BaseModel):
            # First, handle OpenRouter prefix
            model_name = cls._preprocess_openrouter_model(model_names)

+            # next handle tinker prefix
+            model_name = cls._preprocess_tinker_model(model_name)
+
            # Then handle reasoning effort suffix (e.g., "gpt-5-high")
            model_name, effort = cls._strip_reasoning_suffix_if_registered(model_name)
            if effort and data.get("reasoning_effort") is None:
@@ -316,11 +378,13 @@ class _LLMClient(BaseModel):
        if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
            data["sampling_params"] = [
                SamplingParams(
-                    temperature=data.get("temperature", 0.75),
+                    temperature=data.get("temperature", 1.0),
                    top_p=data.get("top_p", 1.0),
                    json_mode=data.get("json_mode", False),
                    max_new_tokens=data.get("max_new_tokens", 512),
                    reasoning_effort=data.get("reasoning_effort", None),
+                    global_effort=data.get("global_effort") or "high",
+                    thinking_budget=data.get("thinking_budget", None),
                    logprobs=data.get("logprobs", False),
                    top_logprobs=data.get("top_logprobs", None),
                )
@@ -332,7 +396,9 @@ class _LLMClient(BaseModel):
     @classmethod
     def _strip_reasoning_suffix_if_registered(
         cls, model_name: str
-    ) -> tuple[str, Literal["low", "medium", "high"] | None]:
+    ) -> tuple[
+        str, Literal["low", "medium", "high", "xhigh", "minimal", "none"] | None
+    ]:
         """Remove reasoning suffix only when the trimmed model already exists."""
         for suffix, effort in cls._REASONING_SUFFIXES.items():
             if model_name.endswith(suffix) and len(model_name) > len(suffix):
@@ -364,6 +430,15 @@ class _LLMClient(BaseModel):
            assert (
                self.use_responses_api
            ), "background mode only allowed for responses api"
+
+        # codex models require responses api
+        for model_name in self.model_names:
+            if "codex" in model_name.lower() and not self.use_responses_api:
+                raise ValueError(
+                    f"Model '{model_name}' requires use_responses_api=True. "
+                    "Codex models are only available via the Responses API."
+                )
+
        # Auto-generate name if not provided
        if self.name is None:
            if len(self.model_names) == 1:
@@ -542,7 +617,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: Literal[True],
         show_progress: bool = ...,
-        tools: list[Tool | dict | MCPServer] | None = ...,
+        tools: Sequence[Tool | dict | MCPServer] | None = ...,
+        output_schema: type[BaseModel] | dict | None = ...,
         cache: CachePattern | None = ...,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[str | None]: ...
@@ -554,7 +630,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: Literal[False] = ...,
         show_progress: bool = ...,
-        tools: list[Tool | dict | MCPServer] | None = ...,
+        tools: Sequence[Tool | dict | MCPServer] | None = ...,
+        output_schema: type[BaseModel] | dict | None = ...,
         cache: CachePattern | None = ...,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[APIResponse]: ...
@@ -565,7 +642,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: bool = False,
         show_progress: bool = True,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> list[APIResponse] | list[str | None] | dict[str, int]:
@@ -594,6 +672,7 @@ class _LLMClient(BaseModel):
            task_id = self.start_nowait(
                prompt,
                tools=tools,
+                output_schema=output_schema,
                cache=cache,
                service_tier=service_tier,
            )
@@ -638,7 +717,8 @@ class _LLMClient(BaseModel):
         *,
         return_completions_only: bool = False,
         show_progress=True,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
     ):
         return asyncio.run(
@@ -647,6 +727,7 @@ class _LLMClient(BaseModel):
                return_completions_only=return_completions_only,
                show_progress=show_progress,
                tools=tools,
+                output_schema=output_schema,
                cache=cache,
            )
        )
@@ -669,7 +750,8 @@ class _LLMClient(BaseModel):
         self,
         prompt: Prompt,
         *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> int:
@@ -688,6 +770,7 @@ class _LLMClient(BaseModel):
            request_timeout=self.request_timeout,
            status_tracker=tracker,
            tools=tools,
+            output_schema=output_schema,
            cache=cache,
            use_responses_api=self.use_responses_api,
            background=self.background,
@@ -702,25 +785,30 @@ class _LLMClient(BaseModel):

     async def start(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        output_schema: type[BaseModel] | dict | None = None,
         cache: CachePattern | None = None,
         service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> APIResponse:
         task_id = self.start_nowait(
-            prompt, tools=tools, cache=cache, service_tier=service_tier
+            prompt,
+            tools=tools,
+            output_schema=output_schema,
+            cache=cache,
+            service_tier=service_tier,
         )
         return await self.wait_for(task_id)

     async def wait_for(self, task_id: int) -> APIResponse:
         task = self._tasks.get(task_id)
         if task:
-            return await task
-        res = self._results.get(task_id)
-        if res:
-            return res
+            result = await task
         else:
+            result = self._results.get(task_id)
+
+        if result is None:
             return APIResponse(
                 id=-1,
                 model_internal="",
@@ -731,6 +819,11 @@ class _LLMClient(BaseModel):
                 error_message="Task not found",
             )

+        assert isinstance(
+            result, APIResponse
+        ), f"Expected APIResponse, got {type(result)}. Use wait_for_agent_loop for agent loop tasks."
+        return result
+
     async def wait_for_all(
         self, task_ids: Sequence[int] | None = None
     ) -> list[APIResponse]:
@@ -766,6 +859,9 @@ class _LLMClient(BaseModel):
            tid = tasks_map.pop(task)
            task_result = self._results.get(tid, await task)
            assert task_result
+            assert isinstance(
+                task_result, APIResponse
+            ), f"Expected APIResponse, got {type(task_result)}. as_completed() only works with single requests, not agent loops."
            yield tid, task_result

        while tasks_map:
@@ -776,16 +872,19 @@ class _LLMClient(BaseModel):
                tid = tasks_map.pop(task)
                task_result = self._results.get(tid, await task)
                assert task_result
+                assert isinstance(
+                    task_result, APIResponse
+                ), f"Expected APIResponse, got {type(task_result)}. as_completed() only works with single requests, not agent loops."
                yield tid, task_result

     async def stream(
         self,
-        prompt: str | Conversation,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        prompt: Prompt,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
     ):
         model, sampling_params = self._select_model()
-        if isinstance(prompt, str):
-            prompt = Conversation.user(prompt)
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         async for item in stream_chat(
             model, prompt, sampling_params, tools, None, self.extra_headers
         ):
@@ -797,23 +896,15 @@ class _LLMClient(BaseModel):
             return self.postprocess(item)
         return item

-    async def run_agent_loop(
+    async def _run_agent_loop_internal(
         self,
-        conversation: str | Conversation,
+        task_id: int,
+        conversation: Conversation,
         *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
-        show_progress: bool = False,
-    ) -> tuple[Conversation, APIResponse]:
-        """Run a simple agent loop until no more tool calls are returned.
-
-        The provided ``conversation`` will be mutated and returned alongside the
-        final ``APIResponse`` from the model. ``tools`` may include ``Tool``
-        instances or built‑in tool dictionaries.
-        """
-
-        if isinstance(conversation, str):
-            conversation = Conversation.user(conversation)
+    ) -> AgentLoopResponse:
+        """Internal method to run agent loop and return wrapped result."""

         # Expand MCPServer objects to their constituent tools for tool execution
         expanded_tools: list[Tool] = []
@@ -861,18 +952,86 @@ class _LLMClient(BaseModel):
                if not isinstance(result, (str, dict, list)):
                    result = str(result)

-                conversation.with_tool_result(call.id, result)  # type: ignore
+                conversation = conversation.with_tool_result(call.id, result)  # type: ignore

        if response is None:
            raise RuntimeError("model did not return a response")

-        return conversation, response
+        result = AgentLoopResponse(conversation=conversation, final_response=response)
+        self._results[task_id] = result
+        return result
+
+    def start_agent_loop_nowait(
+        self,
+        conversation: Prompt,
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+    ) -> int:
+        """Start an agent loop without waiting for it to complete.
+
+        Returns a task_id that can be used with wait_for_agent_loop().
+        """
+        if not isinstance(conversation, Conversation):
+            conversation = prompts_to_conversations([conversation])[0]
+        assert isinstance(conversation, Conversation)
+
+        task_id = self._next_task_id
+        self._next_task_id += 1
+
+        task = asyncio.create_task(
+            self._run_agent_loop_internal(
+                task_id, conversation, tools=tools, max_rounds=max_rounds
+            )
+        )
+        self._tasks[task_id] = task
+        return task_id
+
+    async def wait_for_agent_loop(
+        self, task_id: int
+    ) -> tuple[Conversation, APIResponse]:
+        """Wait for an agent loop task to complete.
+
+        Returns the conversation and final response from the agent loop.
+        """
+        task = self._tasks.get(task_id)
+        if task:
+            result = await task
+        else:
+            result = self._results.get(task_id)
+
+        if result is None:
+            raise RuntimeError(f"Agent loop task {task_id} not found")
+
+        assert isinstance(
+            result, AgentLoopResponse
+        ), f"Expected AgentLoopResponse, got {type(result)}"
+        return result.conversation, result.final_response
+
+    async def run_agent_loop(
+        self,
+        conversation: Prompt,
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+        show_progress: bool = False,
+    ) -> tuple[Conversation, APIResponse]:
+        """Run a simple agent loop until no more tool calls are returned.
+
+        The provided ``conversation`` will be mutated and returned alongside the
+        final ``APIResponse`` from the model. ``tools`` may include ``Tool``
+        instances or built‑in tool dictionaries.
+        """
+        task_id = self.start_agent_loop_nowait(
+            conversation, tools=tools, max_rounds=max_rounds
+        )
+        return await self.wait_for_agent_loop(task_id)

     def run_agent_loop_sync(
         self,
-        conversation: str | Conversation,
+        conversation: Prompt,
         *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
         show_progress: bool = False,
     ) -> tuple[Conversation, APIResponse]:
@@ -887,6 +1046,92 @@ class _LLMClient(BaseModel):
            )
        )

+    async def process_agent_loops_async(
+        self,
+        prompts: Sequence[Prompt],
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+        max_concurrent_agents: int = 10,
+        show_progress: bool = True,
+    ) -> list[tuple[Conversation, APIResponse]]:
+        """Process multiple agent loops concurrently.
+
+        Each prompt becomes an independent agent loop that can make multiple LLM
+        calls and execute tools until completion. The agent loops run concurrently,
+        limited by ``max_concurrent_agents``, while the underlying LLM requests
+        are still governed by ``max_concurrent_requests``.
+
+        Args:
+            prompts: Sequence of prompts, each becoming a separate agent loop.
+            tools: Tools available to all agent loops.
+            max_rounds: Maximum rounds per agent loop (default 5).
+            max_concurrent_agents: Maximum number of agent loops running
+                concurrently (default 10). This is separate from the LLM request
+                concurrency limit.
+            show_progress: Whether to show progress bar for LLM requests.
+
+        Returns:
+            List of (Conversation, APIResponse) tuples in the same order as
+            the input prompts.
+        """
+        # Convert prompts to Conversations
+        conversations = prompts_to_conversations(list(prompts))
+
+        # Ensure tracker exists for underlying LLM requests
+        if self._tracker is None:
+            self.open(total=0, show_progress=show_progress)
+            tracker_preopened = False
+        else:
+            tracker_preopened = True
+
+        # Semaphore to limit concurrent agent loops
+        agent_semaphore = asyncio.Semaphore(max_concurrent_agents)
+
+        async def run_single_loop(
+            idx: int, conv: Conversation
+        ) -> tuple[int, Conversation, APIResponse]:
+            """Run a single agent loop with semaphore protection."""
+            async with agent_semaphore:
+                task_id = self._next_task_id
+                self._next_task_id += 1
+                result = await self._run_agent_loop_internal(
+                    task_id, conv, tools=tools, max_rounds=max_rounds
+                )
+                return idx, result.conversation, result.final_response
+
+        # Launch all agent loops concurrently (semaphore limits actual concurrency)
+        tasks = [run_single_loop(idx, conv) for idx, conv in enumerate(conversations)]
+        completed = await asyncio.gather(*tasks)
+
+        # Close tracker if we opened it
+        if not tracker_preopened:
+            self.close()
+
+        # Sort by original index and extract results
+        completed_sorted = sorted(completed, key=lambda x: x[0])
+        return [(conv, resp) for _, conv, resp in completed_sorted]
+
+    def process_agent_loops_sync(
+        self,
+        prompts: Sequence[Prompt],
+        *,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
+        max_rounds: int = 5,
+        max_concurrent_agents: int = 10,
+        show_progress: bool = True,
+    ) -> list[tuple[Conversation, APIResponse]]:
+        """Synchronous wrapper for :meth:`process_agent_loops_async`."""
+        return asyncio.run(
+            self.process_agent_loops_async(
+                prompts,
+                tools=tools,
+                max_rounds=max_rounds,
+                max_concurrent_agents=max_concurrent_agents,
+                show_progress=show_progress,
+            )
+        )
+
     async def submit_batch_job(
         self,
         prompts: Prompt | Sequence[Prompt],
@@ -953,11 +1198,15 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0.75,
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None,
+    global_effort: Literal["low", "medium", "high"] | None = None,
+    thinking_budget: int | None = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -982,11 +1231,15 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0.75,
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None,
+    global_effort: Literal["low", "medium", "high"] | None = None,
+    thinking_budget: int | None = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -1010,11 +1263,15 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0.75,
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None,
+    global_effort: Literal["low", "medium", "high"] | None = None,
+    thinking_budget: int | None = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -1055,6 +1312,8 @@ def LLMClient(
         json_mode=json_mode,
         max_new_tokens=max_new_tokens,
         reasoning_effort=reasoning_effort,
+        global_effort=global_effort,
+        thinking_budget=thinking_budget,
         logprobs=logprobs,
         top_logprobs=top_logprobs,
         force_local_mcp=force_local_mcp,
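
The client.py hunks above add a concurrent agent-loop API (start_agent_loop_nowait, wait_for_agent_loop, run_agent_loop, and process_agent_loops_sync/process_agent_loops_async). A minimal usage sketch follows, based only on the signatures visible in this diff; the import path, the assumption that the LLMClient factory takes a model name as its first argument, and the example prompts are illustrative, not taken from the package documentation.

# Minimal sketch, not the package's documented example. Assumes the LLMClient
# factory accepts a model name positionally and that plain strings are valid
# Prompt values, as suggested by prompts_to_conversations() in the diff.
from lm_deluge import LLMClient

client = LLMClient("gpt-4.1-mini", max_new_tokens=2_048)

# Each prompt becomes an independent agent loop; loops run concurrently up to
# max_concurrent_agents, while LLM requests still obey the client's rate limits.
results = client.process_agent_loops_sync(
    ["What is 2 + 2?", "Name three prime numbers."],
    max_rounds=5,
    max_concurrent_agents=10,
)

# Results are (Conversation, APIResponse) tuples in the same order as the prompts.
for conversation, final_response in results:
    print(final_response)
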
lm_deluge/config.py CHANGED
@@ -4,13 +4,25 @@ from pydantic import BaseModel


 class SamplingParams(BaseModel):
-    temperature: float = 0.0
+    temperature: float = 1.0  # more typical for new models
     top_p: float = 1.0
     json_mode: bool = False
-    max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
+    max_new_tokens: int = 2_048
+    global_effort: Literal["low", "medium", "high"] = "high"  # for opus-4.5
+    reasoning_effort: Literal[
+        "low", "medium", "high", "xhigh", "minimal", "none", None
+    ] = None
+    thinking_budget: int | None = None
     logprobs: bool = False
     top_logprobs: int | None = None
+    strict_tools: bool = True
+    # Gemini 3 only - controls multimodal vision processing fidelity
+    media_resolution: (
+        Literal[
+            "media_resolution_low", "media_resolution_medium", "media_resolution_high"
+        ]
+        | None
+    ) = None

     def to_vllm(self):
         try:
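
For reference, a short sketch of constructing the expanded SamplingParams; the field names and defaults come from the hunk above, while the specific values chosen here are only illustrative.

# Field names mirror the SamplingParams hunk above; the values are illustrative.
from lm_deluge.config import SamplingParams

params = SamplingParams(
    temperature=1.0,           # new default (was 0.0)
    max_new_tokens=2_048,      # raised from 512
    reasoning_effort="xhigh",  # newly accepted literal
    global_effort="high",      # new field, defaults to "high"
    thinking_budget=8_192,     # new optional field
    strict_tools=True,         # new flag, defaults to True
    media_resolution="media_resolution_high",  # Gemini 3 only
)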