lm-deluge 0.0.56-py3-none-any.whl → 0.0.69-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (38)
  1. lm_deluge/__init__.py +12 -1
  2. lm_deluge/api_requests/anthropic.py +12 -1
  3. lm_deluge/api_requests/base.py +87 -5
  4. lm_deluge/api_requests/bedrock.py +3 -4
  5. lm_deluge/api_requests/chat_reasoning.py +4 -0
  6. lm_deluge/api_requests/gemini.py +7 -6
  7. lm_deluge/api_requests/mistral.py +8 -9
  8. lm_deluge/api_requests/openai.py +179 -124
  9. lm_deluge/batches.py +25 -9
  10. lm_deluge/client.py +280 -67
  11. lm_deluge/config.py +1 -1
  12. lm_deluge/file.py +382 -13
  13. lm_deluge/mock_openai.py +482 -0
  14. lm_deluge/models/__init__.py +12 -8
  15. lm_deluge/models/anthropic.py +12 -20
  16. lm_deluge/models/bedrock.py +0 -14
  17. lm_deluge/models/cohere.py +0 -16
  18. lm_deluge/models/google.py +0 -20
  19. lm_deluge/models/grok.py +48 -4
  20. lm_deluge/models/groq.py +2 -2
  21. lm_deluge/models/kimi.py +34 -0
  22. lm_deluge/models/meta.py +0 -8
  23. lm_deluge/models/minimax.py +10 -0
  24. lm_deluge/models/openai.py +28 -34
  25. lm_deluge/models/openrouter.py +64 -1
  26. lm_deluge/models/together.py +0 -16
  27. lm_deluge/prompt.py +138 -29
  28. lm_deluge/request_context.py +9 -11
  29. lm_deluge/tool.py +395 -19
  30. lm_deluge/tracker.py +11 -5
  31. lm_deluge/warnings.py +46 -0
  32. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/METADATA +3 -1
  33. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/RECORD +36 -33
  34. lm_deluge/agent.py +0 -0
  35. lm_deluge/gemini_limits.py +0 -65
  36. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/WHEEL +0 -0
  37. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/licenses/LICENSE +0 -0
  38. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/top_level.txt +0 -0
lm_deluge/client.py CHANGED
@@ -1,5 +1,15 @@
 import asyncio
-from typing import Any, AsyncGenerator, Callable, Literal, Self, Sequence, overload
+from typing import (
+    Any,
+    AsyncGenerator,
+    Callable,
+    ClassVar,
+    Literal,
+    Self,
+    Sequence,
+    cast,
+    overload,
+)

 import numpy as np
 import yaml
@@ -12,12 +22,17 @@ from lm_deluge.batches import (
     submit_batches_oa,
     wait_for_batch_completion_async,
 )
-from lm_deluge.prompt import CachePattern, Conversation, prompts_to_conversations
+from lm_deluge.prompt import (
+    CachePattern,
+    Conversation,
+    Prompt,
+    prompts_to_conversations,
+)
 from lm_deluge.tool import MCPServer, Tool

 from .api_requests.base import APIResponse
 from .config import SamplingParams
-from .models import APIModel, registry
+from .models import APIModel, register_model, registry
 from .request_context import RequestContext
 from .tracker import StatusTracker

@@ -29,6 +44,12 @@ class _LLMClient(BaseModel):
     Keeps all validation, serialization, and existing functionality.
     """

+    _REASONING_SUFFIXES: ClassVar[dict[str, Literal["low", "medium", "high"]]] = {
+        "-low": "low",
+        "-medium": "medium",
+        "-high": "high",
+    }
+
     model_names: str | list[str] = ["gpt-4.1-mini"]
     name: str | None = None
     max_requests_per_minute: int = 1_000
@@ -40,13 +61,16 @@ class _LLMClient(BaseModel):
     request_timeout: int = 30
     cache: Any = None
     extra_headers: dict[str, str] | None = None
+    extra_body: dict[str, str] | None = None
+    use_responses_api: bool = False
+    background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
     temperature: float = 0.75
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", None] = None
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
     logprobs: bool = False
     top_logprobs: int | None = None
     force_local_mcp: bool = False
@@ -100,13 +124,112 @@ class _LLMClient(BaseModel):

     # NEW! Builder methods
     def with_model(self, model: str):
-        self.model_names = [model]
+        self._update_models([model])
         return self

     def with_models(self, models: list[str]):
-        self.model_names = models
+        self._update_models(models)
         return self

+    def _update_models(self, models: list[str]) -> None:
+        normalized, per_model_efforts = self._normalize_model_names(models)
+        if self.reasoning_effort is None:
+            unique_efforts = {eff for eff in per_model_efforts if eff is not None}
+            if len(normalized) == 1 and per_model_efforts[0] is not None:
+                self.reasoning_effort = per_model_efforts[0]
+            elif (
+                len(unique_efforts) == 1
+                and len(unique_efforts) != 0
+                and None not in per_model_efforts
+            ):
+                self.reasoning_effort = next(iter(unique_efforts))  # type: ignore
+        self.model_names = normalized
+        self._align_sampling_params(per_model_efforts)
+        self._reset_model_weights()
+
+    def _normalize_model_names(
+        self, models: list[str]
+    ) -> tuple[list[str], list[Literal["low", "medium", "high"] | None]]:
+        normalized: list[str] = []
+        efforts: list[Literal["low", "medium", "high"] | None] = []
+
+        for name in models:
+            base_name = self._preprocess_openrouter_model(name)
+            trimmed_name, effort = self.__class__._strip_reasoning_suffix_if_registered(
+                base_name
+            )
+            normalized.append(trimmed_name)
+            efforts.append(effort)
+
+        return normalized, efforts
+
+    def _align_sampling_params(
+        self, per_model_efforts: list[Literal["low", "medium", "high"] | None]
+    ) -> None:
+        if len(per_model_efforts) < len(self.model_names):
+            per_model_efforts = per_model_efforts + [None] * (
+                len(self.model_names) - len(per_model_efforts)
+            )
+
+        if not self.model_names:
+            self.sampling_params = []
+            return
+
+        if not self.sampling_params:
+            self.sampling_params = []
+
+        if len(self.sampling_params) == 0:
+            for _ in self.model_names:
+                self.sampling_params.append(
+                    SamplingParams(
+                        temperature=self.temperature,
+                        top_p=self.top_p,
+                        json_mode=self.json_mode,
+                        max_new_tokens=self.max_new_tokens,
+                        reasoning_effort=self.reasoning_effort,
+                        logprobs=self.logprobs,
+                        top_logprobs=self.top_logprobs,
+                    )
+                )
+        elif len(self.sampling_params) == 1 and len(self.model_names) > 1:
+            base_param = self.sampling_params[0]
+            self.sampling_params = [
+                base_param.model_copy(deep=True) for _ in self.model_names
+            ]
+        elif len(self.sampling_params) != len(self.model_names):
+            base_param = self.sampling_params[0]
+            self.sampling_params = [
+                base_param.model_copy(deep=True) for _ in self.model_names
+            ]
+
+        if self.reasoning_effort is not None:
+            for sp in self.sampling_params:
+                sp.reasoning_effort = self.reasoning_effort
+        else:
+            for sp, effort in zip(self.sampling_params, per_model_efforts):
+                if effort is not None:
+                    sp.reasoning_effort = effort
+
+    def _reset_model_weights(self) -> None:
+        if not self.model_names:
+            self.model_weights = []
+            return
+
+        if isinstance(self.model_weights, list):
+            if len(self.model_weights) == len(self.model_names) and any(
+                self.model_weights
+            ):
+                total = sum(self.model_weights)
+                if total == 0:
+                    self.model_weights = [
+                        1 / len(self.model_names) for _ in self.model_names
+                    ]
+                else:
+                    self.model_weights = [w / total for w in self.model_weights]
+                return
+        # Fallback to uniform distribution
+        self.model_weights = [1 / len(self.model_names) for _ in self.model_names]
+
     def with_limits(
         self,
         max_requests_per_minute: int | None = None,
@@ -130,11 +253,64 @@ class _LLMClient(BaseModel):
     def models(self):
         return self.model_names  # why? idk

+    @staticmethod
+    def _preprocess_openrouter_model(model_name: str) -> str:
+        """Process openrouter: prefix and register model if needed."""
+        if model_name.startswith("openrouter:"):
+            slug = model_name.split(":", 1)[1]  # Everything after "openrouter:"
+            # Create a unique id by replacing slashes with hyphens
+            model_id = f"openrouter-{slug.replace('/', '-')}"
+
+            # Register the model if not already in registry
+            if model_id not in registry:
+                register_model(
+                    id=model_id,
+                    name=slug,  # The full slug sent to OpenRouter API (e.g., "openrouter/andromeda-alpha")
+                    api_base="https://openrouter.ai/api/v1",
+                    api_key_env_var="OPENROUTER_API_KEY",
+                    api_spec="openai",
+                    supports_json=True,
+                    supports_logprobs=False,
+                    supports_responses=False,
+                    input_cost=0,  # Unknown costs for generic models
+                    cached_input_cost=0,
+                    cache_write_cost=0,
+                    output_cost=0,
+                )
+
+            return model_id
+        return model_name
+
     @model_validator(mode="before")
     @classmethod
     def fix_lists(cls, data) -> "_LLMClient":
-        if isinstance(data.get("model_names"), str):
-            data["model_names"] = [data["model_names"]]
+        # Process model_names - handle both strings and lists
+        model_names = data.get("model_names")
+
+        if isinstance(model_names, str):
+            # Single model as string
+            # First, handle OpenRouter prefix
+            model_name = cls._preprocess_openrouter_model(model_names)
+
+            # Then handle reasoning effort suffix (e.g., "gpt-5-high")
+            model_name, effort = cls._strip_reasoning_suffix_if_registered(model_name)
+            if effort and data.get("reasoning_effort") is None:
+                data["reasoning_effort"] = effort
+
+            data["model_names"] = [model_name]
+
+        elif isinstance(model_names, list):
+            # List of models - process each one
+            processed_models = []
+            for model_name in model_names:
+                # Handle OpenRouter prefix for each model
+                processed_model = cls._preprocess_openrouter_model(model_name)
+                processed_model, _ = cls._strip_reasoning_suffix_if_registered(
+                    processed_model
+                )
+                processed_models.append(processed_model)
+            data["model_names"] = processed_models
+
         if not isinstance(data.get("sampling_params", []), list):
             data["sampling_params"] = [data["sampling_params"]]
         if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
@@ -153,6 +329,18 @@ class _LLMClient(BaseModel):
             data["sampling_params"] = data["sampling_params"] * len(data["model_names"])
         return data

+    @classmethod
+    def _strip_reasoning_suffix_if_registered(
+        cls, model_name: str
+    ) -> tuple[str, Literal["low", "medium", "high"] | None]:
+        """Remove reasoning suffix only when the trimmed model already exists."""
+        for suffix, effort in cls._REASONING_SUFFIXES.items():
+            if model_name.endswith(suffix) and len(model_name) > len(suffix):
+                candidate = model_name[: -len(suffix)]
+                if candidate in registry:
+                    return candidate, effort
+        return model_name, None
+
     @model_validator(mode="after")
     def validate_client(self) -> Self:
         if isinstance(self.model_names, str):
@@ -171,6 +359,11 @@ class _LLMClient(BaseModel):
         # normalize weights
         self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+        # background mode only allowed for responses api
+        if self.background:
+            assert (
+                self.use_responses_api
+            ), "background mode only allowed for responses api"
         # Auto-generate name if not provided
         if self.name is None:
             if len(self.model_names) == 1:
@@ -256,13 +449,6 @@ class _LLMClient(BaseModel):
             # Idle wait before next capacity check. Aim for ~RPM spacing.
             await asyncio.sleep(max(60.0 / self.max_requests_per_minute, 0.01))

-    async def _execute_request(self, context: RequestContext) -> APIResponse:
-        """Create and send a single API request using the provided context."""
-        model_obj = APIModel.from_registry(context.model_name)
-        request = model_obj.make_request(context)
-        response = await request.execute_once()
-        return response
-
     async def process_single_request(
         self, context: RequestContext, retry_queue: asyncio.Queue | None = None
     ) -> APIResponse:
@@ -290,7 +476,9 @@ class _LLMClient(BaseModel):
         # Execute single request
         assert context.status_tracker
         context.status_tracker.update_pbar()
-        response = await self._execute_request(context)
+        model_obj = APIModel.from_registry(context.model_name)
+        request = model_obj.make_request(context)
+        response = await request.execute_once()

         # Handle successful response
         if not response.is_error:
@@ -350,44 +538,46 @@ class _LLMClient(BaseModel):
     @overload
     async def process_prompts_async(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[True],
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-        use_responses_api: bool = ...,
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[str | None]: ...

     @overload
     async def process_prompts_async(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[False] = ...,
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-        use_responses_api: bool = ...,
-    ) -> list[APIResponse | None]: ...
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
+    ) -> list[APIResponse]: ...

     async def process_prompts_async(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress: bool = True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
-    ) -> list[APIResponse | None] | list[str | None] | dict[str, int]:
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> list[APIResponse] | list[str | None] | dict[str, int]:
         """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.

         This implementation creates all tasks upfront and waits for them to complete,
         avoiding issues with tracker state accumulating across multiple calls.
         """
         # Convert prompts to Conversations
-        prompts = prompts_to_conversations(prompts)
+        if not isinstance(prompts, list):
+            prompts = prompts = cast(Sequence[Prompt], [prompts])
+        prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))

         # Ensure tracker exists (start_nowait will call add_to_total for each task)
         if self._tracker is None:
@@ -398,13 +588,14 @@ class _LLMClient(BaseModel):

         # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
         task_ids = []
+        assert isinstance(prompts, Sequence)
         for prompt in prompts:
             assert isinstance(prompt, Conversation)
             task_id = self.start_nowait(
                 prompt,
                 tools=tools,
                 cache=cache,
-                use_responses_api=use_responses_api,
+                service_tier=service_tier,
             )
             task_ids.append(task_id)

@@ -443,13 +634,12 @@ class _LLMClient(BaseModel):

     def process_prompts_sync(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress=True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
     ):
         return asyncio.run(
             self.process_prompts_async(
@@ -458,7 +648,6 @@ class _LLMClient(BaseModel):
                 show_progress=show_progress,
                 tools=tools,
                 cache=cache,
-                use_responses_api=use_responses_api,
             )
         )

@@ -478,18 +667,18 @@ class _LLMClient(BaseModel):

     def start_nowait(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> int:
         tracker = self._get_tracker()
         task_id = self._next_task_id
         self._next_task_id += 1
         model, sampling_params = self._select_model()
-        if isinstance(prompt, str):
-            prompt = Conversation.user(prompt)
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         context = RequestContext(
             task_id=task_id,
             model_name=model,
@@ -500,7 +689,9 @@ class _LLMClient(BaseModel):
             status_tracker=tracker,
             tools=tools,
             cache=cache,
-            use_responses_api=use_responses_api,
+            use_responses_api=self.use_responses_api,
+            background=self.background,
+            service_tier=service_tier,
             extra_headers=self.extra_headers,
             force_local_mcp=self.force_local_mcp,
         )
@@ -511,33 +702,45 @@ class _LLMClient(BaseModel):

     async def start(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
-    ) -> APIResponse | None:
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> APIResponse:
         task_id = self.start_nowait(
-            prompt, tools=tools, cache=cache, use_responses_api=use_responses_api
+            prompt, tools=tools, cache=cache, service_tier=service_tier
         )
         return await self.wait_for(task_id)

-    async def wait_for(self, task_id: int) -> APIResponse | None:
+    async def wait_for(self, task_id: int) -> APIResponse:
         task = self._tasks.get(task_id)
         if task:
             return await task
-        return self._results.get(task_id)
+        res = self._results.get(task_id)
+        if res:
+            return res
+        else:
+            return APIResponse(
+                id=-1,
+                model_internal="",
+                prompt=Conversation([]),
+                sampling_params=SamplingParams(),
+                status_code=500,
+                is_error=True,
+                error_message="Task not found",
+            )

     async def wait_for_all(
         self, task_ids: Sequence[int] | None = None
-    ) -> list[APIResponse | None]:
+    ) -> list[APIResponse]:
         if task_ids is None:
             task_ids = list(self._tasks.keys())
         return [await self.wait_for(tid) for tid in task_ids]

     async def as_completed(
         self, task_ids: Sequence[int] | None = None
-    ) -> AsyncGenerator[tuple[int, APIResponse | None], None]:
+    ) -> AsyncGenerator[tuple[int, APIResponse], None]:
         """Yield ``(task_id, result)`` pairs as tasks complete.

         Args:
@@ -561,7 +764,9 @@ class _LLMClient(BaseModel):
         for task in list(tasks_map.keys()):
             if task.done():
                 tid = tasks_map.pop(task)
-                yield tid, self._results.get(tid, await task)
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

         while tasks_map:
             done, _ = await asyncio.wait(
@@ -569,16 +774,18 @@ class _LLMClient(BaseModel):
             )
             for task in done:
                 tid = tasks_map.pop(task)
-                yield tid, self._results.get(tid, await task)
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

     async def stream(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         tools: list[Tool | dict | MCPServer] | None = None,
     ):
         model, sampling_params = self._select_model()
-        if isinstance(prompt, str):
-            prompt = Conversation.user(prompt)
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         async for item in stream_chat(
             model, prompt, sampling_params, tools, None, self.extra_headers
         ):
@@ -592,7 +799,7 @@ class _LLMClient(BaseModel):

     async def run_agent_loop(
         self,
-        conversation: str | Conversation,
+        conversation: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
@@ -605,8 +812,9 @@ class _LLMClient(BaseModel):
         instances or built‑in tool dictionaries.
         """

-        if isinstance(conversation, str):
-            conversation = Conversation.user(conversation)
+        if not isinstance(conversation, Conversation):
+            conversation = prompts_to_conversations([conversation])[0]
+        assert isinstance(conversation, Conversation)

         # Expand MCPServer objects to their constituent tools for tool execution
         expanded_tools: list[Tool] = []
@@ -618,23 +826,20 @@ class _LLMClient(BaseModel):
                 mcp_tools = await tool.to_tools()
                 expanded_tools.extend(mcp_tools)

-        last_response: APIResponse | None = None
+        response: APIResponse | None = None

         for _ in range(max_rounds):
-            responses = await self.process_prompts_async(
-                [conversation],
+            response = await self.start(
+                conversation,
                 tools=tools,  # type: ignore
-                return_completions_only=False,
-                show_progress=show_progress,
             )

-            last_response = responses[0]
-            if last_response is None or last_response.content is None:
+            if response is None or response.content is None:
                 break

-            conversation = conversation.with_message(last_response.content)
+            conversation = conversation.with_message(response.content)

-            tool_calls = last_response.content.tool_calls
+            tool_calls = response.content.tool_calls
             if not tool_calls:
                 break

@@ -657,16 +862,16 @@ class _LLMClient(BaseModel):
                 if not isinstance(result, (str, dict, list)):
                     result = str(result)

-                conversation.add_tool_result(call.id, result)  # type: ignore
+                conversation.with_tool_result(call.id, result)  # type: ignore

-        if last_response is None:
+        if response is None:
             raise RuntimeError("model did not return a response")

-        return conversation, last_response
+        return conversation, response

     def run_agent_loop_sync(
         self,
-        conversation: str | Conversation,
+        conversation: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
@@ -685,7 +890,7 @@ class _LLMClient(BaseModel):

     async def submit_batch_job(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         tools: list[Tool] | None = None,
         cache: CachePattern | None = None,
@@ -747,11 +952,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -774,11 +981,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -800,11 +1009,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -838,6 +1049,8 @@ def LLMClient(
         request_timeout=request_timeout,
         cache=cache,
         extra_headers=extra_headers,
+        use_responses_api=use_responses_api,
+        background=background,
         temperature=temperature,
         top_p=top_p,
         json_mode=json_mode,
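
For orientation, here is a minimal usage sketch of the client surface added in this diff: the "openrouter:" prefix and reasoning-effort suffixes normalized in fix_lists/_update_models, the new use_responses_api and background fields, and the per-call service_tier argument on process_prompts_async. The sketch is inferred from the signatures shown above rather than from the package's documentation; the import path, how the model list is passed to the LLMClient factory, and the model names and prompt are illustrative assumptions.

import asyncio

# Assumes LLMClient is re-exported from the package root (the __init__.py changes
# listed above suggest this); adjust the import if it is not.
from lm_deluge import LLMClient


async def main():
    # "gpt-5-high" is trimmed to "gpt-5" with reasoning_effort="high" only if
    # "gpt-5" is already a registered model id (an assumption here); the
    # "openrouter:" slug is auto-registered against the OpenRouter API base.
    client = LLMClient(
        ["gpt-5-high", "openrouter:openrouter/andromeda-alpha"],
        use_responses_api=True,  # background=True would also require this flag
        max_new_tokens=512,
    )

    # service_tier replaces the old per-call use_responses_api argument.
    completions = await client.process_prompts_async(
        ["Summarize this package diff in one sentence."],
        return_completions_only=True,
        service_tier="flex",
    )
    print(completions[0])


if __name__ == "__main__":
    asyncio.run(main())
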
lm_deluge/config.py CHANGED
@@ -8,7 +8,7 @@ class SamplingParams(BaseModel):
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", "none", None] = None
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
     logprobs: bool = False
     top_logprobs: int | None = None
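
The only change here widens the reasoning_effort literal to accept "minimal". A tiny sketch of what that permits; the import path is assumed from the file name shown above.

# Sketch: constructing SamplingParams with the newly allowed "minimal" effort.
from lm_deluge.config import SamplingParams

# "minimal" was previously rejected by the Literal type; it now validates.
sp = SamplingParams(max_new_tokens=256, reasoning_effort="minimal")
print(sp.reasoning_effort)  # -> minimal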