lm-deluge 0.0.56-py3-none-any.whl → 0.0.69-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (38)
  1. lm_deluge/__init__.py +12 -1
  2. lm_deluge/api_requests/anthropic.py +12 -1
  3. lm_deluge/api_requests/base.py +87 -5
  4. lm_deluge/api_requests/bedrock.py +3 -4
  5. lm_deluge/api_requests/chat_reasoning.py +4 -0
  6. lm_deluge/api_requests/gemini.py +7 -6
  7. lm_deluge/api_requests/mistral.py +8 -9
  8. lm_deluge/api_requests/openai.py +179 -124
  9. lm_deluge/batches.py +25 -9
  10. lm_deluge/client.py +280 -67
  11. lm_deluge/config.py +1 -1
  12. lm_deluge/file.py +382 -13
  13. lm_deluge/mock_openai.py +482 -0
  14. lm_deluge/models/__init__.py +12 -8
  15. lm_deluge/models/anthropic.py +12 -20
  16. lm_deluge/models/bedrock.py +0 -14
  17. lm_deluge/models/cohere.py +0 -16
  18. lm_deluge/models/google.py +0 -20
  19. lm_deluge/models/grok.py +48 -4
  20. lm_deluge/models/groq.py +2 -2
  21. lm_deluge/models/kimi.py +34 -0
  22. lm_deluge/models/meta.py +0 -8
  23. lm_deluge/models/minimax.py +10 -0
  24. lm_deluge/models/openai.py +28 -34
  25. lm_deluge/models/openrouter.py +64 -1
  26. lm_deluge/models/together.py +0 -16
  27. lm_deluge/prompt.py +138 -29
  28. lm_deluge/request_context.py +9 -11
  29. lm_deluge/tool.py +395 -19
  30. lm_deluge/tracker.py +11 -5
  31. lm_deluge/warnings.py +46 -0
  32. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/METADATA +3 -1
  33. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/RECORD +36 -33
  34. lm_deluge/agent.py +0 -0
  35. lm_deluge/gemini_limits.py +0 -65
  36. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/WHEEL +0 -0
  37. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/licenses/LICENSE +0 -0
  38. {lm_deluge-0.0.56.dist-info → lm_deluge-0.0.69.dist-info}/top_level.txt +0 -0
lm_deluge/client.py CHANGED
@@ -1,5 +1,15 @@
 import asyncio
-from typing import Any, AsyncGenerator, Callable, Literal, Self, Sequence, overload
+from typing import (
+    Any,
+    AsyncGenerator,
+    Callable,
+    ClassVar,
+    Literal,
+    Self,
+    Sequence,
+    cast,
+    overload,
+)

 import numpy as np
 import yaml
@@ -12,12 +22,17 @@ from lm_deluge.batches import (
     submit_batches_oa,
     wait_for_batch_completion_async,
 )
-from lm_deluge.prompt import CachePattern, Conversation, prompts_to_conversations
+from lm_deluge.prompt import (
+    CachePattern,
+    Conversation,
+    Prompt,
+    prompts_to_conversations,
+)
 from lm_deluge.tool import MCPServer, Tool

 from .api_requests.base import APIResponse
 from .config import SamplingParams
-from .models import APIModel, registry
+from .models import APIModel, register_model, registry
 from .request_context import RequestContext
 from .tracker import StatusTracker

@@ -29,6 +44,12 @@ class _LLMClient(BaseModel):
     Keeps all validation, serialization, and existing functionality.
     """

+    _REASONING_SUFFIXES: ClassVar[dict[str, Literal["low", "medium", "high"]]] = {
+        "-low": "low",
+        "-medium": "medium",
+        "-high": "high",
+    }
+
     model_names: str | list[str] = ["gpt-4.1-mini"]
     name: str | None = None
     max_requests_per_minute: int = 1_000
@@ -40,13 +61,16 @@ class _LLMClient(BaseModel):
     request_timeout: int = 30
     cache: Any = None
     extra_headers: dict[str, str] | None = None
+    extra_body: dict[str, str] | None = None
+    use_responses_api: bool = False
+    background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
     temperature: float = 0.75
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", None] = None
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
     logprobs: bool = False
     top_logprobs: int | None = None
     force_local_mcp: bool = False
@@ -100,13 +124,112 @@ class _LLMClient(BaseModel):

     # NEW! Builder methods
     def with_model(self, model: str):
-        self.model_names = [model]
+        self._update_models([model])
         return self

     def with_models(self, models: list[str]):
-        self.model_names = models
+        self._update_models(models)
         return self

+    def _update_models(self, models: list[str]) -> None:
+        normalized, per_model_efforts = self._normalize_model_names(models)
+        if self.reasoning_effort is None:
+            unique_efforts = {eff for eff in per_model_efforts if eff is not None}
+            if len(normalized) == 1 and per_model_efforts[0] is not None:
+                self.reasoning_effort = per_model_efforts[0]
+            elif (
+                len(unique_efforts) == 1
+                and len(unique_efforts) != 0
+                and None not in per_model_efforts
+            ):
+                self.reasoning_effort = next(iter(unique_efforts))  # type: ignore
+        self.model_names = normalized
+        self._align_sampling_params(per_model_efforts)
+        self._reset_model_weights()
+
+    def _normalize_model_names(
+        self, models: list[str]
+    ) -> tuple[list[str], list[Literal["low", "medium", "high"] | None]]:
+        normalized: list[str] = []
+        efforts: list[Literal["low", "medium", "high"] | None] = []
+
+        for name in models:
+            base_name = self._preprocess_openrouter_model(name)
+            trimmed_name, effort = self.__class__._strip_reasoning_suffix_if_registered(
+                base_name
+            )
+            normalized.append(trimmed_name)
+            efforts.append(effort)
+
+        return normalized, efforts
+
+    def _align_sampling_params(
+        self, per_model_efforts: list[Literal["low", "medium", "high"] | None]
+    ) -> None:
+        if len(per_model_efforts) < len(self.model_names):
+            per_model_efforts = per_model_efforts + [None] * (
+                len(self.model_names) - len(per_model_efforts)
+            )
+
+        if not self.model_names:
+            self.sampling_params = []
+            return
+
+        if not self.sampling_params:
+            self.sampling_params = []
+
+        if len(self.sampling_params) == 0:
+            for _ in self.model_names:
+                self.sampling_params.append(
+                    SamplingParams(
+                        temperature=self.temperature,
+                        top_p=self.top_p,
+                        json_mode=self.json_mode,
+                        max_new_tokens=self.max_new_tokens,
+                        reasoning_effort=self.reasoning_effort,
+                        logprobs=self.logprobs,
+                        top_logprobs=self.top_logprobs,
+                    )
+                )
+        elif len(self.sampling_params) == 1 and len(self.model_names) > 1:
+            base_param = self.sampling_params[0]
+            self.sampling_params = [
+                base_param.model_copy(deep=True) for _ in self.model_names
+            ]
+        elif len(self.sampling_params) != len(self.model_names):
+            base_param = self.sampling_params[0]
+            self.sampling_params = [
+                base_param.model_copy(deep=True) for _ in self.model_names
+            ]
+
+        if self.reasoning_effort is not None:
+            for sp in self.sampling_params:
+                sp.reasoning_effort = self.reasoning_effort
+        else:
+            for sp, effort in zip(self.sampling_params, per_model_efforts):
+                if effort is not None:
+                    sp.reasoning_effort = effort
+
+    def _reset_model_weights(self) -> None:
+        if not self.model_names:
+            self.model_weights = []
+            return
+
+        if isinstance(self.model_weights, list):
+            if len(self.model_weights) == len(self.model_names) and any(
+                self.model_weights
+            ):
+                total = sum(self.model_weights)
+                if total == 0:
+                    self.model_weights = [
+                        1 / len(self.model_names) for _ in self.model_names
+                    ]
+                else:
+                    self.model_weights = [w / total for w in self.model_weights]
+                return
+        # Fallback to uniform distribution
+        self.model_weights = [1 / len(self.model_names) for _ in self.model_names]
+
     def with_limits(
         self,
         max_requests_per_minute: int | None = None,
@@ -130,11 +253,64 @@ class _LLMClient(BaseModel):
     def models(self):
         return self.model_names  # why? idk

+    @staticmethod
+    def _preprocess_openrouter_model(model_name: str) -> str:
+        """Process openrouter: prefix and register model if needed."""
+        if model_name.startswith("openrouter:"):
+            slug = model_name.split(":", 1)[1]  # Everything after "openrouter:"
+            # Create a unique id by replacing slashes with hyphens
+            model_id = f"openrouter-{slug.replace('/', '-')}"
+
+            # Register the model if not already in registry
+            if model_id not in registry:
+                register_model(
+                    id=model_id,
+                    name=slug,  # The full slug sent to OpenRouter API (e.g., "openrouter/andromeda-alpha")
+                    api_base="https://openrouter.ai/api/v1",
+                    api_key_env_var="OPENROUTER_API_KEY",
+                    api_spec="openai",
+                    supports_json=True,
+                    supports_logprobs=False,
+                    supports_responses=False,
+                    input_cost=0,  # Unknown costs for generic models
+                    cached_input_cost=0,
+                    cache_write_cost=0,
+                    output_cost=0,
+                )
+
+            return model_id
+        return model_name
+
     @model_validator(mode="before")
     @classmethod
     def fix_lists(cls, data) -> "_LLMClient":
-        if isinstance(data.get("model_names"), str):
-            data["model_names"] = [data["model_names"]]
+        # Process model_names - handle both strings and lists
+        model_names = data.get("model_names")
+
+        if isinstance(model_names, str):
+            # Single model as string
+            # First, handle OpenRouter prefix
+            model_name = cls._preprocess_openrouter_model(model_names)
+
+            # Then handle reasoning effort suffix (e.g., "gpt-5-high")
+            model_name, effort = cls._strip_reasoning_suffix_if_registered(model_name)
+            if effort and data.get("reasoning_effort") is None:
+                data["reasoning_effort"] = effort
+
+            data["model_names"] = [model_name]
+
+        elif isinstance(model_names, list):
+            # List of models - process each one
+            processed_models = []
+            for model_name in model_names:
+                # Handle OpenRouter prefix for each model
+                processed_model = cls._preprocess_openrouter_model(model_name)
+                processed_model, _ = cls._strip_reasoning_suffix_if_registered(
+                    processed_model
+                )
+                processed_models.append(processed_model)
+            data["model_names"] = processed_models
+
         if not isinstance(data.get("sampling_params", []), list):
             data["sampling_params"] = [data["sampling_params"]]
         if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
@@ -153,6 +329,18 @@ class _LLMClient(BaseModel):
             data["sampling_params"] = data["sampling_params"] * len(data["model_names"])
         return data

+    @classmethod
+    def _strip_reasoning_suffix_if_registered(
+        cls, model_name: str
+    ) -> tuple[str, Literal["low", "medium", "high"] | None]:
+        """Remove reasoning suffix only when the trimmed model already exists."""
+        for suffix, effort in cls._REASONING_SUFFIXES.items():
+            if model_name.endswith(suffix) and len(model_name) > len(suffix):
+                candidate = model_name[: -len(suffix)]
+                if candidate in registry:
+                    return candidate, effort
+        return model_name, None
+
     @model_validator(mode="after")
     def validate_client(self) -> Self:
         if isinstance(self.model_names, str):
@@ -171,6 +359,11 @@ class _LLMClient(BaseModel):
         # normalize weights
         self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+        # background mode only allowed for responses api
+        if self.background:
+            assert (
+                self.use_responses_api
+            ), "background mode only allowed for responses api"
         # Auto-generate name if not provided
         if self.name is None:
             if len(self.model_names) == 1:
@@ -256,13 +449,6 @@ class _LLMClient(BaseModel):
             # Idle wait before next capacity check. Aim for ~RPM spacing.
             await asyncio.sleep(max(60.0 / self.max_requests_per_minute, 0.01))

-    async def _execute_request(self, context: RequestContext) -> APIResponse:
-        """Create and send a single API request using the provided context."""
-        model_obj = APIModel.from_registry(context.model_name)
-        request = model_obj.make_request(context)
-        response = await request.execute_once()
-        return response
-
     async def process_single_request(
         self, context: RequestContext, retry_queue: asyncio.Queue | None = None
     ) -> APIResponse:
@@ -290,7 +476,9 @@ class _LLMClient(BaseModel):
         # Execute single request
         assert context.status_tracker
         context.status_tracker.update_pbar()
-        response = await self._execute_request(context)
+        model_obj = APIModel.from_registry(context.model_name)
+        request = model_obj.make_request(context)
+        response = await request.execute_once()

         # Handle successful response
         if not response.is_error:
@@ -350,44 +538,46 @@ class _LLMClient(BaseModel):
     @overload
     async def process_prompts_async(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[True],
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-        use_responses_api: bool = ...,
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[str | None]: ...

     @overload
     async def process_prompts_async(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[False] = ...,
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-        use_responses_api: bool = ...,
-    ) -> list[APIResponse | None]: ...
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
+    ) -> list[APIResponse]: ...

     async def process_prompts_async(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress: bool = True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
-    ) -> list[APIResponse | None] | list[str | None] | dict[str, int]:
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> list[APIResponse] | list[str | None] | dict[str, int]:
         """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.

         This implementation creates all tasks upfront and waits for them to complete,
         avoiding issues with tracker state accumulating across multiple calls.
         """
         # Convert prompts to Conversations
-        prompts = prompts_to_conversations(prompts)
+        if not isinstance(prompts, list):
+            prompts = prompts = cast(Sequence[Prompt], [prompts])
+        prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))

         # Ensure tracker exists (start_nowait will call add_to_total for each task)
         if self._tracker is None:
@@ -398,13 +588,14 @@ class _LLMClient(BaseModel):

         # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
         task_ids = []
+        assert isinstance(prompts, Sequence)
         for prompt in prompts:
             assert isinstance(prompt, Conversation)
             task_id = self.start_nowait(
                 prompt,
                 tools=tools,
                 cache=cache,
-                use_responses_api=use_responses_api,
+                service_tier=service_tier,
             )
             task_ids.append(task_id)

@@ -443,13 +634,12 @@ class _LLMClient(BaseModel):

     def process_prompts_sync(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress=True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
     ):
         return asyncio.run(
             self.process_prompts_async(
@@ -458,7 +648,6 @@ class _LLMClient(BaseModel):
                 show_progress=show_progress,
                 tools=tools,
                 cache=cache,
-                use_responses_api=use_responses_api,
             )
         )

@@ -478,18 +667,18 @@ class _LLMClient(BaseModel):

     def start_nowait(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> int:
         tracker = self._get_tracker()
         task_id = self._next_task_id
         self._next_task_id += 1
         model, sampling_params = self._select_model()
-        if isinstance(prompt, str):
-            prompt = Conversation.user(prompt)
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         context = RequestContext(
             task_id=task_id,
             model_name=model,
@@ -500,7 +689,9 @@ class _LLMClient(BaseModel):
             status_tracker=tracker,
             tools=tools,
             cache=cache,
-            use_responses_api=use_responses_api,
+            use_responses_api=self.use_responses_api,
+            background=self.background,
+            service_tier=service_tier,
             extra_headers=self.extra_headers,
             force_local_mcp=self.force_local_mcp,
         )
@@ -511,33 +702,45 @@ class _LLMClient(BaseModel):

     async def start(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
-    ) -> APIResponse | None:
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> APIResponse:
         task_id = self.start_nowait(
-            prompt, tools=tools, cache=cache, use_responses_api=use_responses_api
+            prompt, tools=tools, cache=cache, service_tier=service_tier
         )
         return await self.wait_for(task_id)

-    async def wait_for(self, task_id: int) -> APIResponse | None:
+    async def wait_for(self, task_id: int) -> APIResponse:
         task = self._tasks.get(task_id)
         if task:
             return await task
-        return self._results.get(task_id)
+        res = self._results.get(task_id)
+        if res:
+            return res
+        else:
+            return APIResponse(
+                id=-1,
+                model_internal="",
+                prompt=Conversation([]),
+                sampling_params=SamplingParams(),
+                status_code=500,
+                is_error=True,
+                error_message="Task not found",
+            )

     async def wait_for_all(
         self, task_ids: Sequence[int] | None = None
-    ) -> list[APIResponse | None]:
+    ) -> list[APIResponse]:
         if task_ids is None:
             task_ids = list(self._tasks.keys())
         return [await self.wait_for(tid) for tid in task_ids]

     async def as_completed(
         self, task_ids: Sequence[int] | None = None
-    ) -> AsyncGenerator[tuple[int, APIResponse | None], None]:
+    ) -> AsyncGenerator[tuple[int, APIResponse], None]:
         """Yield ``(task_id, result)`` pairs as tasks complete.

         Args:
@@ -561,7 +764,9 @@ class _LLMClient(BaseModel):
         for task in list(tasks_map.keys()):
             if task.done():
                 tid = tasks_map.pop(task)
-                yield tid, self._results.get(tid, await task)
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

         while tasks_map:
             done, _ = await asyncio.wait(
@@ -569,16 +774,18 @@ class _LLMClient(BaseModel):
             )
             for task in done:
                 tid = tasks_map.pop(task)
-                yield tid, self._results.get(tid, await task)
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

     async def stream(
         self,
-        prompt: str | Conversation,
+        prompt: Prompt,
         tools: list[Tool | dict | MCPServer] | None = None,
     ):
         model, sampling_params = self._select_model()
-        if isinstance(prompt, str):
-            prompt = Conversation.user(prompt)
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         async for item in stream_chat(
             model, prompt, sampling_params, tools, None, self.extra_headers
         ):
@@ -592,7 +799,7 @@ class _LLMClient(BaseModel):

     async def run_agent_loop(
         self,
-        conversation: str | Conversation,
+        conversation: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
@@ -605,8 +812,9 @@ class _LLMClient(BaseModel):
         instances or built‑in tool dictionaries.
         """

-        if isinstance(conversation, str):
-            conversation = Conversation.user(conversation)
+        if not isinstance(conversation, Conversation):
+            conversation = prompts_to_conversations([conversation])[0]
+        assert isinstance(conversation, Conversation)

         # Expand MCPServer objects to their constituent tools for tool execution
         expanded_tools: list[Tool] = []
@@ -618,23 +826,20 @@ class _LLMClient(BaseModel):
                 mcp_tools = await tool.to_tools()
                 expanded_tools.extend(mcp_tools)

-        last_response: APIResponse | None = None
+        response: APIResponse | None = None

         for _ in range(max_rounds):
-            responses = await self.process_prompts_async(
-                [conversation],
+            response = await self.start(
+                conversation,
                 tools=tools,  # type: ignore
-                return_completions_only=False,
-                show_progress=show_progress,
             )

-            last_response = responses[0]
-            if last_response is None or last_response.content is None:
+            if response is None or response.content is None:
                 break

-            conversation = conversation.with_message(last_response.content)
+            conversation = conversation.with_message(response.content)

-            tool_calls = last_response.content.tool_calls
+            tool_calls = response.content.tool_calls
             if not tool_calls:
                 break

@@ -657,16 +862,16 @@ class _LLMClient(BaseModel):
                 if not isinstance(result, (str, dict, list)):
                     result = str(result)

-                conversation.add_tool_result(call.id, result)  # type: ignore
+                conversation.with_tool_result(call.id, result)  # type: ignore

-        if last_response is None:
+        if response is None:
             raise RuntimeError("model did not return a response")

-        return conversation, last_response
+        return conversation, response

     def run_agent_loop_sync(
         self,
-        conversation: str | Conversation,
+        conversation: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         max_rounds: int = 5,
@@ -685,7 +890,7 @@ class _LLMClient(BaseModel):

     async def submit_batch_job(
         self,
-        prompts: Sequence[str | list[dict] | Conversation],
+        prompts: Prompt | Sequence[Prompt],
         *,
         tools: list[Tool] | None = None,
         cache: CachePattern | None = None,
@@ -747,11 +952,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -774,11 +981,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -800,11 +1009,13 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
-    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None,
     logprobs: bool = False,
     top_logprobs: int | None = None,
     force_local_mcp: bool = False,
@@ -838,6 +1049,8 @@ def LLMClient(
         request_timeout=request_timeout,
         cache=cache,
         extra_headers=extra_headers,
+        use_responses_api=use_responses_api,
+        background=background,
         temperature=temperature,
         top_p=top_p,
         json_mode=json_mode,
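
For orientation, here is a minimal usage sketch of the client surface added in this diff: the "openrouter:" prefix and reasoning-effort suffixes normalized in fix_lists/_update_models, the new use_responses_api and background fields, and the per-call service_tier argument on process_prompts_async. The sketch is inferred from the signatures shown above rather than from the package's documentation; the import path, how the model list is passed to the LLMClient factory, and the model names and prompt are illustrative assumptions.

import asyncio

# Assumes LLMClient is re-exported from the package root (the __init__.py changes
# listed above suggest this); adjust the import if it is not.
from lm_deluge import LLMClient


async def main():
    # "gpt-5-high" is trimmed to "gpt-5" with reasoning_effort="high" only if
    # "gpt-5" is already a registered model id (an assumption here); the
    # "openrouter:" slug is auto-registered against the OpenRouter API base.
    client = LLMClient(
        ["gpt-5-high", "openrouter:openrouter/andromeda-alpha"],
        use_responses_api=True,  # background=True would also require this flag
        max_new_tokens=512,
    )

    # service_tier replaces the old per-call use_responses_api argument.
    completions = await client.process_prompts_async(
        ["Summarize this package diff in one sentence."],
        return_completions_only=True,
        service_tier="flex",
    )
    print(completions[0])


if __name__ == "__main__":
    asyncio.run(main())
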
lm_deluge/config.py CHANGED
@@ -8,7 +8,7 @@ class SamplingParams(BaseModel):
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
-    reasoning_effort: Literal["low", "medium", "high", "none", None] = None
+    reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
     logprobs: bool = False
     top_logprobs: int | None = None
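
The only change here widens the reasoning_effort literal to accept "minimal". A tiny sketch of what that permits; the import path is assumed from the file name shown above.

# Sketch: constructing SamplingParams with the newly allowed "minimal" effort.
from lm_deluge.config import SamplingParams

# "minimal" was previously rejected by the Literal type; it now validates.
sp = SamplingParams(max_new_tokens=256, reasoning_effort="minimal")
print(sp.reasoning_effort)  # -> minimal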