lm-deluge 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,78 +1,46 @@
-import warnings
-from aiohttp import ClientResponse
 import json
 import os
-from typing import Callable
+import warnings
+
+from aiohttp import ClientResponse

-from .base import APIRequestBase, APIResponse
-from ..prompt import Conversation, Message, CachePattern
-from ..usage import Usage
-from ..tracker import StatusTracker
-from ..config import SamplingParams
 from ..models import APIModel
+from ..prompt import Message
+from ..request_context import RequestContext
+from ..usage import Usage
+from .base import APIRequestBase, APIResponse


 class MistralRequest(APIRequestBase):
-    def __init__(
-        self,
-        task_id: int,
-        # should always be 'role', 'content' keys.
-        # internal logic should handle translating to specific API format
-        model_name: str,  # must correspond to registry
-        prompt: Conversation,
-        attempts_left: int,
-        status_tracker: StatusTracker,
-        results_arr: list,
-        request_timeout: int = 30,
-        sampling_params: SamplingParams = SamplingParams(),
-        callback: Callable | None = None,
-        all_model_names: list[str] | None = None,
-        all_sampling_params: list[SamplingParams] | None = None,
-        tools: list | None = None,
-        cache: CachePattern | None = None,
-    ):
-        super().__init__(
-            task_id=task_id,
-            model_name=model_name,
-            prompt=prompt,
-            attempts_left=attempts_left,
-            status_tracker=status_tracker,
-            results_arr=results_arr,
-            request_timeout=request_timeout,
-            sampling_params=sampling_params,
-            callback=callback,
-            all_model_names=all_model_names,
-            all_sampling_params=all_sampling_params,
-            tools=tools,
-            cache=cache,
-        )
+    def __init__(self, context: RequestContext):
+        super().__init__(context=context)

         # Warn if cache is specified for non-Anthropic model
-        if cache is not None:
+        if self.context.cache is not None:
             warnings.warn(
-                f"Cache parameter '{cache}' is only supported for Anthropic models, ignoring for {model_name}"
+                f"Cache parameter '{self.context.cache}' is only supported for Anthropic models, ignoring for {self.context.model_name}"
             )
-        self.model = APIModel.from_registry(model_name)
+        self.model = APIModel.from_registry(self.context.model_name)
         self.url = f"{self.model.api_base}/chat/completions"
         self.request_header = {
             "Authorization": f"Bearer {os.getenv(self.model.api_key_env_var)}"
         }
         self.request_json = {
             "model": self.model.name,
-            "messages": prompt.to_mistral(),
-            "temperature": sampling_params.temperature,
-            "top_p": sampling_params.top_p,
-            "max_tokens": sampling_params.max_new_tokens,
+            "messages": self.context.prompt.to_mistral(),
+            "temperature": self.context.sampling_params.temperature,
+            "top_p": self.context.sampling_params.top_p,
+            "max_tokens": self.context.sampling_params.max_new_tokens,
         }
-        if sampling_params.reasoning_effort:
+        if self.context.sampling_params.reasoning_effort:
             warnings.warn(
-                f"Ignoring reasoning_effort param for non-reasoning model: {model_name}"
+                f"Ignoring reasoning_effort param for non-reasoning model: {self.context.model_name}"
             )
-        if sampling_params.logprobs:
+        if self.context.sampling_params.logprobs:
             warnings.warn(
-                f"Ignoring logprobs param for non-logprobs model: {model_name}"
+                f"Ignoring logprobs param for non-logprobs model: {self.context.model_name}"
             )
-        if sampling_params.json_mode and self.model.supports_json:
+        if self.context.sampling_params.json_mode and self.model.supports_json:
             self.request_json["response_format"] = {"type": "json_object"}

     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
@@ -84,6 +52,8 @@ class MistralRequest(APIRequestBase):
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)
         data = None
+        assert self.context.status_tracker
+
         if status_code >= 200 and status_code < 300:
             try:
                 data = await http_response.json()
@@ -98,7 +68,7 @@ class MistralRequest(APIRequestBase):
                 completion = data["choices"][0]["message"]["content"]
                 usage = Usage.from_mistral_usage(data["usage"])
                 if (
-                    self.sampling_params.logprobs
+                    self.context.sampling_params.logprobs
                     and "logprobs" in data["choices"][0]
                 ):
                     logprobs = data["choices"][0]["logprobs"]["content"]
@@ -118,20 +88,20 @@ class MistralRequest(APIRequestBase):
         if is_error and error_message is not None:
             if "rate limit" in error_message.lower() or status_code == 429:
                 error_message += " (Rate limit error, triggering cooldown.)"
-                self.status_tracker.rate_limit_exceeded()
+                self.context.status_tracker.rate_limit_exceeded()
             if "context length" in error_message:
                 error_message += " (Context length exceeded, set retries to 0.)"
-                self.attempts_left = 0
+                self.context.attempts_left = 0

         return APIResponse(
-            id=self.task_id,
+            id=self.context.task_id,
             status_code=status_code,
             is_error=is_error,
             error_message=error_message,
-            prompt=self.prompt,
+            prompt=self.context.prompt,
             logprobs=logprobs,
             content=Message.ai(completion),
-            model_internal=self.model_name,
-            sampling_params=self.sampling_params,
+            model_internal=self.context.model_name,
+            sampling_params=self.context.sampling_params,
             usage=usage,
         )
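
Note on the hunks above: MistralRequest's long keyword constructor is replaced by a single RequestContext argument, and every per-request field is now read from self.context. The RequestContext class itself does not appear in this diff, so the following is only an assumed usage sketch, inferred from the attributes the new code reads (task_id, model_name, prompt, sampling_params, attempts_left, status_tracker, tools, cache); the real constructor signature may differ.

    # Assumed usage sketch -- not taken from the package source.
    from lm_deluge.config import SamplingParams
    from lm_deluge.request_context import RequestContext

    context = RequestContext(
        task_id=0,
        model_name="mistral-small",        # assumed registry key
        prompt=conversation,               # an existing Conversation object
        sampling_params=SamplingParams(),
        attempts_left=3,
        status_tracker=tracker,            # an existing StatusTracker (not shown in this diff)
    )
    request = MistralRequest(context)      # builds url, headers, and request_json from the context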
@@ -1,17 +1,16 @@
 import json
 import os
 import warnings
-from typing import Callable

 import aiohttp
 from aiohttp import ClientResponse

-from lm_deluge.tool import Tool
+from lm_deluge.request_context import RequestContext
+from lm_deluge.tool import MCPServer, Tool

 from ..config import SamplingParams
 from ..models import APIModel
 from ..prompt import CachePattern, Conversation, Message, Text, Thinking, ToolCall
-from ..tracker import StatusTracker
 from ..usage import Usage
 from .base import APIRequestBase, APIResponse

@@ -53,54 +52,36 @@ def _build_oa_chat_request(
     return request_json


+def _build_oa_responses_request(
+    model: APIModel,
+    prompt: Conversation,
+    tools: list[Tool] | None,
+    sampling_params: SamplingParams,
+):
+    pass  # TODO: implement
+
+
 class OpenAIRequest(APIRequestBase):
-    def __init__(
-        self,
-        task_id: int,
-        # should always be 'role', 'content' keys.
-        # internal logic should handle translating to specific API format
-        model_name: str,  # must correspond to registry
-        prompt: Conversation,
-        attempts_left: int,
-        status_tracker: StatusTracker,
-        results_arr: list,
-        request_timeout: int = 30,
-        sampling_params: SamplingParams = SamplingParams(),
-        callback: Callable | None = None,
-        all_model_names: list[str] | None = None,
-        all_sampling_params: list[SamplingParams] | None = None,
-        tools: list | None = None,
-        cache: CachePattern | None = None,
-    ):
-        super().__init__(
-            task_id=task_id,
-            model_name=model_name,
-            prompt=prompt,
-            attempts_left=attempts_left,
-            status_tracker=status_tracker,
-            results_arr=results_arr,
-            request_timeout=request_timeout,
-            sampling_params=sampling_params,
-            callback=callback,
-            all_model_names=all_model_names,
-            all_sampling_params=all_sampling_params,
-            tools=tools,
-            cache=cache,
-        )
+    def __init__(self, context: RequestContext):
+        # Pass context to parent, which will handle backwards compatibility
+        super().__init__(context=context)

         # Warn if cache is specified for non-Anthropic model
-        if cache is not None:
+        if self.context.cache is not None:
             warnings.warn(
-                f"Cache parameter '{cache}' is only supported for Anthropic models, ignoring for {model_name}"
+                f"Cache parameter '{self.context.cache}' is only supported for Anthropic models, ignoring for {self.context.model_name}"
             )
-        self.model = APIModel.from_registry(model_name)
+        self.model = APIModel.from_registry(self.context.model_name)
         self.url = f"{self.model.api_base}/chat/completions"
         self.request_header = {
             "Authorization": f"Bearer {os.getenv(self.model.api_key_env_var)}"
         }

         self.request_json = _build_oa_chat_request(
-            self.model, prompt, tools, sampling_params
+            self.model,
+            self.context.prompt,
+            self.context.tools,
+            self.context.sampling_params,
         )

     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
@@ -114,6 +95,8 @@ class OpenAIRequest(APIRequestBase):
         mimetype = http_response.headers.get("Content-Type", None)
         data = None
         finish_reason = None
+        assert self.context.status_tracker
+
         if status_code >= 200 and status_code < 300:
             try:
                 data = await http_response.json()
@@ -156,7 +139,7 @@

                 usage = Usage.from_openai_usage(data["usage"])
                 if (
-                    self.sampling_params.logprobs
+                    self.context.sampling_params.logprobs
                     and "logprobs" in data["choices"][0]
                 ):
                     logprobs = data["choices"][0]["logprobs"]["content"]
@@ -176,22 +159,22 @@
         if is_error and error_message is not None:
             if "rate limit" in error_message.lower() or status_code == 429:
                 error_message += " (Rate limit error, triggering cooldown.)"
-                self.status_tracker.rate_limit_exceeded()
+                self.context.status_tracker.rate_limit_exceeded()
             if "context length" in error_message:
                 error_message += " (Context length exceeded, set retries to 0.)"
-                self.attempts_left = 0
+                self.context.attempts_left = 0

         return APIResponse(
-            id=self.task_id,
+            id=self.context.task_id,
             status_code=status_code,
             is_error=is_error,
             error_message=error_message,
-            prompt=self.prompt,
+            prompt=self.context.prompt,
             logprobs=logprobs,
             thinking=thinking,
             content=content,
-            model_internal=self.model_name,
-            sampling_params=self.sampling_params,
+            model_internal=self.context.model_name,
+            sampling_params=self.context.sampling_params,
             usage=usage,
             raw_response=data,
             finish_reason=finish_reason,
@@ -199,117 +182,78 @@


 class OpenAIResponsesRequest(APIRequestBase):
-    def __init__(
-        self,
-        task_id: int,
-        model_name: str,
-        prompt: Conversation,
-        attempts_left: int,
-        status_tracker: StatusTracker,
-        results_arr: list,
-        request_timeout: int = 30,
-        sampling_params: SamplingParams = SamplingParams(),
-        callback: Callable | None = None,
-        all_model_names: list[str] | None = None,
-        all_sampling_params: list[SamplingParams] | None = None,
-        tools: list | None = None,
-        cache: CachePattern | None = None,
-        computer_use: bool = False,
-        display_width: int = 1024,
-        display_height: int = 768,
-    ):
-        super().__init__(
-            task_id=task_id,
-            model_name=model_name,
-            prompt=prompt,
-            attempts_left=attempts_left,
-            status_tracker=status_tracker,
-            results_arr=results_arr,
-            request_timeout=request_timeout,
-            sampling_params=sampling_params,
-            callback=callback,
-            all_model_names=all_model_names,
-            all_sampling_params=all_sampling_params,
-            tools=tools,
-            cache=cache,
-        )
-
-        # Store computer use parameters
-        self.computer_use = computer_use
-        self.display_width = display_width
-        self.display_height = display_height
-
-        # Validate computer use requirements
-        if computer_use and model_name != "openai-computer-use-preview":
-            raise ValueError(
-                f"Computer use is only supported with openai-computer-use-preview model, got {model_name}"
-            )
-
+    def __init__(self, context: RequestContext):
+        super().__init__(context)
         # Warn if cache is specified for non-Anthropic model
-        if cache is not None:
+        if self.context.cache is not None:
             warnings.warn(
-                f"Cache parameter '{cache}' is only supported for Anthropic models, ignoring for {model_name}"
+                f"Cache parameter '{self.context.cache}' is only supported for Anthropic models, ignoring for {self.context.model_name}"
             )
-        self.model = APIModel.from_registry(model_name)
+        self.model = APIModel.from_registry(self.context.model_name)
         self.url = f"{self.model.api_base}/responses"
         self.request_header = {
             "Authorization": f"Bearer {os.getenv(self.model.api_key_env_var)}"
         }

         # Convert conversation to input format for Responses API
-        openai_responses_format = prompt.to_openai_responses()
+        openai_responses_format = self.context.prompt.to_openai_responses()

         self.request_json = {
             "model": self.model.name,
             "input": openai_responses_format["input"],
-            "temperature": sampling_params.temperature,
-            "top_p": sampling_params.top_p,
+            "temperature": self.context.sampling_params.temperature,
+            "top_p": self.context.sampling_params.top_p,
         }

         # Add max_output_tokens for responses API
-        if sampling_params.max_new_tokens:
-            self.request_json["max_output_tokens"] = sampling_params.max_new_tokens
+        if self.context.sampling_params.max_new_tokens:
+            self.request_json["max_output_tokens"] = (
+                self.context.sampling_params.max_new_tokens
+            )

         if self.model.reasoning_model:
-            if sampling_params.reasoning_effort in [None, "none"]:
+            if self.context.sampling_params.reasoning_effort in [None, "none"]:
                 # gemini models can switch reasoning off
                 if "gemini" in self.model.id:
-                    self.sampling_params.reasoning_effort = "none"  # expects string
+                    self.context.sampling_params.reasoning_effort = (
+                        "none"  # expects string
+                    )
                 # openai models can only go down to "low"
                 else:
-                    self.sampling_params.reasoning_effort = "low"
+                    self.context.sampling_params.reasoning_effort = "low"
             self.request_json["temperature"] = 1.0
             self.request_json["top_p"] = 1.0
             self.request_json["reasoning"] = {
-                "effort": sampling_params.reasoning_effort
+                "effort": self.context.sampling_params.reasoning_effort
             }
         else:
-            if sampling_params.reasoning_effort:
+            if self.context.sampling_params.reasoning_effort:
                 warnings.warn(
-                    f"Ignoring reasoning_effort param for non-reasoning model: {model_name}"
+                    f"Ignoring reasoning_effort param for non-reasoning model: {self.context.model_name}"
                 )

-        if sampling_params.json_mode and self.model.supports_json:
+        if self.context.sampling_params.json_mode and self.model.supports_json:
             self.request_json["text"] = {"format": {"type": "json_object"}}

         # Handle tools
         request_tools = []
-        if computer_use:
-            # Add computer use tool
-            request_tools.append(
-                {
-                    "type": "computer_use_preview",
-                    "display_width": display_width,
-                    "display_height": display_height,
-                    "environment": "browser",  # Default to browser, could be configurable
-                }
-            )
-            # Set truncation to auto as required for computer use
-            self.request_json["truncation"] = "auto"
-
-        if tools:
+        if self.context.tools:
             # Add regular function tools
-            request_tools.extend([tool.dump_for("openai-responses") for tool in tools])
+            for tool in self.context.tools:
+                if isinstance(tool, Tool):
+                    request_tools.append(tool.dump_for("openai-responses"))
+                elif isinstance(tool, dict):
+                    # if computer use, make sure model supports it
+                    if tool["type"] == "computer_use_preview":
+                        if self.context.model_name != "openai-computer-use-preview":
+                            raise ValueError(
+                                f"model {self.context.model_name} does not support computer use"
+                            )
+                        # have to use truncation
+                        self.request_json["truncation"] = "auto"
+                    request_tools.append(tool)  # allow passing dict
+                elif isinstance(tool, MCPServer):
+                    request_tools.append(tool.for_openai_responses())

         if request_tools:
             self.request_json["tools"] = request_tools
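
Note on the constructor above: the computer_use, display_width, and display_height parameters are gone; context.tools may now mix Tool objects, raw dicts (passed through as-is), and MCPServer objects. A computer-use request is therefore expressed as a plain dict supplied by the caller. The dict shape below mirrors the one the removed code used to build; the RequestContext keyword arguments are assumed, as in the earlier sketch.

    # Assumed usage sketch -- the tool dict mirrors what the old constructor built.
    computer_tool = {
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "browser",
    }
    context = RequestContext(
        task_id=1,
        model_name="openai-computer-use-preview",  # any other model raises ValueError
        prompt=conversation,                       # an existing Conversation object
        sampling_params=SamplingParams(),
        tools=[computer_tool],                     # Tool, dict, and MCPServer entries may be mixed
    )
    request = OpenAIResponsesRequest(context)      # also sets request_json["truncation"] = "auto"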
@@ -324,6 +268,7 @@ class OpenAIResponsesRequest(APIRequestBase):
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)
         data = None
+        assert self.context.status_tracker

         if status_code >= 200 and status_code < 300:
             try:
@@ -352,26 +297,83 @@
                         for content_item in message_content:
                             if content_item.get("type") == "output_text":
                                 parts.append(Text(content_item["text"]))
-                            # Handle tool calls if present
-                            elif content_item.get("type") == "tool_call":
-                                tool_call = content_item["tool_call"]
-                                parts.append(
-                                    ToolCall(
-                                        id=tool_call["id"],
-                                        name=tool_call["function"]["name"],
-                                        arguments=json.loads(
-                                            tool_call["function"]["arguments"]
-                                        ),
-                                    )
-                                )
+                            elif content_item.get("type") == "refusal":
+                                parts.append(Text(content_item["refusal"]))
+                    elif item.get("type") == "reasoning":
+                        parts.append(Thinking(item["summary"]["text"]))
+                    elif item.get("type") == "function_call":
+                        parts.append(
+                            ToolCall(
+                                id=item["call_id"],
+                                name=item["name"],
+                                arguments=json.loads(item["arguments"]),
+                            )
+                        )
+                    elif item.get("type") == "mcp_call":
+                        parts.append(
+                            ToolCall(
+                                id=item["id"],
+                                name=item["name"],
+                                arguments=json.loads(item["arguments"]),
+                                built_in=True,
+                                built_in_type="mcp_call",
+                                extra_body={
+                                    "server_label": item["server_label"],
+                                    "error": item.get("error"),
+                                    "output": item.get("output"),
+                                },
+                            )
+                        )
+
                     elif item.get("type") == "computer_call":
-                        # Handle computer use actions
-                        action = item.get("action", {})
                         parts.append(
                             ToolCall(
                                 id=item["call_id"],
-                                name=f"_computer_{action.get('type', 'action')}",
-                                arguments=action,
+                                name="computer_call",
+                                arguments=item.get("action"),
+                                built_in=True,
+                                built_in_type="computer_call",
+                            )
+                        )
+
+                    elif item.get("type") == "web_search_call":
+                        parts.append(
+                            ToolCall(
+                                id=item["id"],
+                                name="web_search_call",
+                                arguments={},
+                                built_in=True,
+                                built_in_type="web_search_call",
+                                extra_body={"status": item["status"]},
+                            )
+                        )
+
+                    elif item.get("type") == "file_search_call":
+                        parts.append(
+                            ToolCall(
+                                id=item["id"],
+                                name="file_search_call",
+                                arguments={"queries": item["queries"]},
+                                built_in=True,
+                                built_in_type="file_search_call",
+                                extra_body={
+                                    "status": item["status"],
+                                    "results": item["results"],
+                                },
+                            )
+                        )
+                    elif item.get("type") == "image_generation_call":
+                        parts.append(
+                            ToolCall(
+                                id=item["id"],
+                                name="image_generation_call",
+                                arguments={},
+                                built_in=True,
+                                built_in_type="image_generation_call",
+                                extra_body={
+                                    "status": item["status"],
+                                    "result": item["result"],
+                                },
                             )
                         )

@@ -386,9 +388,6 @@
                 if "usage" in data:
                     usage = Usage.from_openai_usage(data["usage"])

-                # Extract response_id for computer use continuation
-                # response_id = data.get("id")
-
             except Exception as e:
                 is_error = True
                 error_message = f"Error parsing {self.model.name} responses API response: {str(e)}"
@@ -406,22 +405,22 @@
         if is_error and error_message is not None:
             if "rate limit" in error_message.lower() or status_code == 429:
                 error_message += " (Rate limit error, triggering cooldown.)"
-                self.status_tracker.rate_limit_exceeded()
+                self.context.status_tracker.rate_limit_exceeded()
             if "context length" in error_message:
                 error_message += " (Context length exceeded, set retries to 0.)"
-                self.attempts_left = 0
+                self.context.attempts_left = 0

         return APIResponse(
-            id=self.task_id,
+            id=self.context.task_id,
             status_code=status_code,
             is_error=is_error,
             error_message=error_message,
-            prompt=self.prompt,
+            prompt=self.context.prompt,
             logprobs=logprobs,
             thinking=thinking,
             content=content,
-            model_internal=self.model_name,
-            sampling_params=self.sampling_params,
+            model_internal=self.context.model_name,
+            sampling_params=self.context.sampling_params,
             usage=usage,
             raw_response=data,
         )
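
Note on the handle_response changes above: built-in Responses API output items (reasoning, MCP calls, computer use, web search, file search, image generation) are now surfaced as ToolCall parts flagged with built_in=True instead of being renamed or dropped. Downstream code could separate them from ordinary function calls roughly as follows; the attribute names on the returned message are assumed, not shown in this diff.

    # Sketch under assumed attribute names; APIResponse.content is the assistant Message.
    from lm_deluge.prompt import ToolCall

    for part in response.content.parts:                      # "parts" is assumed
        if isinstance(part, ToolCall) and part.built_in:
            print(part.built_in_type, part.extra_body)       # e.g. "web_search_call", {"status": ...}
        elif isinstance(part, ToolCall):
            handle_function_call(part.name, part.arguments)  # hypothetical handler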
@@ -35,7 +35,8 @@ class APIResponse:
     logprobs: list | None = None
     finish_reason: str | None = None  # make required later
     cost: float | None = None  # calculated automatically
-    cache_hit: bool = False  # manually set if true
+    cache_hit: bool = False  # manually set if true (provider-side caching)
+    local_cache_hit: bool = False  # set if hit our local dynamic cache
     # set to true if is_error and should be retried with a different model
     retry_with_different_model: bool | None = False
     # set to true if should NOT retry with the same model (unrecoverable error)
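
The new local_cache_hit flag lets callers tell responses served from the library's local dynamic cache apart from provider-side prompt-cache hits, which continue to use cache_hit. A minimal check might look like this:

    # Sketch: the two flags are independent and both default to False.
    if response.local_cache_hit:
        ...  # served from the local dynamic cache; no API call was made
    elif response.cache_hit:
        ...  # the provider reported a prompt-cache hit for this API call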
lm_deluge/batches.py CHANGED
@@ -218,7 +218,7 @@ async def submit_batches_anthropic(
     batch_tasks = []
     async with aiohttp.ClientSession() as session:
         for batch in batches:
-            url = f"{registry[model]['api_base']}/messages/batches"
+            url = f"{registry[model].api_base}/messages/batches"
             data = {"requests": batch}

             async def submit_batch(data, url, headers):
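
The batches.py change switches registry[model] from dict-style lookup to attribute access, which suggests registry entries are now objects exposing api_base as an attribute rather than plain dicts. A hedged illustration (the registry key is hypothetical):

    entry = registry["claude-3.5-sonnet"]        # hypothetical key
    url = f"{entry.api_base}/messages/batches"   # was: entry["api_base"]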