lm-deluge 0.0.33__tar.gz → 0.0.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lm-deluge might be problematic.

Files changed (63)
  1. {lm_deluge-0.0.33/src/lm_deluge.egg-info → lm_deluge-0.0.35}/PKG-INFO +1 -1
  2. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/pyproject.toml +1 -1
  3. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/anthropic.py +3 -3
  4. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/gemini.py +6 -5
  5. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/openai.py +15 -4
  6. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/client.py +70 -80
  7. lm_deluge-0.0.33/src/lm_deluge/models.py → lm_deluge-0.0.35/src/lm_deluge/models/__init__.py +89 -4
  8. lm_deluge-0.0.35/src/lm_deluge/util/harmony.py +45 -0
  9. {lm_deluge-0.0.33 → lm_deluge-0.0.35/src/lm_deluge.egg-info}/PKG-INFO +1 -1
  10. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/SOURCES.txt +2 -1
  11. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/LICENSE +0 -0
  12. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/README.md +0 -0
  13. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/setup.cfg +0 -0
  14. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/__init__.py +0 -0
  15. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/agent.py +0 -0
  16. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/__init__.py +0 -0
  17. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/base.py +0 -0
  18. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/bedrock.py +0 -0
  19. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/common.py +0 -0
  20. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  21. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  22. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  23. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  24. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  25. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/mistral.py +0 -0
  26. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/response.py +0 -0
  27. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/batches.py +0 -0
  28. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
  29. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
  30. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
  31. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
  32. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/base.py +0 -0
  33. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/openai.py +0 -0
  34. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/cache.py +0 -0
  35. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/config.py +0 -0
  36. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/embed.py +0 -0
  37. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/errors.py +0 -0
  38. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/file.py +0 -0
  39. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/gemini_limits.py +0 -0
  40. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/image.py +0 -0
  41. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/__init__.py +0 -0
  42. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/classify.py +0 -0
  43. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/extract.py +0 -0
  44. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/locate.py +0 -0
  45. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/ocr.py +0 -0
  46. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/score.py +0 -0
  47. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/translate.py +0 -0
  48. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/prompt.py +0 -0
  49. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/request_context.py +0 -0
  50. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/rerank.py +0 -0
  51. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/tool.py +0 -0
  52. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/tracker.py +0 -0
  53. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/usage.py +0 -0
  54. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/json.py +0 -0
  55. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/logprobs.py +0 -0
  56. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/spatial.py +0 -0
  57. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/validation.py +0 -0
  58. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/xml.py +0 -0
  59. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  60. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/requires.txt +0 -0
  61. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/top_level.txt +0 -0
  62. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/tests/test_builtin_tools.py +0 -0
  63. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/tests/test_native_mcp_server.py +0 -0
PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.33
+Version: 0.0.35
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
pyproject.toml:
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "lm_deluge"
-version = "0.0.33"
+version = "0.0.35"
 authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
 description = "Python utility for using LLM API models."
 readme = "README.md"
src/lm_deluge/api_requests/anthropic.py:
@@ -57,9 +57,9 @@ def _build_anthropic_request(
     # handle thinking
     if model.reasoning_model and sampling_params.reasoning_effort:
         # translate reasoning effort of low, medium, high to budget tokens
-        budget = {"low": 1024, "medium": 4096, "high": 16384}.get(
-            sampling_params.reasoning_effort
-        )
+        budget = {
+            "minimal": 256, "low": 1024, "medium": 4096, "high": 16384
+        }.get(sampling_params.reasoning_effort)
         request_json["thinking"] = {
             "type": "enabled",
             "budget_tokens": budget,
src/lm_deluge/api_requests/gemini.py:
@@ -1,7 +1,7 @@
 import json
 import os
 import warnings
-
+from typing import Any
 from aiohttp import ClientResponse
 
 from lm_deluge.request_context import RequestContext
@@ -37,15 +37,16 @@ async def _build_gemini_request(
 
     # Handle reasoning models (thinking)
     if model.reasoning_model:
-        thinking_config = None
+        thinking_config: dict[str, Any] | None = None
         effort = sampling_params.reasoning_effort
         if effort is None or effort == "none":
+            budget = 128 if "2.5-pro" in model.id else 0
             # Explicitly disable thoughts when no effort is requested
-            thinking_config = {"includeThoughts": False, "thinkingBudget": 0}
+            thinking_config = {"includeThoughts": False, "thinkingBudget": budget}
         else:
             thinking_config = {"includeThoughts": True}
-            if effort in {"low", "medium", "high"} and "flash" in model.id:
-                budget = {"low": 1024, "medium": 4096, "high": 16384}[effort]
+            if effort in {"minimal", "low", "medium", "high"} and "flash" in model.id:
+                budget = {"minimal": 256, "low": 1024, "medium": 4096, "high": 16384}[effort]
                 thinking_config["thinkingBudget"] = budget
         request_json["generationConfig"]["thinkingConfig"] = thinking_config
 
src/lm_deluge/api_requests/openai.py:
@@ -42,8 +42,13 @@ async def _build_oa_chat_request(
             # Disable reasoning for Gemini models when no effort requested
             if "gemini" in model.id:
                 effort = "none"
+            elif "gpt-5" in model.id:
+                effort = "minimal"
             else:
                 effort = "low"
+        if effort == "minimal" and "gpt-5" not in model.id:
+            print("WARNING: 'minimal' reasoning effort only allowed for gpt-5. setting to 'low'.")
+            effort = "low"
         request_json["reasoning_effort"] = effort
     else:
        if sampling_params.reasoning_effort:
@@ -122,15 +127,21 @@ class OpenAIRequest(APIRequestBase):
            message = data["choices"][0]["message"]
            finish_reason = data["choices"][0]["finish_reason"]
 
-            # Add text content if present
-            if message.get("content"):
-                parts.append(Text(message["content"]))
-
            # Add thinking content if present (reasoning models)
            if "reasoning_content" in message:
                thinking = message["reasoning_content"]
                parts.append(Thinking(thinking))
 
+            # Together AI returns reasoning in a "reasoning"
+            # field which is not correct but whatever
+            if message.get("reasoning"):
+                thinking = message["reasoning"]
+                parts.append(Thinking(thinking))
+
+            # Add text content if present
+            if message.get("content"):
+                parts.append(Text(message["content"]))
+
            # Add tool calls if present
            if "tool_calls" in message:
                for tool_call in message["tool_calls"]:
src/lm_deluge/client.py:
@@ -1,6 +1,6 @@
 import asyncio
 import random
-from typing import Any, Literal, Self, Sequence, overload
+from typing import Any, Literal, Self, Sequence, Callable, overload
 
 import numpy as np
 import yaml
@@ -22,8 +22,6 @@ from .models import APIModel, registry
 from .request_context import RequestContext
 from .tracker import StatusTracker
 
-
-# TODO: get completions as they finish, not all at once at the end.
 # TODO: add optional max_input_tokens to client so we can reject long prompts to prevent abuse
 class _LLMClient(BaseModel):
     """
@@ -55,6 +53,9 @@ class _LLMClient(BaseModel):
     # Progress configuration
     progress: Literal["rich", "tqdm", "manual"] = "rich"
 
+    # Postprocessing - run on every APIResponse
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+
     # Internal state for async task handling
     _next_task_id: int = PrivateAttr(default=0)
     _tasks: dict[int, asyncio.Task] = PrivateAttr(default_factory=dict)
@@ -196,14 +197,6 @@ class _LLMClient(BaseModel):
         config_dict = yaml.safe_load(open(file_path))
         return cls.from_dict(config_dict)
 
-    @classmethod
-    def basic(cls, model: str | list[str], **kwargs):
-        """
-        Doesn't do anything differently now, kept for backwards compat.
-        """
-        kwargs["model_names"] = model
-        return cls(**kwargs)
-
     def _select_model(self):
         assert isinstance(self.model_weights, list)
         model_idx = np.random.choice(range(len(self.models)), p=self.model_weights)
@@ -254,13 +247,18 @@ class _LLMClient(BaseModel):
     ) -> APIResponse:
         """Handle caching and single HTTP call for a request. Failed requests go to retry queue."""
         # Check cache first
+        def _maybe_postprocess(response: APIResponse):
+            if self.postprocess:
+                return self.postprocess(response)
+            return response
+
         if self.cache:
             cached = self.cache.get(context.prompt)
             if cached:
                 cached.local_cache_hit = True
                 if context.status_tracker:
                     context.status_tracker.task_succeeded(context.task_id)
-                return cached
+                return _maybe_postprocess(cached)
 
         # Execute single request
         assert context.status_tracker
@@ -275,7 +273,7 @@ class _LLMClient(BaseModel):
                self.cache.put(context.prompt, response)
            # Call callback if provided
            context.maybe_callback(response, context.status_tracker)
-            return response
+            return _maybe_postprocess(response)
 
        # Handle error response - add to retry queue if available
        if retry_queue and context.attempts_left > 1:
@@ -303,7 +301,7 @@ class _LLMClient(BaseModel):
 
            # Add to retry queue for later processing
            await retry_queue.put(retry_context)
-            return response  # Return the error response for now
+            return _maybe_postprocess(response)  # Return the error response for now
 
        # No retries left or no retry queue - final failure
        context.status_tracker.task_failed(context.task_id)
@@ -316,7 +314,7 @@ class _LLMClient(BaseModel):
            error_msg += f" Message: {response.error_message}. Giving up."
        print(error_msg)
 
-        return response
+        return _maybe_postprocess(response)
 
    @overload
    async def process_prompts_async(
@@ -570,6 +568,8 @@ class _LLMClient(BaseModel):
                print(item, end="", flush=True)
            else:
                # final item
+                if self.postprocess:
+                    return self.postprocess(item)
                return item
 
    async def run_agent_loop(
@@ -712,71 +712,59 @@ class _LLMClient(BaseModel):
            batch_ids, provider, poll_interval=30
        )
 
+# factory function -- allows positional model names,
+# keeps pydantic validation, without sacrificing IDE support
+@overload
+def LLMClient(
+    model_names: str,
+    *,
+    max_requests_per_minute: int = 1_000,
+    max_tokens_per_minute: int = 100_000,
+    max_concurrent_requests: int = 225,
+    sampling_params: list[SamplingParams] | None = None,
+    model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+    max_attempts: int = 5,
+    request_timeout: int = 30,
+    cache: Any = None,
+    extra_headers: dict[str, str] | None = None,
+    temperature: float = 0.75,
+    top_p: float = 1.0,
+    json_mode: bool = False,
+    max_new_tokens: int = 512,
+    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    logprobs: bool = False,
+    top_logprobs: int | None = None,
+    force_local_mcp: bool = False,
+    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+) -> _LLMClient: ...
+
 
-# def api_prompts_dry_run(
-#     ids: np.ndarray | list[int],
-#     prompts: list[Conversation],
-#     models: str | list[str],
-#     model_weights: list[float],
-#     sampling_params: list[SamplingParams],
-#     max_tokens_per_minute: int = 500_000,
-#     max_requests_per_minute: int = 1_000,
-# ):
-#     """
-#     Count tokens and estimate costs for a batch of prompts.
-#     """
-#     results = []
-#     for i, prompt in zip(ids, prompts):
-#         # choose a model
-#         model_idx = np.random.choice(range(len(models)), p=model_weights)
-#         model = models[model_idx]
-
-#         # dry run
-#         input_tokens, output_tokens, min_cost, max_cost = prompt.dry_run(
-#             model, sampling_params[model_idx].max_new_tokens
-#         )
-#         results.append(
-#             {
-#                 "id": i,
-#                 "input_tokens": input_tokens,
-#                 "output_tokens": output_tokens,
-#                 "min_cost": min_cost,
-#                 "max_cost": max_cost,
-#             }
-#         )
-
-#     combined_results: dict[str, Any] = {
-#         "total_input_tokens": sum([r["input_tokens"] for r in results]),
-#         "total_output_tokens": sum([r["output_tokens"] for r in results]),
-#         "total_min_cost": sum([r["min_cost"] for r in results]),
-#         "total_max_cost": sum([r["max_cost"] for r in results]),
-#     }
-#     minimum_time_tpm = combined_results["total_input_tokens"] / max_tokens_per_minute
-#     maximum_time_tpm = (
-#         combined_results["total_input_tokens"] + combined_results["total_output_tokens"]
-#     ) / max_tokens_per_minute
-#     minimum_time_rpm = len(prompts) / max_requests_per_minute
-
-#     combined_results["minimum_time"] = max(minimum_time_tpm, minimum_time_rpm)
-#     combined_results["maximum_time"] = max(maximum_time_tpm, minimum_time_rpm)
-#     limiting_factor = None
-#     if minimum_time_rpm > maximum_time_tpm:
-#         limiting_factor = "requests"
-#     elif minimum_time_rpm < minimum_time_tpm:
-#         limiting_factor = "tokens"
-#     else:
-#         limiting_factor = "depends"
-#     combined_results["limiting_factor"] = limiting_factor
-
-#     return combined_results
-
-
-# Clean factory function with perfect IDE support
 @overload
-def LLMClient(model_names: str, **kwargs) -> _LLMClient: ...
+def LLMClient(
+    model_names: list[str],
+    *,
+    max_requests_per_minute: int = 1_000,
+    max_tokens_per_minute: int = 100_000,
+    max_concurrent_requests: int = 225,
+    sampling_params: list[SamplingParams] | None = None,
+    model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+    max_attempts: int = 5,
+    request_timeout: int = 30,
+    cache: Any = None,
+    extra_headers: dict[str, str] | None = None,
+    temperature: float = 0.75,
+    top_p: float = 1.0,
+    json_mode: bool = False,
+    max_new_tokens: int = 512,
+    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    logprobs: bool = False,
+    top_logprobs: int | None = None,
+    force_local_mcp: bool = False,
+    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+) -> _LLMClient: ...
 
-@overload
-def LLMClient(model_names: list[str], **kwargs) -> _LLMClient: ...
 
 def LLMClient(
     model_names: str | list[str] = "gpt-4.1-mini",
@@ -799,21 +787,22 @@ def LLMClient(
    top_logprobs: int | None = None,
    force_local_mcp: bool = False,
    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
 ) -> _LLMClient:
    """
    Create an LLMClient with model_names as a positional argument.
-
+
    Args:
        model_names: Model name(s) to use - can be a single string or list of strings
        **kwargs: All other LLMClient configuration options (keyword-only)
-
+
    Returns:
        Configured LLMClient instance
    """
    # Handle default for mutable argument
    if sampling_params is None:
        sampling_params = []
-
+
    # Simply pass everything to the Pydantic constructor
    return _LLMClient(
        model_names=model_names,
@@ -835,4 +824,5 @@ def LLMClient(
        top_logprobs=top_logprobs,
        force_local_mcp=force_local_mcp,
        progress=progress,
+        postprocess=postprocess
    )
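For context, a minimal sketch (not from the package docs) of wiring the new postprocess hook through the factory above, assuming the import paths shown elsewhere in this diff (LLMClient lives in lm_deluge/client.py, postprocess_harmony in the new lm_deluge/util/harmony.py):

from lm_deluge.client import LLMClient
from lm_deluge.util.harmony import postprocess_harmony

# hypothetical usage: route a raw-harmony model through the new hook so every
# APIResponse is split into Thinking/Text parts before it is returned; per the
# _maybe_postprocess helper it applies to cache hits, successes, and error returns
client = LLMClient(
    "gpt-oss-120b-together",          # Together-hosted model added in this release
    max_new_tokens=512,
    postprocess=postprocess_harmony,
)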
src/lm_deluge/models/__init__.py (moved from src/lm_deluge/models.py):
@@ -3,7 +3,7 @@ from __future__ import annotations
 import random
 from dataclasses import dataclass, field
 
-from .request_context import RequestContext
+from ..request_context import RequestContext
 
 BUILTIN_MODELS = {
     # `7MMM. ,MMF' mm
@@ -267,6 +267,62 @@ BUILTIN_MODELS = {
     # ░███
     # █████
     # ░░░░░
+    "gpt-5": {
+        "id": "gpt-5",
+        "name": "gpt-5",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 1.25,
+        "cached_input_cost": 0.125,
+        "output_cost": 10.0,
+        "reasoning_model": True,
+    },
+    "gpt-5-chat": {
+        "id": "gpt-5-chat",
+        "name": "gpt-5-chat-latest",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 1.25,
+        "cached_input_cost": 0.125,
+        "output_cost": 10.0,
+        "reasoning_model": False,
+    },
+    "gpt-5-mini": {
+        "id": "gpt-5-mini",
+        "name": "gpt-5-mini",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 0.25,
+        "cached_input_cost": 0.025,
+        "output_cost": 2.0,
+        "reasoning_model": True,
+    },
+    "gpt-5-nano": {
+        "id": "gpt-5-nano",
+        "name": "gpt-5-nano",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 0.05,
+        "cached_input_cost": 0.005,
+        "output_cost": 0.40,
+        "reasoning_model": True,
+    },
     "openai-computer-use-preview": {
         "id": "openai-computer-use-preview",
         "name": "computer-use-preview",
@@ -971,6 +1027,32 @@ BUILTIN_MODELS = {
         "requests_per_minute": None,
         "tokens_per_minute": None,
     },
+    "gpt-oss-120b-together": {
+        "id": "gpt-oss-120b-together",
+        "name": "openai/gpt-oss-120b",
+        "api_base": "https://api.together.xyz/v1",
+        "api_key_env_var": "TOGETHER_API_KEY",
+        "supports_json": False,
+        "api_spec": "openai",
+        "input_cost": 0.18,
+        "output_cost": 0.59,
+        "requests_per_minute": None,
+        "tokens_per_minute": None,
+        "reasoning_model": True
+    },
+    "gpt-oss-20b-together": {
+        "id": "gpt-oss-20b-together",
+        "name": "openai/gpt-oss-20b",
+        "api_base": "https://api.together.xyz/v1",
+        "api_key_env_var": "TOGETHER_API_KEY",
+        "supports_json": False,
+        "api_spec": "openai",
+        "input_cost": 0.18,
+        "output_cost": 0.59,
+        "requests_per_minute": None,
+        "tokens_per_minute": None,
+        "reasoning_model": True
+    },
     # █████████ █████
     # ███░░░░░███ ░░███
     # ███ ░░░ ██████ ░███████ ██████ ████████ ██████
@@ -1210,6 +1292,7 @@ class APIModel:
    api_base: str
    api_key_env_var: str
    api_spec: str
+    cached_input_cost: float | None = 0
    input_cost: float | None = 0  # $ per million input tokens
    output_cost: float | None = 0  # $ per million output tokens
    supports_json: bool = False
@@ -1242,7 +1325,7 @@ class APIModel:
        random.sample(regions, 1, counts=weights)[0]
 
    def make_request(self, context: RequestContext):  # -> "APIRequestBase"
-        from .api_requests.common import CLASSES
+        from ..api_requests.common import CLASSES
 
        api_spec = self.api_spec
        if (
 
@@ -1268,6 +1351,7 @@ def register_model(
    api_key_env_var: str,
    api_spec: str,
    input_cost: float | None = 0,  # $ per million input tokens
+    cached_input_cost: float | None = 0,
    output_cost: float | None = 0,  # $ per million output tokens
    supports_json: bool = False,
    supports_logprobs: bool = False,
@@ -1275,7 +1359,7 @@ def register_model(
    reasoning_model: bool = False,
    regions: list[str] | dict[str, int] = field(default_factory=list),
    tokens_per_minute: int | None = None,
-    requests_per_minute: int | None = None
+    requests_per_minute: int | None = None,
 ) -> APIModel:
    """Register a model configuration and return the created APIModel."""
    model = APIModel(
@@ -1284,6 +1368,7 @@ def register_model(
        api_base=api_base,
        api_key_env_var=api_key_env_var,
        api_spec=api_spec,
+        cached_input_cost=cached_input_cost,
        input_cost=input_cost,
        output_cost=output_cost,
        supports_json=supports_json,
@@ -1292,7 +1377,7 @@ def register_model(
        reasoning_model=reasoning_model,
        regions=regions,
        tokens_per_minute=tokens_per_minute,
-        requests_per_minute=requests_per_minute
+        requests_per_minute=requests_per_minute,
    )
    registry[model.id] = model
    return model
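A hypothetical registration (names and costs invented) showing the new cached_input_cost parameter; the leading id, name, and api_base keyword names are assumed from the APIModel fields above, since they fall outside this hunk:

from lm_deluge.models import register_model

register_model(
    id="my-gpt-5-mini-proxy",               # invented example id
    name="gpt-5-mini",
    api_base="https://example-proxy.invalid/v1",
    api_key_env_var="EXAMPLE_PROXY_API_KEY",
    api_spec="openai",
    input_cost=0.25,
    cached_input_cost=0.025,                 # new in 0.0.35, $ per million cached input tokens
    output_cost=2.0,
    reasoning_model=True,
)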
src/lm_deluge/util/harmony.py (new file):
@@ -0,0 +1,45 @@
+# sample thing we'd want to parse from llama.cpp
+# the goal here is: barebones inference implementation returns
+# raw harmony string; we parse into content blocks
+
+# implied: <|start|>assistant
+# <|channel|>analysis<|message|>We need to respond as a helpful assistant. The user says "who are you and what do you want with my family?" This is a normal question. We should answer that we are ChatGPT, an AI language model, and we don't want anything with their family. We reassure them.<|start|>assistant<|channel|>final<|message|>I’m ChatGPT, a large language‑model AI created by OpenAI. I don’t have personal intentions or desires, and I’m not able to interact with anyone outside of this chat. My only goal here is to provide information, answer questions, and help you with whatever you need—nothing more, nothing less. If you have any concerns or need help with something specific, just let me know!
+#
+import copy
+from lm_deluge.api_requests.response import APIResponse
+from lm_deluge.prompt import Text, Thinking
+
+SAMPLE_INPUT = '''
+<|channel|>analysis<|message|>We need to respond as a helpful assistant. The user says "who are you and what do you want with my family?" This is a normal question. We should answer that we are ChatGPT, an AI language model, and we don't want anything with their family. We reassure them.<|start|>assistant<|channel|>final<|message|>I’m ChatGPT, a large language‑model AI created by OpenAI. I don’t have personal intentions or desires, and I’m not able to interact with anyone outside of this chat. My only goal here is to provide information, answer questions, and help you with whatever you need—nothing more, nothing less. If you have any concerns or need help with something specific, just let me know!
+'''.strip()
+
+def _split_messages(response: str):
+    raw_messages = response.split("<|start|>")
+    messages = []
+    for msg in raw_messages:
+        channel, content = msg.split("<|message|>")
+        channel = channel.split("<|channel|>")[1]
+        messages.append((channel, content))
+
+    return messages
+
+def postprocess_harmony(response: APIResponse) -> APIResponse:
+    if not response.content:
+        return response
+
+    parts = response.content.parts
+    assert len(parts) == 1, "expected 1 parts to convert harmony"
+    text = parts[0].text  # type: ignore
+    messages = _split_messages(text)
+
+    new_parts = []
+    for channel, content in messages:
+        if channel == "analysis":
+            new_parts.append(Thinking(content=content))
+        elif channel == "final":
+            new_parts.append(Text(text=content))
+
+    new_response = copy.deepcopy(response)
+    new_response.content.parts = new_parts  # type: ignore
+
+    return new_response
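A quick check of the splitting logic above, using the module's own SAMPLE_INPUT; _split_messages is private, so this sketch is for illustration only:

from lm_deluge.util.harmony import SAMPLE_INPUT, _split_messages

for channel, content in _split_messages(SAMPLE_INPUT):
    print(channel, "->", content[:60])
# expected channels, in order: "analysis" (chain of thought), then "final" (user-facing text)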
src/lm_deluge.egg-info/PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.33
+Version: 0.0.35
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
src/lm_deluge.egg-info/SOURCES.txt:
@@ -12,7 +12,6 @@ src/lm_deluge/errors.py
 src/lm_deluge/file.py
 src/lm_deluge/gemini_limits.py
 src/lm_deluge/image.py
-src/lm_deluge/models.py
 src/lm_deluge/prompt.py
 src/lm_deluge/request_context.py
 src/lm_deluge/rerank.py
@@ -51,6 +50,8 @@ src/lm_deluge/llm_tools/locate.py
 src/lm_deluge/llm_tools/ocr.py
 src/lm_deluge/llm_tools/score.py
 src/lm_deluge/llm_tools/translate.py
+src/lm_deluge/models/__init__.py
+src/lm_deluge/util/harmony.py
 src/lm_deluge/util/json.py
 src/lm_deluge/util/logprobs.py
 src/lm_deluge/util/spatial.py