lm-deluge 0.0.33__tar.gz → 0.0.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lm_deluge-0.0.33/src/lm_deluge.egg-info → lm_deluge-0.0.35}/PKG-INFO +1 -1
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/pyproject.toml +1 -1
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/anthropic.py +3 -3
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/gemini.py +6 -5
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/openai.py +15 -4
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/client.py +70 -80
- lm_deluge-0.0.33/src/lm_deluge/models.py → lm_deluge-0.0.35/src/lm_deluge/models/__init__.py +89 -4
- lm_deluge-0.0.35/src/lm_deluge/util/harmony.py +45 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35/src/lm_deluge.egg-info}/PKG-INFO +1 -1
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/SOURCES.txt +2 -1
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/LICENSE +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/README.md +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/setup.cfg +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/__init__.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/agent.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/base.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/bedrock.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/common.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/mistral.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/response.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/batches.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/base.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/openai.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/config.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/file.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/gemini_limits.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/classify.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/extract.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/locate.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/ocr.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/prompt.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/request_context.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/tool.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/tracker.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/usage.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/spatial.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/requires.txt +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/tests/test_builtin_tools.py +0 -0
- {lm_deluge-0.0.33 → lm_deluge-0.0.35}/tests/test_native_mcp_server.py +0 -0
{lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/anthropic.py
CHANGED

@@ -57,9 +57,9 @@ def _build_anthropic_request(
     # handle thinking
     if model.reasoning_model and sampling_params.reasoning_effort:
         # translate reasoning effort of low, medium, high to budget tokens
-        budget = {
-
-        )
+        budget = {
+            "minimal": 256, "low": 1024, "medium": 4096, "high": 16384
+        }.get(sampling_params.reasoning_effort)
         request_json["thinking"] = {
             "type": "enabled",
             "budget_tokens": budget,
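For orientation, here is the new effort-to-budget mapping pulled out of `_build_anthropic_request` as a standalone sketch. The helper name and simplified return shape are illustrative; only the budget numbers and the resulting thinking block come from the hunk above.

```python
# Standalone sketch of the mapping added above (helper name is illustrative).
def anthropic_thinking_block(reasoning_effort: str | None) -> dict | None:
    budget = {"minimal": 256, "low": 1024, "medium": 4096, "high": 16384}.get(
        reasoning_effort
    )
    if budget is None:
        return None
    # mirrors request_json["thinking"] in the diff
    return {"type": "enabled", "budget_tokens": budget}

assert anthropic_thinking_block("minimal") == {"type": "enabled", "budget_tokens": 256}
assert anthropic_thinking_block(None) is None
```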
{lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/gemini.py
CHANGED

@@ -1,7 +1,7 @@
 import json
 import os
 import warnings
-
+from typing import Any
 from aiohttp import ClientResponse
 
 from lm_deluge.request_context import RequestContext
@@ -37,15 +37,16 @@ async def _build_gemini_request(
 
     # Handle reasoning models (thinking)
     if model.reasoning_model:
-        thinking_config = None
+        thinking_config: dict[str, Any] | None = None
         effort = sampling_params.reasoning_effort
         if effort is None or effort == "none":
+            budget = 128 if "2.5-pro" in model.id else 0
             # Explicitly disable thoughts when no effort is requested
-            thinking_config = {"includeThoughts": False, "thinkingBudget":
+            thinking_config = {"includeThoughts": False, "thinkingBudget": budget}
         else:
             thinking_config = {"includeThoughts": True}
-            if effort in {"low", "medium", "high"} and "flash" in model.id:
-                budget = {"low": 1024, "medium": 4096, "high": 16384}[effort]
+            if effort in {"minimal", "low", "medium", "high"} and "flash" in model.id:
+                budget = {"minimal": 256, "low": 1024, "medium": 4096, "high": 16384}[effort]
                 thinking_config["thinkingBudget"] = budget
         request_json["generationConfig"]["thinkingConfig"] = thinking_config
 
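The Gemini branch is easiest to read as a pure function over (model id, effort). The sketch below restates the logic from the hunk; the function name and the example model ids are illustrative, while the budgets and the 128-token floor for 2.5-pro come from the diff.

```python
from typing import Any

# Restatement of the thinking logic above as a pure function (name is illustrative).
def gemini_thinking_config(model_id: str, effort: str | None) -> dict[str, Any]:
    if effort is None or effort == "none":
        # 2.5-pro keeps a small non-zero budget even when thoughts are disabled
        budget = 128 if "2.5-pro" in model_id else 0
        return {"includeThoughts": False, "thinkingBudget": budget}
    config: dict[str, Any] = {"includeThoughts": True}
    if effort in {"minimal", "low", "medium", "high"} and "flash" in model_id:
        config["thinkingBudget"] = {
            "minimal": 256, "low": 1024, "medium": 4096, "high": 16384
        }[effort]
    return config

print(gemini_thinking_config("gemini-2.5-pro", None))       # thinkingBudget: 128
print(gemini_thinking_config("gemini-2.5-flash", "medium")) # thinkingBudget: 4096
```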
{lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/openai.py
CHANGED

@@ -42,8 +42,13 @@ async def _build_oa_chat_request(
             # Disable reasoning for Gemini models when no effort requested
             if "gemini" in model.id:
                 effort = "none"
+            elif "gpt-5" in model.id:
+                effort = "minimal"
             else:
                 effort = "low"
+        if effort == "minimal" and "gpt-5" not in model.id:
+            print("WARNING: 'minimal' reasoning effort only allowed for gpt-5. setting to 'low'.")
+            effort = "low"
         request_json["reasoning_effort"] = effort
     else:
         if sampling_params.reasoning_effort:
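Condensed, the new defaulting rule for the Chat Completions path looks like the sketch below. The enclosing "no effort requested" condition is inferred from the comment in the hunk, and the warning print is omitted.

```python
# Sketch of the effort defaulting above (the real code lives in _build_oa_chat_request).
def chat_reasoning_effort(model_id: str, requested: str | None) -> str:
    effort = requested
    if effort is None:
        if "gemini" in model_id:
            effort = "none"     # disable reasoning for Gemini via OpenAI-compatible APIs
        elif "gpt-5" in model_id:
            effort = "minimal"  # new gpt-5 default
        else:
            effort = "low"
    if effort == "minimal" and "gpt-5" not in model_id:
        effort = "low"          # 'minimal' is only accepted by gpt-5 models
    return effort

assert chat_reasoning_effort("gpt-5-nano", None) == "minimal"
assert chat_reasoning_effort("o3-mini", "minimal") == "low"
```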
@@ -122,15 +127,21 @@ class OpenAIRequest(APIRequestBase):
         message = data["choices"][0]["message"]
         finish_reason = data["choices"][0]["finish_reason"]
 
-        # Add text content if present
-        if message.get("content"):
-            parts.append(Text(message["content"]))
-
         # Add thinking content if present (reasoning models)
         if "reasoning_content" in message:
             thinking = message["reasoning_content"]
             parts.append(Thinking(thinking))
 
+        # Together AI returns reasoning in a "reasoning"
+        # field which is not correct but whatever
+        if message.get("reasoning"):
+            thinking = message["reasoning"]
+            parts.append(Thinking(thinking))
+
+        # Add text content if present
+        if message.get("content"):
+            parts.append(Text(message["content"]))
+
         # Add tool calls if present
         if "tool_calls" in message:
             for tool_call in message["tool_calls"]:
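The reordering matters because Together AI puts chain-of-thought in a nonstandard "reasoning" key. Below is a toy version of the new parsing order, with plain dicts standing in for lm_deluge's Thinking/Text parts.

```python
# Toy restatement of the new parsing order; dicts stand in for Thinking/Text parts.
def parts_from_chat_message(message: dict) -> list[dict]:
    parts: list[dict] = []
    if "reasoning_content" in message:        # standard reasoning field
        parts.append({"thinking": message["reasoning_content"]})
    if message.get("reasoning"):              # Together AI's field name
        parts.append({"thinking": message["reasoning"]})
    if message.get("content"):                # plain text comes last
        parts.append({"text": message["content"]})
    return parts

msg = {"reasoning": "scratchpad...", "content": "final answer"}
print(parts_from_chat_message(msg))  # thinking part first, then the text part
```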
{lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/client.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 import random
-from typing import Any, Literal, Self, Sequence, overload
+from typing import Any, Literal, Self, Sequence, Callable, overload
 
 import numpy as np
 import yaml
@@ -22,8 +22,6 @@ from .models import APIModel, registry
 from .request_context import RequestContext
 from .tracker import StatusTracker
 
-
-# TODO: get completions as they finish, not all at once at the end.
 # TODO: add optional max_input_tokens to client so we can reject long prompts to prevent abuse
 class _LLMClient(BaseModel):
     """
@@ -55,6 +53,9 @@ class _LLMClient(BaseModel):
     # Progress configuration
     progress: Literal["rich", "tqdm", "manual"] = "rich"
 
+    # Postprocessing - run on every APIResponse
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+
     # Internal state for async task handling
     _next_task_id: int = PrivateAttr(default=0)
     _tasks: dict[int, asyncio.Task] = PrivateAttr(default_factory=dict)
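A minimal sketch of how the new hook is meant to be used; the package-root import of LLMClient is assumed, and the hook body is a placeholder.

```python
from lm_deluge import LLMClient                          # import path assumed
from lm_deluge.api_requests.response import APIResponse

def log_and_pass_through(resp: APIResponse) -> APIResponse:
    # runs on every APIResponse (cache hits, successes, and error responses)
    print("postprocessing response:", type(resp).__name__)
    return resp

client = LLMClient("gpt-4.1-mini", postprocess=log_and_pass_through)
```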
@@ -196,14 +197,6 @@ class _LLMClient(BaseModel):
         config_dict = yaml.safe_load(open(file_path))
         return cls.from_dict(config_dict)
 
-    @classmethod
-    def basic(cls, model: str | list[str], **kwargs):
-        """
-        Doesn't do anything differently now, kept for backwards compat.
-        """
-        kwargs["model_names"] = model
-        return cls(**kwargs)
-
     def _select_model(self):
         assert isinstance(self.model_weights, list)
         model_idx = np.random.choice(range(len(self.models)), p=self.model_weights)
@@ -254,13 +247,18 @@ class _LLMClient(BaseModel):
     ) -> APIResponse:
         """Handle caching and single HTTP call for a request. Failed requests go to retry queue."""
         # Check cache first
+        def _maybe_postprocess(response: APIResponse):
+            if self.postprocess:
+                return self.postprocess(response)
+            return response
+
         if self.cache:
             cached = self.cache.get(context.prompt)
             if cached:
                 cached.local_cache_hit = True
                 if context.status_tracker:
                     context.status_tracker.task_succeeded(context.task_id)
-                return cached
+                return _maybe_postprocess(cached)
 
         # Execute single request
         assert context.status_tracker
@@ -275,7 +273,7 @@ class _LLMClient(BaseModel):
                 self.cache.put(context.prompt, response)
             # Call callback if provided
             context.maybe_callback(response, context.status_tracker)
-            return response
+            return _maybe_postprocess(response)
 
         # Handle error response - add to retry queue if available
         if retry_queue and context.attempts_left > 1:
@@ -303,7 +301,7 @@ class _LLMClient(BaseModel):
 
             # Add to retry queue for later processing
             await retry_queue.put(retry_context)
-            return response  # Return the error response for now
+            return _maybe_postprocess(response)  # Return the error response for now
 
         # No retries left or no retry queue - final failure
         context.status_tracker.task_failed(context.task_id)
@@ -316,7 +314,7 @@ class _LLMClient(BaseModel):
             error_msg += f" Message: {response.error_message}. Giving up."
         print(error_msg)
 
-        return response
+        return _maybe_postprocess(response)
 
     @overload
     async def process_prompts_async(
@@ -570,6 +568,8 @@ class _LLMClient(BaseModel):
                 print(item, end="", flush=True)
             else:
                 # final item
+                if self.postprocess:
+                    return self.postprocess(item)
                 return item
 
     async def run_agent_loop(
@@ -712,71 +712,59 @@
             batch_ids, provider, poll_interval=30
         )
 
+# factory function -- allows positional model names,
+# keeps pydantic validation, without sacrificing IDE support
+@overload
+def LLMClient(
+    model_names: str,
+    *,
+    max_requests_per_minute: int = 1_000,
+    max_tokens_per_minute: int = 100_000,
+    max_concurrent_requests: int = 225,
+    sampling_params: list[SamplingParams] | None = None,
+    model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+    max_attempts: int = 5,
+    request_timeout: int = 30,
+    cache: Any = None,
+    extra_headers: dict[str, str] | None = None,
+    temperature: float = 0.75,
+    top_p: float = 1.0,
+    json_mode: bool = False,
+    max_new_tokens: int = 512,
+    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    logprobs: bool = False,
+    top_logprobs: int | None = None,
+    force_local_mcp: bool = False,
+    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+) -> _LLMClient: ...
+
 
-# def api_prompts_dry_run(
-#     ids: np.ndarray | list[int],
-#     prompts: list[Conversation],
-#     models: str | list[str],
-#     model_weights: list[float],
-#     sampling_params: list[SamplingParams],
-#     max_tokens_per_minute: int = 500_000,
-#     max_requests_per_minute: int = 1_000,
-# ):
-#     """
-#     Count tokens and estimate costs for a batch of prompts.
-#     """
-#     results = []
-#     for i, prompt in zip(ids, prompts):
-#         # choose a model
-#         model_idx = np.random.choice(range(len(models)), p=model_weights)
-#         model = models[model_idx]
-
-#         # dry run
-#         input_tokens, output_tokens, min_cost, max_cost = prompt.dry_run(
-#             model, sampling_params[model_idx].max_new_tokens
-#         )
-#         results.append(
-#             {
-#                 "id": i,
-#                 "input_tokens": input_tokens,
-#                 "output_tokens": output_tokens,
-#                 "min_cost": min_cost,
-#                 "max_cost": max_cost,
-#             }
-#         )
-
-#     combined_results: dict[str, Any] = {
-#         "total_input_tokens": sum([r["input_tokens"] for r in results]),
-#         "total_output_tokens": sum([r["output_tokens"] for r in results]),
-#         "total_min_cost": sum([r["min_cost"] for r in results]),
-#         "total_max_cost": sum([r["max_cost"] for r in results]),
-#     }
-#     minimum_time_tpm = combined_results["total_input_tokens"] / max_tokens_per_minute
-#     maximum_time_tpm = (
-#         combined_results["total_input_tokens"] + combined_results["total_output_tokens"]
-#     ) / max_tokens_per_minute
-#     minimum_time_rpm = len(prompts) / max_requests_per_minute
-
-#     combined_results["minimum_time"] = max(minimum_time_tpm, minimum_time_rpm)
-#     combined_results["maximum_time"] = max(maximum_time_tpm, minimum_time_rpm)
-#     limiting_factor = None
-#     if minimum_time_rpm > maximum_time_tpm:
-#         limiting_factor = "requests"
-#     elif minimum_time_rpm < minimum_time_tpm:
-#         limiting_factor = "tokens"
-#     else:
-#         limiting_factor = "depends"
-#     combined_results["limiting_factor"] = limiting_factor
-
-#     return combined_results
-
-
-# Clean factory function with perfect IDE support
 @overload
-def LLMClient(
+def LLMClient(
+    model_names: list[str],
+    *,
+    max_requests_per_minute: int = 1_000,
+    max_tokens_per_minute: int = 100_000,
+    max_concurrent_requests: int = 225,
+    sampling_params: list[SamplingParams] | None = None,
+    model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+    max_attempts: int = 5,
+    request_timeout: int = 30,
+    cache: Any = None,
+    extra_headers: dict[str, str] | None = None,
+    temperature: float = 0.75,
+    top_p: float = 1.0,
+    json_mode: bool = False,
+    max_new_tokens: int = 512,
+    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    logprobs: bool = False,
+    top_logprobs: int | None = None,
+    force_local_mcp: bool = False,
+    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+) -> _LLMClient: ...
 
-@overload
-def LLMClient(model_names: list[str], **kwargs) -> _LLMClient: ...
 
 def LLMClient(
     model_names: str | list[str] = "gpt-4.1-mini",
|
|
|
799
787
|
top_logprobs: int | None = None,
|
|
800
788
|
force_local_mcp: bool = False,
|
|
801
789
|
progress: Literal["rich", "tqdm", "manual"] = "rich",
|
|
790
|
+
postprocess: Callable[[APIResponse], APIResponse] | None = None
|
|
802
791
|
) -> _LLMClient:
|
|
803
792
|
"""
|
|
804
793
|
Create an LLMClient with model_names as a positional argument.
|
|
805
|
-
|
|
794
|
+
|
|
806
795
|
Args:
|
|
807
796
|
model_names: Model name(s) to use - can be a single string or list of strings
|
|
808
797
|
**kwargs: All other LLMClient configuration options (keyword-only)
|
|
809
|
-
|
|
798
|
+
|
|
810
799
|
Returns:
|
|
811
800
|
Configured LLMClient instance
|
|
812
801
|
"""
|
|
813
802
|
# Handle default for mutable argument
|
|
814
803
|
if sampling_params is None:
|
|
815
804
|
sampling_params = []
|
|
816
|
-
|
|
805
|
+
|
|
817
806
|
# Simply pass everything to the Pydantic constructor
|
|
818
807
|
return _LLMClient(
|
|
819
808
|
model_names=model_names,
|
|
@@ -835,4 +824,5 @@ def LLMClient(
         top_logprobs=top_logprobs,
         force_local_mcp=force_local_mcp,
         progress=progress,
+        postprocess=postprocess
     )
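Taken together, the factory now accepts the model name(s) positionally, with explicit keyword-only overloads for str and list[str], and forwards postprocess into the pydantic model. A usage sketch (package-root import assumed):

```python
from lm_deluge import LLMClient  # import path assumed

single = LLMClient("gpt-5-mini", reasoning_effort="low", max_new_tokens=256)
multi = LLMClient(
    ["gpt-5-mini", "gpt-4.1-mini"],
    model_weights="uniform",
    postprocess=lambda resp: resp,  # no-op hook, just to show the parameter
)
```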
lm_deluge-0.0.33/src/lm_deluge/models.py → lm_deluge-0.0.35/src/lm_deluge/models/__init__.py
RENAMED

@@ -3,7 +3,7 @@ from __future__ import annotations
 import random
 from dataclasses import dataclass, field
 
-from
+from ..request_context import RequestContext
 
 BUILTIN_MODELS = {
     # `7MMM. ,MMF' mm
@@ -267,6 +267,62 @@ BUILTIN_MODELS = {
     # ░███
     # █████
     # ░░░░░
+    "gpt-5": {
+        "id": "gpt-5",
+        "name": "gpt-5",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 1.25,
+        "cached_input_cost": 0.125,
+        "output_cost": 10.0,
+        "reasoning_model": True,
+    },
+    "gpt-5-chat": {
+        "id": "gpt-5-chat",
+        "name": "gpt-5-chat-latest",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 1.25,
+        "cached_input_cost": 0.125,
+        "output_cost": 10.0,
+        "reasoning_model": False,
+    },
+    "gpt-5-mini": {
+        "id": "gpt-5-mini",
+        "name": "gpt-5-mini",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 0.25,
+        "cached_input_cost": 0.025,
+        "output_cost": 2.0,
+        "reasoning_model": True,
+    },
+    "gpt-5-nano": {
+        "id": "gpt-5-nano",
+        "name": "gpt-5-nano",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 0.05,
+        "cached_input_cost": 0.005,
+        "output_cost": 0.40,
+        "reasoning_model": True,
+    },
     "openai-computer-use-preview": {
         "id": "openai-computer-use-preview",
         "name": "computer-use-preview",
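The per-million prices above make the new cached_input_cost field easy to sanity-check; the arithmetic below assumes cached input tokens are billed at the cached rate and the remainder at the normal input rate.

```python
# Illustrative cost arithmetic using the gpt-5 prices above ($ per million tokens).
def estimate_cost(input_tokens: int, cached_tokens: int, output_tokens: int,
                  input_cost: float = 1.25, cached_input_cost: float = 0.125,
                  output_cost: float = 10.0) -> float:
    uncached = input_tokens - cached_tokens
    return (uncached * input_cost
            + cached_tokens * cached_input_cost
            + output_tokens * output_cost) / 1_000_000

# 10k-token prompt with 8k cached tokens and a 1k-token completion on gpt-5:
print(f"${estimate_cost(10_000, 8_000, 1_000):.4f}")  # $0.0135
```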
@@ -971,6 +1027,32 @@ BUILTIN_MODELS = {
         "requests_per_minute": None,
         "tokens_per_minute": None,
     },
+    "gpt-oss-120b-together": {
+        "id": "gpt-oss-120b-together",
+        "name": "openai/gpt-oss-120b",
+        "api_base": "https://api.together.xyz/v1",
+        "api_key_env_var": "TOGETHER_API_KEY",
+        "supports_json": False,
+        "api_spec": "openai",
+        "input_cost": 0.18,
+        "output_cost": 0.59,
+        "requests_per_minute": None,
+        "tokens_per_minute": None,
+        "reasoning_model": True
+    },
+    "gpt-oss-20b-together": {
+        "id": "gpt-oss-20b-together",
+        "name": "openai/gpt-oss-20b",
+        "api_base": "https://api.together.xyz/v1",
+        "api_key_env_var": "TOGETHER_API_KEY",
+        "supports_json": False,
+        "api_spec": "openai",
+        "input_cost": 0.18,
+        "output_cost": 0.59,
+        "requests_per_minute": None,
+        "tokens_per_minute": None,
+        "reasoning_model": True
+    },
     # █████████ █████
     # ███░░░░░███ ░░███
     # ███ ░░░ ██████ ░███████ ██████ ████████ ██████
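A quick way to confirm the Together-hosted entries are picked up, assuming the builtin dicts are materialized into the registry as APIModel instances at import time:

```python
from lm_deluge.models import registry  # registry maps model ids to APIModel entries

for key in ("gpt-oss-120b-together", "gpt-oss-20b-together"):
    model = registry[key]
    print(key, "->", model.api_base, "| reasoning_model:", model.reasoning_model)
```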
@@ -1210,6 +1292,7 @@ class APIModel:
     api_base: str
     api_key_env_var: str
     api_spec: str
+    cached_input_cost: float | None = 0
     input_cost: float | None = 0  # $ per million input tokens
     output_cost: float | None = 0  # $ per million output tokens
     supports_json: bool = False
@@ -1242,7 +1325,7 @@ class APIModel:
         random.sample(regions, 1, counts=weights)[0]
 
     def make_request(self, context: RequestContext):  # -> "APIRequestBase"
-        from
+        from ..api_requests.common import CLASSES
 
         api_spec = self.api_spec
         if (
@@ -1268,6 +1351,7 @@ def register_model(
     api_key_env_var: str,
     api_spec: str,
     input_cost: float | None = 0,  # $ per million input tokens
+    cached_input_cost: float | None = 0,
     output_cost: float | None = 0,  # $ per million output tokens
     supports_json: bool = False,
     supports_logprobs: bool = False,
@@ -1275,7 +1359,7 @@ def register_model(
     reasoning_model: bool = False,
     regions: list[str] | dict[str, int] = field(default_factory=list),
     tokens_per_minute: int | None = None,
-    requests_per_minute: int | None = None
+    requests_per_minute: int | None = None,
 ) -> APIModel:
     """Register a model configuration and return the created APIModel."""
     model = APIModel(
@@ -1284,6 +1368,7 @@ def register_model(
         api_base=api_base,
         api_key_env_var=api_key_env_var,
         api_spec=api_spec,
+        cached_input_cost=cached_input_cost,
         input_cost=input_cost,
         output_cost=output_cost,
         supports_json=supports_json,
@@ -1292,7 +1377,7 @@ def register_model(
         reasoning_model=reasoning_model,
         regions=regions,
         tokens_per_minute=tokens_per_minute,
-        requests_per_minute=requests_per_minute
+        requests_per_minute=requests_per_minute,
     )
     registry[model.id] = model
     return model
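With the trailing commas fixed and cached_input_cost threaded through, registering a custom OpenAI-compatible endpoint could look like the sketch below. Only the parameters visible in this diff are certain; the leading id/name arguments are assumed from the APIModel fields, and the endpoint and env var are placeholders.

```python
from lm_deluge.models import register_model

register_model(
    id="my-gpt-5-proxy",                          # hypothetical registry key
    name="gpt-5",                                 # upstream model name (assumed parameter)
    api_base="https://llm-proxy.example.com/v1",  # placeholder endpoint
    api_key_env_var="MY_PROXY_API_KEY",
    api_spec="openai",
    input_cost=1.25,
    cached_input_cost=0.125,
    output_cost=10.0,
    reasoning_model=True,
)
```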
lm_deluge-0.0.35/src/lm_deluge/util/harmony.py
ADDED

@@ -0,0 +1,45 @@
+# sample thing we'd want to parse from llama.cpp
+# the goal here is: barebones inference implementation returns
+# raw harmony string; we parse into content blocks
+
+# implied: <|start|>assistant
+# <|channel|>analysis<|message|>We need to respond as a helpful assistant. The user says "who are you and what do you want with my family?" This is a normal question. We should answer that we are ChatGPT, an AI language model, and we don't want anything with their family. We reassure them.<|start|>assistant<|channel|>final<|message|>I’m ChatGPT, a large language‑model AI created by OpenAI. I don’t have personal intentions or desires, and I’m not able to interact with anyone outside of this chat. My only goal here is to provide information, answer questions, and help you with whatever you need—nothing more, nothing less. If you have any concerns or need help with something specific, just let me know!
+#
+import copy
+from lm_deluge.api_requests.response import APIResponse
+from lm_deluge.prompt import Text, Thinking
+
+SAMPLE_INPUT = '''
+<|channel|>analysis<|message|>We need to respond as a helpful assistant. The user says "who are you and what do you want with my family?" This is a normal question. We should answer that we are ChatGPT, an AI language model, and we don't want anything with their family. We reassure them.<|start|>assistant<|channel|>final<|message|>I’m ChatGPT, a large language‑model AI created by OpenAI. I don’t have personal intentions or desires, and I’m not able to interact with anyone outside of this chat. My only goal here is to provide information, answer questions, and help you with whatever you need—nothing more, nothing less. If you have any concerns or need help with something specific, just let me know!
+'''.strip()
+
+def _split_messages(response: str):
+    raw_messages = response.split("<|start|>")
+    messages = []
+    for msg in raw_messages:
+        channel, content = msg.split("<|message|>")
+        channel = channel.split("<|channel|>")[1]
+        messages.append((channel, content))
+
+    return messages
+
+def postprocess_harmony(response: APIResponse) -> APIResponse:
+    if not response.content:
+        return response
+
+    parts = response.content.parts
+    assert len(parts) == 1, "expected 1 parts to convert harmony"
+    text = parts[0].text  # type: ignore
+    messages = _split_messages(text)
+
+    new_parts = []
+    for channel, content in messages:
+        if channel == "analysis":
+            new_parts.append(Thinking(content=content))
+        elif channel == "final":
+            new_parts.append(Text(text=content))
+
+    new_response = copy.deepcopy(response)
+    new_response.content.parts = new_parts  # type: ignore
+
+    return new_response
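The harmony parser is exactly the kind of hook the new postprocess option exists for: pair a gpt-oss model served behind an OpenAI-compatible endpoint with postprocess_harmony and the raw transcript is split into parts before it reaches the caller. Whether LLMClient is re-exported at the package root is assumed.

```python
from lm_deluge import LLMClient                        # import path assumed
from lm_deluge.util.harmony import postprocess_harmony

client = LLMClient("gpt-oss-120b-together", postprocess=postprocess_harmony)
# Every APIResponse now has its harmony transcript split into
# Thinking ("analysis" channel) and Text ("final" channel) parts.
```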
{lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/SOURCES.txt
CHANGED

@@ -12,7 +12,6 @@ src/lm_deluge/errors.py
 src/lm_deluge/file.py
 src/lm_deluge/gemini_limits.py
 src/lm_deluge/image.py
-src/lm_deluge/models.py
 src/lm_deluge/prompt.py
 src/lm_deluge/request_context.py
 src/lm_deluge/rerank.py
@@ -51,6 +50,8 @@ src/lm_deluge/llm_tools/locate.py
 src/lm_deluge/llm_tools/ocr.py
 src/lm_deluge/llm_tools/score.py
 src/lm_deluge/llm_tools/translate.py
+src/lm_deluge/models/__init__.py
+src/lm_deluge/util/harmony.py
 src/lm_deluge/util/json.py
 src/lm_deluge/util/logprobs.py
 src/lm_deluge/util/spatial.py