lm-deluge 0.0.57__tar.gz → 0.0.59__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lm_deluge-0.0.57/src/lm_deluge.egg-info → lm_deluge-0.0.59}/PKG-INFO +1 -1
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/pyproject.toml +1 -1
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/anthropic.py +1 -1
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/base.py +87 -5
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/openai.py +41 -3
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/batches.py +25 -9
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/client.py +82 -38
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/__init__.py +8 -8
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/anthropic.py +12 -20
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/bedrock.py +0 -14
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/cohere.py +0 -16
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/google.py +0 -20
- lm_deluge-0.0.59/src/lm_deluge/models/grok.py +82 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/groq.py +2 -2
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/meta.py +0 -8
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/openai.py +0 -34
- lm_deluge-0.0.59/src/lm_deluge/models/openrouter.py +64 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/together.py +0 -16
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/prompt.py +19 -7
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/request_context.py +9 -11
- {lm_deluge-0.0.57 → lm_deluge-0.0.59/src/lm_deluge.egg-info}/PKG-INFO +1 -1
- lm_deluge-0.0.57/src/lm_deluge/models/grok.py +0 -38
- lm_deluge-0.0.57/src/lm_deluge/models/openrouter.py +0 -1
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/LICENSE +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/README.md +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/setup.cfg +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/__init__.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/bedrock.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/common.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/gemini.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/mistral.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/api_requests/response.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/built_in_tools/base.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/built_in_tools/openai.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/cli.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/config.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/file.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/classify.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/extract.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/locate.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/ocr.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/cerebras.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/deepseek.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/fireworks.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/models/mistral.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/presets/cerebras.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/presets/meta.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/tool.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/tracker.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/usage.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/util/harmony.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/util/spatial.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge.egg-info/SOURCES.txt +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge.egg-info/requires.txt +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/tests/test_builtin_tools.py +0 -0
- {lm_deluge-0.0.57 → lm_deluge-0.0.59}/tests/test_native_mcp_server.py +0 -0
src/lm_deluge/api_requests/anthropic.py

@@ -72,7 +72,7 @@ def _build_anthropic_request(
         request_json["system"] = system_message

     # handle temp + top_p for opus 4.1/sonnet 4.5
-    if model.name
+    if "4-1" in model.name or "4-5" in model.name:
         if "temperature" in request_json and "top_p" in request_json:
             request_json.pop("top_p")

src/lm_deluge/api_requests/base.py

@@ -1,4 +1,5 @@
 import asyncio
+import time
 import traceback
 from abc import ABC, abstractmethod

@@ -6,6 +7,7 @@ import aiohttp
 from aiohttp import ClientResponse

 from ..errors import raise_if_modal_exception
+from ..models.openai import OPENAI_MODELS
 from ..request_context import RequestContext
 from .response import APIResponse

@@ -82,15 +84,95 @@ class APIRequestBase(ABC):
         if self.context.status_tracker:
             self.context.status_tracker.task_succeeded(self.context.task_id)

+    async def _execute_once_background_mode(self) -> APIResponse:
+        """
+        ONLY for OpenAI responses API. Implement the
+        start -> poll -> result style of request.
+        """
+        assert self.context.status_tracker, "no status tracker"
+        start_time = time.time()
+        async with aiohttp.ClientSession() as session:
+            last_status: str | None = None
+
+            try:
+                self.context.status_tracker.total_requests += 1
+                assert self.url is not None, "URL is not set"
+                async with session.post(
+                    url=self.url,
+                    headers=self.request_header,
+                    json=self.request_json,
+                ) as http_response:
+                    # make sure we created the Response object
+                    http_response.raise_for_status()
+                    data = await http_response.json()
+                    response_id = data["id"]
+                    last_status = data["status"]
+
+                while True:
+                    if time.time() - start_time > self.context.request_timeout:
+                        # cancel the response
+                        async with session.post(
+                            url=f"{self.url}/{response_id}/cancel",
+                            headers=self.request_header,
+                        ) as http_response:
+                            http_response.raise_for_status()
+
+                        return APIResponse(
+                            id=self.context.task_id,
+                            model_internal=self.context.model_name,
+                            prompt=self.context.prompt,
+                            sampling_params=self.context.sampling_params,
+                            status_code=None,
+                            is_error=True,
+                            error_message="Request timed out (terminated by client).",
+                            content=None,
+                            usage=None,
+                        )
+                    # poll for the response
+                    await asyncio.sleep(5.0)
+                    async with session.get(
+                        url=f"{self.url}/{response_id}",
+                        headers=self.request_header,
+                    ) as http_response:
+                        http_response.raise_for_status()
+                        data = await http_response.json()
+
+                        if data["status"] != last_status:
+                            print(
+                                f"Background req {response_id} status updated to: {data['status']}"
+                            )
+                        last_status = data["status"]
+                        if last_status not in ["queued", "in_progress"]:
+                            return await self.handle_response(http_response)
+
+            except Exception as e:
+                raise_if_modal_exception(e)
+                tb = traceback.format_exc()
+                print(tb)
+                return APIResponse(
+                    id=self.context.task_id,
+                    model_internal=self.context.model_name,
+                    prompt=self.context.prompt,
+                    sampling_params=self.context.sampling_params,
+                    status_code=None,
+                    is_error=True,
+                    error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
+                    content=None,
+                    usage=None,
+                )
+
     async def execute_once(self) -> APIResponse:
         """Send the HTTP request once and return the parsed APIResponse."""
         await self.build_request()
         assert self.context.status_tracker
-
-
-
-
-
+
+        if (
+            self.context.background
+            and self.context.use_responses_api
+            and self.context.model_name in OPENAI_MODELS
+        ):
+            return await self._execute_once_background_mode()
+
         try:
             self.context.status_tracker.total_requests += 1
             timeout = aiohttp.ClientTimeout(total=self.context.request_timeout)
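The new `_execute_once_background_mode` method is the usual start → poll → cancel loop against the OpenAI Responses API. A minimal standalone sketch of the same flow outside the library, assuming the public `/v1/responses` endpoints (the `{id}` and `{id}/cancel` paths match the URLs built in the diff above; model name and timeout are placeholders):

```python
import asyncio
import os
import time

import aiohttp

API = "https://api.openai.com/v1/responses"
HEADERS = {"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}


async def run_background(prompt: str, timeout_s: float = 600.0) -> dict:
    async with aiohttp.ClientSession() as session:
        # start: create the response with background=True
        async with session.post(
            API,
            headers=HEADERS,
            json={"model": "gpt-4.1-mini", "input": prompt, "background": True},
        ) as resp:
            resp.raise_for_status()
            data = await resp.json()

        response_id = data["id"]
        start = time.time()
        # poll until the response leaves the queued/in_progress states
        while data["status"] in ("queued", "in_progress"):
            if time.time() - start > timeout_s:
                # give up: cancel the background response server-side
                async with session.post(f"{API}/{response_id}/cancel", headers=HEADERS) as c:
                    c.raise_for_status()
                raise TimeoutError("background response cancelled by client")
            await asyncio.sleep(5.0)
            async with session.get(f"{API}/{response_id}", headers=HEADERS) as resp:
                resp.raise_for_status()
                data = await resp.json()
        return data  # terminal status: completed, failed, cancelled, or incomplete


# asyncio.run(run_background("Write a haiku about rate limits."))
```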
src/lm_deluge/api_requests/openai.py

@@ -30,6 +30,26 @@ async def _build_oa_chat_request(
         "temperature": sampling_params.temperature,
         "top_p": sampling_params.top_p,
     }
+    if context.service_tier:
+        assert context.service_tier in [
+            "auto",
+            "default",
+            "flex",
+            "priority",
+        ], f"Invalid service tier: {context.service_tier}"
+        # flex is only supported for o3, o4-mini, gpt-5 models
+        if context.service_tier == "flex":
+            model_supports_flex = any(x in model.id for x in ["o3", "o4-mini", "gpt-5"])
+            if not model_supports_flex:
+                print(
+                    f"WARNING: service_tier='flex' only supported for o3, o4-mini, gpt-5. "
+                    f"Using 'auto' instead for model {model.id}."
+                )
+                request_json["service_tier"] = "auto"
+            else:
+                request_json["service_tier"] = context.service_tier
+        else:
+            request_json["service_tier"] = context.service_tier
     # set max_tokens or max_completion_tokens dep. on provider
     if "cohere" in model.api_base:
         request_json["max_tokens"] = sampling_params.max_new_tokens
@@ -213,9 +233,6 @@ class OpenAIRequest(APIRequestBase):
 async def _build_oa_responses_request(
     model: APIModel,
     context: RequestContext,
-    # prompt: Conversation,
-    # tools: list[Tool] | None,
-    # sampling_params: SamplingParams,
 ):
     prompt = context.prompt
     sampling_params = context.sampling_params
@@ -226,7 +243,28 @@ async def _build_oa_responses_request(
         "input": openai_responses_format["input"],
         "temperature": sampling_params.temperature,
         "top_p": sampling_params.top_p,
+        "background": context.background or False,
     }
+    if context.service_tier:
+        assert context.service_tier in [
+            "auto",
+            "default",
+            "flex",
+            "priority",
+        ], f"Invalid service tier: {context.service_tier}"
+        # flex is only supported for o3, o4-mini, gpt-5 models
+        if context.service_tier == "flex":
+            model_supports_flex = any(x in model.id for x in ["o3", "o4-mini", "gpt-5"])
+            if not model_supports_flex:
+                print(
+                    f"WARNING: service_tier='flex' only supported for o3, o4-mini, gpt-5. "
+                    f"Model {model.id} doesn't support flex. Using 'auto' instead."
+                )
+                request_json["service_tier"] = "auto"
+            else:
+                request_json["service_tier"] = context.service_tier
+        else:
+            request_json["service_tier"] = context.service_tier
     if sampling_params.max_new_tokens:
         request_json["max_output_tokens"] = sampling_params.max_new_tokens

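The same service-tier validation and flex fallback is inlined in both the chat and responses builders. Distilled into a standalone helper for clarity (the name `resolve_service_tier` is illustrative, not part of lm_deluge's API):

```python
# Hypothetical helper mirroring the fallback logic added in this release;
# lm_deluge repeats this inline in both request builders rather than sharing it.
def resolve_service_tier(model_id: str, service_tier: str | None) -> str | None:
    if service_tier is None:
        return None
    if service_tier not in ("auto", "default", "flex", "priority"):
        raise ValueError(f"Invalid service tier: {service_tier}")
    # flex is only offered for o3, o4-mini, and gpt-5 family models;
    # anything else falls back to auto (with a warning in the real code)
    if service_tier == "flex" and not any(
        x in model_id for x in ("o3", "o4-mini", "gpt-5")
    ):
        return "auto"
    return service_tier


assert resolve_service_tier("gpt-5-mini", "flex") == "flex"
assert resolve_service_tier("gpt-4.1", "flex") == "auto"
```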
src/lm_deluge/batches.py

@@ -3,7 +3,7 @@ import json
 import os
 import tempfile
 import time
-from typing import Literal, Sequence
+from typing import Literal, Sequence, cast

 import aiohttp
 from rich.console import Console
@@ -16,7 +16,12 @@ from lm_deluge.api_requests.anthropic import _build_anthropic_request
 from lm_deluge.api_requests.openai import _build_oa_chat_request
 from lm_deluge.config import SamplingParams
 from lm_deluge.models import APIModel, registry
-from lm_deluge.prompt import
+from lm_deluge.prompt import (
+    CachePattern,
+    Conversation,
+    Prompt,
+    prompts_to_conversations,
+)
 from lm_deluge.request_context import RequestContext


@@ -166,14 +171,18 @@ async def _submit_anthropic_batch(file_path: str, headers: dict, model: str):
 async def create_batch_files_oa(
     model: str,
     sampling_params: SamplingParams,
-    prompts:
+    prompts: Prompt | Sequence[Prompt],
     batch_size: int = 50_000,
     destination: str | None = None,  # if none provided, temp files
 ):
     MAX_BATCH_SIZE_BYTES = 200 * 1024 * 1024  # 200MB
     MAX_BATCH_SIZE_ITEMS = batch_size

-
+    if not isinstance(prompts, list):
+        prompts = cast(Sequence[Prompt], [prompts])
+
+    prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))
+    assert isinstance(prompts, Sequence)
     if any(p is None for p in prompts):
         raise ValueError("All prompts must be valid.")

@@ -251,14 +260,18 @@ async def create_batch_files_oa(
 async def submit_batches_oa(
     model: str,
     sampling_params: SamplingParams,
-    prompts:
+    prompts: Prompt | Sequence[Prompt],
     batch_size: int = 50_000,
 ):
     """Write OpenAI batch requests to a file and submit."""
     MAX_BATCH_SIZE_BYTES = 200 * 1024 * 1024  # 200MB
     MAX_BATCH_SIZE_ITEMS = batch_size

-
+    if not isinstance(prompts, list):
+        prompts = prompts = cast(Sequence[Prompt], [prompts])
+
+    prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))
+    assert isinstance(prompts, Sequence)
     if any(p is None for p in prompts):
         raise ValueError("All prompts must be valid.")

@@ -342,7 +355,7 @@ async def submit_batches_oa(
 async def submit_batches_anthropic(
     model: str,
     sampling_params: SamplingParams,
-    prompts:
+    prompts: Prompt | Sequence[Prompt],
     *,
     cache: CachePattern | None = None,
     batch_size=100_000,
@@ -362,13 +375,16 @@ async def submit_batches_anthropic(
     MAX_BATCH_SIZE_ITEMS = batch_size

     # Convert prompts to Conversations
-
+    if not isinstance(prompts, list):
+        prompts = prompts = cast(Sequence[Prompt], [prompts])
+
+    prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))

     request_headers = None
     batch_tasks = []
     current_batch = []
     current_batch_size = 0
-
+    assert isinstance(prompts, Sequence)
     for idx, prompt in enumerate(prompts):
         assert isinstance(prompt, Conversation)
         context = RequestContext(
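All three batch entry points now take `Prompt | Sequence[Prompt]` and funnel through the same accept-one-or-many normalization before converting to `Conversation`s. The general shape of that check, distilled with generic names (not lm_deluge API):

```python
from typing import Sequence, TypeVar

T = TypeVar("T")


def ensure_sequence(value: T | Sequence[T]) -> Sequence[T]:
    """Wrap a single item in a list; pass lists through unchanged.

    Note: a str is itself a Sequence, which is why the real code above checks
    `isinstance(prompts, list)` rather than `isinstance(prompts, Sequence)` -
    a bare string prompt must be treated as one item, not as characters.
    """
    if isinstance(value, list):
        return value
    return [value]
```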
src/lm_deluge/client.py

@@ -1,5 +1,14 @@
 import asyncio
-from typing import
+from typing import (
+    Any,
+    AsyncGenerator,
+    Callable,
+    Literal,
+    Self,
+    Sequence,
+    cast,
+    overload,
+)

 import numpy as np
 import yaml
@@ -12,7 +21,12 @@ from lm_deluge.batches import (
     submit_batches_oa,
     wait_for_batch_completion_async,
 )
-from lm_deluge.prompt import
+from lm_deluge.prompt import (
+    CachePattern,
+    Conversation,
+    Prompt,
+    prompts_to_conversations,
+)
 from lm_deluge.tool import MCPServer, Tool

 from .api_requests.base import APIResponse
@@ -40,6 +54,9 @@ class _LLMClient(BaseModel):
     request_timeout: int = 30
     cache: Any = None
     extra_headers: dict[str, str] | None = None
+    extra_body: dict[str, str] | None = None
+    use_responses_api: bool = False
+    background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
     temperature: float = 0.75
@@ -171,6 +188,11 @@ class _LLMClient(BaseModel):
         # normalize weights
         self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+        # background mode only allowed for responses api
+        if self.background:
+            assert (
+                self.use_responses_api
+            ), "background mode only allowed for responses api"
         # Auto-generate name if not provided
         if self.name is None:
             if len(self.model_names) == 1:
@@ -256,13 +278,6 @@ class _LLMClient(BaseModel):
             # Idle wait before next capacity check. Aim for ~RPM spacing.
             await asyncio.sleep(max(60.0 / self.max_requests_per_minute, 0.01))

-    async def _execute_request(self, context: RequestContext) -> APIResponse:
-        """Create and send a single API request using the provided context."""
-        model_obj = APIModel.from_registry(context.model_name)
-        request = model_obj.make_request(context)
-        response = await request.execute_once()
-        return response
-
     async def process_single_request(
         self, context: RequestContext, retry_queue: asyncio.Queue | None = None
     ) -> APIResponse:
@@ -290,7 +305,9 @@ class _LLMClient(BaseModel):
         # Execute single request
         assert context.status_tracker
         context.status_tracker.update_pbar()
-
+        model_obj = APIModel.from_registry(context.model_name)
+        request = model_obj.make_request(context)
+        response = await request.execute_once()

         # Handle successful response
         if not response.is_error:
@@ -350,44 +367,46 @@ class _LLMClient(BaseModel):
     @overload
     async def process_prompts_async(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[True],
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
     ) -> list[str | None]: ...

     @overload
     async def process_prompts_async(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: Literal[False] = ...,
         show_progress: bool = ...,
         tools: list[Tool | dict | MCPServer] | None = ...,
         cache: CachePattern | None = ...,
-
-    ) -> list[APIResponse
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
+    ) -> list[APIResponse]: ...

     async def process_prompts_async(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress: bool = True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-
-    ) -> list[APIResponse
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> list[APIResponse] | list[str | None] | dict[str, int]:
         """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.

         This implementation creates all tasks upfront and waits for them to complete,
         avoiding issues with tracker state accumulating across multiple calls.
         """
         # Convert prompts to Conversations
-
+        if not isinstance(prompts, list):
+            prompts = prompts = cast(Sequence[Prompt], [prompts])
+        prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))

         # Ensure tracker exists (start_nowait will call add_to_total for each task)
         if self._tracker is None:
@@ -398,13 +417,14 @@ class _LLMClient(BaseModel):

         # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
         task_ids = []
+        assert isinstance(prompts, Sequence)
         for prompt in prompts:
             assert isinstance(prompt, Conversation)
             task_id = self.start_nowait(
                 prompt,
                 tools=tools,
                 cache=cache,
-
+                service_tier=service_tier,
             )
             task_ids.append(task_id)

@@ -443,13 +463,12 @@ class _LLMClient(BaseModel):

     def process_prompts_sync(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         return_completions_only: bool = False,
         show_progress=True,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-        use_responses_api: bool = False,
     ):
         return asyncio.run(
             self.process_prompts_async(
@@ -458,7 +477,6 @@ class _LLMClient(BaseModel):
                 show_progress=show_progress,
                 tools=tools,
                 cache=cache,
-                use_responses_api=use_responses_api,
             )
         )

@@ -478,18 +496,18 @@ class _LLMClient(BaseModel):

     def start_nowait(
         self,
-        prompt:
+        prompt: Prompt,
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
     ) -> int:
         tracker = self._get_tracker()
         task_id = self._next_task_id
         self._next_task_id += 1
         model, sampling_params = self._select_model()
-
-
+        prompt = prompts_to_conversations([prompt])[0]
+        assert isinstance(prompt, Conversation)
         context = RequestContext(
             task_id=task_id,
             model_name=model,
@@ -500,7 +518,9 @@ class _LLMClient(BaseModel):
             status_tracker=tracker,
             tools=tools,
             cache=cache,
-            use_responses_api=use_responses_api,
+            use_responses_api=self.use_responses_api,
+            background=self.background,
+            service_tier=service_tier,
             extra_headers=self.extra_headers,
             force_local_mcp=self.force_local_mcp,
         )
@@ -515,29 +535,41 @@ class _LLMClient(BaseModel):
         *,
         tools: list[Tool | dict | MCPServer] | None = None,
         cache: CachePattern | None = None,
-
-    ) -> APIResponse
+        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
+    ) -> APIResponse:
         task_id = self.start_nowait(
-            prompt, tools=tools, cache=cache,
+            prompt, tools=tools, cache=cache, service_tier=service_tier
         )
         return await self.wait_for(task_id)

-    async def wait_for(self, task_id: int) -> APIResponse
+    async def wait_for(self, task_id: int) -> APIResponse:
         task = self._tasks.get(task_id)
         if task:
             return await task
-
+        res = self._results.get(task_id)
+        if res:
+            return res
+        else:
+            return APIResponse(
+                id=-1,
+                model_internal="",
+                prompt=Conversation([]),
+                sampling_params=SamplingParams(),
+                status_code=500,
+                is_error=True,
+                error_message="Task not found",
+            )

     async def wait_for_all(
         self, task_ids: Sequence[int] | None = None
-    ) -> list[APIResponse
+    ) -> list[APIResponse]:
         if task_ids is None:
             task_ids = list(self._tasks.keys())
         return [await self.wait_for(tid) for tid in task_ids]

     async def as_completed(
         self, task_ids: Sequence[int] | None = None
-    ) -> AsyncGenerator[tuple[int, APIResponse
+    ) -> AsyncGenerator[tuple[int, APIResponse], None]:
         """Yield ``(task_id, result)`` pairs as tasks complete.

         Args:
@@ -561,7 +593,9 @@ class _LLMClient(BaseModel):
         for task in list(tasks_map.keys()):
             if task.done():
                 tid = tasks_map.pop(task)
-
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

         while tasks_map:
             done, _ = await asyncio.wait(
@@ -569,7 +603,9 @@ class _LLMClient(BaseModel):
             )
             for task in done:
                 tid = tasks_map.pop(task)
-
+                task_result = self._results.get(tid, await task)
+                assert task_result
+                yield tid, task_result

     async def stream(
         self,
@@ -682,7 +718,7 @@ class _LLMClient(BaseModel):

     async def submit_batch_job(
         self,
-        prompts:
+        prompts: Prompt | Sequence[Prompt],
         *,
         tools: list[Tool] | None = None,
         cache: CachePattern | None = None,
@@ -744,6 +780,8 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
@@ -771,6 +809,8 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
@@ -797,6 +837,8 @@ def LLMClient(
     request_timeout: int = 30,
     cache: Any = None,
     extra_headers: dict[str, str] | None = None,
+    use_responses_api: bool = False,
+    background: bool = False,
     temperature: float = 0.75,
     top_p: float = 1.0,
     json_mode: bool = False,
@@ -835,6 +877,8 @@ def LLMClient(
         request_timeout=request_timeout,
         cache=cache,
         extra_headers=extra_headers,
+        use_responses_api=use_responses_api,
+        background=background,
         temperature=temperature,
         top_p=top_p,
         json_mode=json_mode,
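Putting the client-level changes together: `use_responses_api` and `background` are now client-wide settings on the `LLMClient` factory, while `service_tier` is passed per call to `process_prompts_async` / `start_nowait`. A hedged usage sketch (the top-level `LLMClient` import, positional model argument, model name, and prompt are assumptions based on earlier releases; defaults may differ):

```python
import asyncio

from lm_deluge import LLMClient  # assumed top-level export, as in prior versions


async def main() -> None:
    client = LLMClient(
        "gpt-5-mini",             # illustrative model name
        use_responses_api=True,   # background mode requires the responses API
        background=True,          # start -> poll -> result instead of one long request
        request_timeout=900,      # background polling gives up at this client-side deadline
    )
    responses = await client.process_prompts_async(
        ["Summarize the plot of Hamlet in one sentence."],
        service_tier="flex",      # falls back to "auto" for models without flex support
    )
    print(responses[0])


# asyncio.run(main())
```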
src/lm_deluge/models/__init__.py

@@ -38,9 +38,9 @@ class APIModel:
     supports_responses: bool = False
     reasoning_model: bool = False
     regions: list[str] | dict[str, int] = field(default_factory=list)
-    tokens_per_minute: int | None = None
-    requests_per_minute: int | None = None
-    gpus: list[str] | None = None
+    # tokens_per_minute: int | None = None
+    # requests_per_minute: int | None = None
+    # gpus: list[str] | None = None

     @classmethod
     def from_registry(cls, name: str):
@@ -62,7 +62,7 @@ class APIModel:
             raise ValueError("no regions to sample")
         random.sample(regions, 1, counts=weights)[0]

-    def make_request(self, context: RequestContext):
+    def make_request(self, context: RequestContext):
         from ..api_requests.common import CLASSES

         api_spec = self.api_spec
@@ -97,8 +97,8 @@ def register_model(
     supports_responses: bool = False,
     reasoning_model: bool = False,
     regions: list[str] | dict[str, int] = field(default_factory=list),
-    tokens_per_minute: int | None = None,
-    requests_per_minute: int | None = None,
+    # tokens_per_minute: int | None = None,
+    # requests_per_minute: int | None = None,
 ) -> APIModel:
     """Register a model configuration and return the created APIModel."""
     model = APIModel(
@@ -116,8 +116,8 @@ def register_model(
         supports_responses=supports_responses,
         reasoning_model=reasoning_model,
         regions=regions,
-        tokens_per_minute=tokens_per_minute,
-        requests_per_minute=requests_per_minute,
+        # tokens_per_minute=tokens_per_minute,
+        # requests_per_minute=requests_per_minute,
     )
     registry[model.id] = model
     return model