lm-deluge 0.0.58__tar.gz → 0.0.60__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lm-deluge might be problematic.
- {lm_deluge-0.0.58/src/lm_deluge.egg-info → lm_deluge-0.0.60}/PKG-INFO +1 -1
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/pyproject.toml +1 -1
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/base.py +87 -5
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/bedrock.py +3 -4
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/gemini.py +7 -6
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/mistral.py +8 -9
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/openai.py +57 -16
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/batches.py +25 -9
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/client.py +187 -31
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/__init__.py +1 -1
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/openai.py +28 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/prompt.py +89 -21
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/request_context.py +9 -11
- lm_deluge-0.0.60/src/lm_deluge/warnings.py +46 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60/src/lm_deluge.egg-info}/PKG-INFO +1 -1
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge.egg-info/SOURCES.txt +1 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/LICENSE +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/README.md +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/setup.cfg +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/__init__.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/anthropic.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/common.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/api_requests/response.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/built_in_tools/base.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/built_in_tools/openai.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/cli.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/config.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/file.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/classify.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/extract.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/locate.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/ocr.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/anthropic.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/bedrock.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/cerebras.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/cohere.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/deepseek.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/fireworks.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/google.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/grok.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/groq.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/meta.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/mistral.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/openrouter.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/models/together.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/presets/cerebras.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/presets/meta.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/tool.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/tracker.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/usage.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/util/harmony.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/util/spatial.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge.egg-info/requires.txt +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/tests/test_builtin_tools.py +0 -0
- {lm_deluge-0.0.58 → lm_deluge-0.0.60}/tests/test_native_mcp_server.py +0 -0
src/lm_deluge/api_requests/base.py

@@ -1,4 +1,5 @@
 import asyncio
+import time
 import traceback
 from abc import ABC, abstractmethod

@@ -6,6 +7,7 @@ import aiohttp
 from aiohttp import ClientResponse

 from ..errors import raise_if_modal_exception
+from ..models.openai import OPENAI_MODELS
 from ..request_context import RequestContext
 from .response import APIResponse

@@ -82,15 +84,95 @@ class APIRequestBase(ABC):
         if self.context.status_tracker:
             self.context.status_tracker.task_succeeded(self.context.task_id)

+    async def _execute_once_background_mode(self) -> APIResponse:
+        """
+        ONLY for OpenAI responses API. Implement the
+        start -> poll -> result style of request.
+        """
+        assert self.context.status_tracker, "no status tracker"
+        start_time = time.time()
+        async with aiohttp.ClientSession() as session:
+            last_status: str | None = None
+
+            try:
+                self.context.status_tracker.total_requests += 1
+                assert self.url is not None, "URL is not set"
+                async with session.post(
+                    url=self.url,
+                    headers=self.request_header,
+                    json=self.request_json,
+                ) as http_response:
+                    # make sure we created the Response object
+                    http_response.raise_for_status()
+                    data = await http_response.json()
+                    response_id = data["id"]
+                    last_status = data["status"]
+
+                while True:
+                    if time.time() - start_time > self.context.request_timeout:
+                        # cancel the response
+                        async with session.post(
+                            url=f"{self.url}/{response_id}/cancel",
+                            headers=self.request_header,
+                        ) as http_response:
+                            http_response.raise_for_status()
+
+                        return APIResponse(
+                            id=self.context.task_id,
+                            model_internal=self.context.model_name,
+                            prompt=self.context.prompt,
+                            sampling_params=self.context.sampling_params,
+                            status_code=None,
+                            is_error=True,
+                            error_message="Request timed out (terminated by client).",
+                            content=None,
+                            usage=None,
+                        )
+                    # poll for the response
+                    await asyncio.sleep(5.0)
+                    async with session.get(
+                        url=f"{self.url}/{response_id}",
+                        headers=self.request_header,
+                    ) as http_response:
+                        http_response.raise_for_status()
+                        data = await http_response.json()
+
+                        if data["status"] != last_status:
+                            print(
+                                f"Background req {response_id} status updated to: {data['status']}"
+                            )
+                        last_status = data["status"]
+                        if last_status not in ["queued", "in_progress"]:
+                            return await self.handle_response(http_response)
+
+            except Exception as e:
+                raise_if_modal_exception(e)
+                tb = traceback.format_exc()
+                print(tb)
+                return APIResponse(
+                    id=self.context.task_id,
+                    model_internal=self.context.model_name,
+                    prompt=self.context.prompt,
+                    sampling_params=self.context.sampling_params,
+                    status_code=None,
+                    is_error=True,
+                    error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
+                    content=None,
+                    usage=None,
+                )
+
     async def execute_once(self) -> APIResponse:
         """Send the HTTP request once and return the parsed APIResponse."""
         await self.build_request()
         assert self.context.status_tracker
-
-
-
-
-
+
+        if (
+            self.context.background
+            and self.context.use_responses_api
+            and self.context.model_name in OPENAI_MODELS
+        ):
+            return await self._execute_once_background_mode()
+
         try:
             self.context.status_tracker.total_requests += 1
             timeout = aiohttp.ClientTimeout(total=self.context.request_timeout)
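For context, the added `_execute_once_background_mode` follows the start → poll → cancel-on-timeout flow of the OpenAI Responses API background mode. Below is a minimal standalone sketch of the same pattern, assuming the public `https://api.openai.com/v1/responses` endpoints and an `OPENAI_API_KEY` environment variable; the model name is a placeholder and is not taken from the library, while the 5-second poll interval mirrors the code above.

```python
# Standalone sketch of the start -> poll -> cancel-on-timeout pattern used above.
# The endpoints and payload follow the public OpenAI Responses API; the model
# name is a placeholder and error handling is reduced to the bare minimum.
import asyncio
import os
import time

import aiohttp

RESPONSES_URL = "https://api.openai.com/v1/responses"
HEADERS = {"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}


async def run_in_background(prompt: str, timeout_s: float = 300.0) -> dict:
    start = time.time()
    async with aiohttp.ClientSession() as session:
        # 1. start a background response
        async with session.post(
            RESPONSES_URL,
            headers=HEADERS,
            json={"model": "gpt-4.1-mini", "input": prompt, "background": True},
        ) as resp:
            resp.raise_for_status()
            data = await resp.json()
        response_id = data["id"]

        # 2. poll every 5 seconds until the response leaves queued/in_progress
        while data["status"] in ("queued", "in_progress"):
            if time.time() - start > timeout_s:
                # 3. cancel on client-side timeout, like the library does
                async with session.post(
                    f"{RESPONSES_URL}/{response_id}/cancel", headers=HEADERS
                ) as resp:
                    resp.raise_for_status()
                raise TimeoutError("Request timed out (terminated by client).")
            await asyncio.sleep(5.0)
            async with session.get(
                f"{RESPONSES_URL}/{response_id}", headers=HEADERS
            ) as resp:
                resp.raise_for_status()
                data = await resp.json()
    return data


# asyncio.run(run_in_background("Say hello."))
```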
src/lm_deluge/api_requests/bedrock.py

@@ -1,10 +1,11 @@
 import asyncio
 import json
 import os
-import warnings

 from aiohttp import ClientResponse

+from lm_deluge.warnings import maybe_warn
+
 try:
     from requests_aws4auth import AWS4Auth
 except ImportError:
@@ -187,9 +188,7 @@ async def _build_openai_bedrock_request(
     # Note: GPT-OSS on Bedrock doesn't support response_format parameter
     # Even though the model supports JSON, we can't use the response_format parameter
     if sampling_params.json_mode and model.supports_json:
-
-            f"JSON mode requested for {model.name} but response_format parameter not supported on Bedrock"
-        )
+        maybe_warn("WARN_JSON_MODE_UNSUPPORTED", model_name=model.name)

     if tools:
         request_tools = []
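The `maybe_warn` calls above (and in the following files) come from the new `src/lm_deluge/warnings.py` module (+46 lines), whose body does not appear in this diff. A hypothetical sketch of such a helper, inferred only from the call sites (warning codes plus keyword arguments); the real implementation may word messages differently or not deduplicate at all.

```python
# Hypothetical reconstruction of lm_deluge/warnings.py -- NOT the actual file,
# which is not shown in this diff. Codes and keyword names are taken from the
# call sites; message wording and the once-per-key behavior are assumptions.
import warnings

_TEMPLATES = {
    "WARN_JSON_MODE_UNSUPPORTED": "JSON mode requested for {model_name} but response_format is not supported here.",
    "WARN_REASONING_UNSUPPORTED": "Ignoring reasoning_effort for non-reasoning model: {model_name}.",
    "WARN_LOGPROBS_UNSUPPORTED": "Ignoring logprobs for model without logprobs support: {model_name}.",
    "WARN_CACHING_UNSUPPORTED": "Ignoring cache={cache_param!r}; prompt caching is not supported for {model_name}.",
}

_seen: set[tuple[str, str]] = set()


def maybe_warn(code: str, *, model_name: str = "", **extra) -> None:
    """Emit each (code, model) warning at most once per process (assumed)."""
    key = (code, model_name)
    if key in _seen:
        return
    _seen.add(key)
    warnings.warn(_TEMPLATES[code].format(model_name=model_name, **extra))
```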
src/lm_deluge/api_requests/gemini.py

@@ -1,11 +1,12 @@
 import json
 import os
-import warnings
 from typing import Any
+
 from aiohttp import ClientResponse

 from lm_deluge.request_context import RequestContext
 from lm_deluge.tool import Tool
+from lm_deluge.warnings import maybe_warn

 from ..config import SamplingParams
 from ..models import APIModel
@@ -54,9 +55,7 @@ async def _build_gemini_request(

     else:
         if sampling_params.reasoning_effort:
-
-                f"Ignoring reasoning_effort param for non-reasoning model: {model.name}"
-            )
+            maybe_warn("WARN_REASONING_UNSUPPORTED", model_name=model.name)

     # Add tools if provided
     if tools:
@@ -76,8 +75,10 @@ class GeminiRequest(APIRequestBase):

         # Warn if cache is specified for Gemini model
         if self.context.cache is not None:
-
-
+            maybe_warn(
+                "WARN_CACHING_UNSUPPORTED",
+                model_name=self.context.model_name,
+                cache_param=self.context.cache,
             )

         self.model = APIModel.from_registry(self.context.model_name)
src/lm_deluge/api_requests/mistral.py

@@ -1,9 +1,10 @@
 import json
 import os
-import warnings

 from aiohttp import ClientResponse

+from lm_deluge.warnings import maybe_warn
+
 from ..models import APIModel
 from ..prompt import Message
 from ..request_context import RequestContext
@@ -17,8 +18,10 @@ class MistralRequest(APIRequestBase):

         # Warn if cache is specified for non-Anthropic model
         if self.context.cache is not None:
-
-
+            maybe_warn(
+                "WARN_CACHING_UNSUPPORTED",
+                model_name=self.context.model_name,
+                cache_param=self.context.cache,
             )
         self.model = APIModel.from_registry(self.context.model_name)

@@ -38,13 +41,9 @@ class MistralRequest(APIRequestBase):
             "max_tokens": self.context.sampling_params.max_new_tokens,
         }
         if self.context.sampling_params.reasoning_effort:
-
-                f"Ignoring reasoning_effort param for non-reasoning model: {self.context.model_name}"
-            )
+            maybe_warn("WARN_REASONING_UNSUPPORTED", model_name=self.context.model_name)
         if self.context.sampling_params.logprobs:
-
-                f"Ignoring logprobs param for non-logprobs model: {self.context.model_name}"
-            )
+            maybe_warn("WARN_LOGPROBS_UNSUPPORTED", model_name=self.context.model_name)
         if self.context.sampling_params.json_mode and self.model.supports_json:
             self.request_json["response_format"] = {"type": "json_object"}

src/lm_deluge/api_requests/openai.py

@@ -1,7 +1,6 @@
 import json
 import os
 import traceback as tb
-import warnings
 from types import SimpleNamespace

 import aiohttp
@@ -9,6 +8,7 @@ from aiohttp import ClientResponse

 from lm_deluge.request_context import RequestContext
 from lm_deluge.tool import MCPServer, Tool
+from lm_deluge.warnings import maybe_warn

 from ..config import SamplingParams
 from ..models import APIModel
@@ -30,6 +30,26 @@ async def _build_oa_chat_request(
         "temperature": sampling_params.temperature,
         "top_p": sampling_params.top_p,
     }
+    if context.service_tier:
+        assert context.service_tier in [
+            "auto",
+            "default",
+            "flex",
+            "priority",
+        ], f"Invalid service tier: {context.service_tier}"
+        # flex is only supported for o3, o4-mini, gpt-5 models
+        if context.service_tier == "flex":
+            model_supports_flex = any(x in model.id for x in ["o3", "o4-mini", "gpt-5"])
+            if not model_supports_flex:
+                print(
+                    f"WARNING: service_tier='flex' only supported for o3, o4-mini, gpt-5. "
+                    f"Using 'auto' instead for model {model.id}."
+                )
+                request_json["service_tier"] = "auto"
+            else:
+                request_json["service_tier"] = context.service_tier
+        else:
+            request_json["service_tier"] = context.service_tier
     # set max_tokens or max_completion_tokens dep. on provider
     if "cohere" in model.api_base:
         request_json["max_tokens"] = sampling_params.max_new_tokens
@@ -55,9 +75,8 @@ async def _build_oa_chat_request(
         request_json["reasoning_effort"] = effort
     else:
         if sampling_params.reasoning_effort:
-
-
-            )
+            maybe_warn("WARN_REASONING_UNSUPPORTED", model_name=context.model_name)
+
     if sampling_params.logprobs:
         request_json["logprobs"] = True
         if sampling_params.top_logprobs is not None:
@@ -85,8 +104,10 @@ class OpenAIRequest(APIRequestBase):

         # Warn if cache is specified for non-Anthropic model
         if self.context.cache is not None:
-
-
+            maybe_warn(
+                "WARN_CACHING_UNSUPPORTED",
+                model_name=self.context.model_name,
+                cache_param=self.context.cache,
             )
         self.model = APIModel.from_registry(self.context.model_name)

@@ -213,9 +234,6 @@ class OpenAIRequest(APIRequestBase):
 async def _build_oa_responses_request(
     model: APIModel,
     context: RequestContext,
-    # prompt: Conversation,
-    # tools: list[Tool] | None,
-    # sampling_params: SamplingParams,
 ):
     prompt = context.prompt
     sampling_params = context.sampling_params
@@ -226,7 +244,28 @@ async def _build_oa_responses_request(
         "input": openai_responses_format["input"],
         "temperature": sampling_params.temperature,
         "top_p": sampling_params.top_p,
+        "background": context.background or False,
     }
+    if context.service_tier:
+        assert context.service_tier in [
+            "auto",
+            "default",
+            "flex",
+            "priority",
+        ], f"Invalid service tier: {context.service_tier}"
+        # flex is only supported for o3, o4-mini, gpt-5 models
+        if context.service_tier == "flex":
+            model_supports_flex = any(x in model.id for x in ["o3", "o4-mini", "gpt-5"])
+            if not model_supports_flex:
+                print(
+                    f"WARNING: service_tier='flex' only supported for o3, o4-mini, gpt-5. "
+                    f"Model {model.id} doesn't support flex. Using 'auto' instead."
+                )
+                request_json["service_tier"] = "auto"
+            else:
+                request_json["service_tier"] = context.service_tier
+        else:
+            request_json["service_tier"] = context.service_tier
     if sampling_params.max_new_tokens:
         request_json["max_output_tokens"] = sampling_params.max_new_tokens

@@ -245,9 +284,7 @@ async def _build_oa_responses_request(
         }
     else:
         if sampling_params.reasoning_effort:
-
-                f"Ignoring reasoning_effort for non-reasoning model: {model.id}"
-            )
+            maybe_warn("WARN_REASONING_UNSUPPORTED", model_name=context.model_name)

     if sampling_params.json_mode and model.supports_json:
         request_json["text"] = {"format": {"type": "json_object"}}
@@ -284,8 +321,10 @@ class OpenAIResponsesRequest(APIRequestBase):
         super().__init__(context)
         # Warn if cache is specified for non-Anthropic model
         if self.context.cache is not None:
-
-
+            maybe_warn(
+                "WARN_CACHING_UNSUPPORTED",
+                model_name=self.context.model_name,
+                cache_param=self.context.cache,
             )
         self.model = APIModel.from_registry(self.context.model_name)

@@ -488,8 +527,10 @@ async def stream_chat(
     extra_headers: dict[str, str] | None = None,
 ):
     if cache is not None:
-
-
+        maybe_warn(
+            "WARN_CACHING_UNSUPPORTED",
+            model_name=model_name,
+            cache_param=cache,
         )

     model = APIModel.from_registry(model_name)
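To summarize the `service_tier` logic added in both request builders above: a requested tier is validated against auto/default/flex/priority, and "flex" silently falls back to "auto" for models outside the o3/o4-mini/gpt-5 families. The same rule is shown below as a small pure function, for illustration only; it is not a function in the package.

```python
# Illustration of the fallback rule implemented inline in _build_oa_chat_request
# and _build_oa_responses_request; not an actual lm_deluge function.
def resolve_service_tier(model_id: str, requested: str | None) -> str | None:
    if not requested:
        return None  # the builders leave service_tier out of the request entirely
    assert requested in ("auto", "default", "flex", "priority"), requested
    if requested == "flex" and not any(
        x in model_id for x in ("o3", "o4-mini", "gpt-5")
    ):
        return "auto"  # flex is only offered for o3, o4-mini and gpt-5 models
    return requested


assert resolve_service_tier("gpt-5-mini", "flex") == "flex"
assert resolve_service_tier("gpt-4.1", "flex") == "auto"
assert resolve_service_tier("o3", "priority") == "priority"
```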
src/lm_deluge/batches.py

@@ -3,7 +3,7 @@ import json
 import os
 import tempfile
 import time
-from typing import Literal, Sequence
+from typing import Literal, Sequence, cast

 import aiohttp
 from rich.console import Console
@@ -16,7 +16,12 @@ from lm_deluge.api_requests.anthropic import _build_anthropic_request
 from lm_deluge.api_requests.openai import _build_oa_chat_request
 from lm_deluge.config import SamplingParams
 from lm_deluge.models import APIModel, registry
-from lm_deluge.prompt import
+from lm_deluge.prompt import (
+    CachePattern,
+    Conversation,
+    Prompt,
+    prompts_to_conversations,
+)
 from lm_deluge.request_context import RequestContext


@@ -166,14 +171,18 @@ async def _submit_anthropic_batch(file_path: str, headers: dict, model: str):
 async def create_batch_files_oa(
     model: str,
     sampling_params: SamplingParams,
-    prompts:
+    prompts: Prompt | Sequence[Prompt],
     batch_size: int = 50_000,
     destination: str | None = None,  # if none provided, temp files
 ):
     MAX_BATCH_SIZE_BYTES = 200 * 1024 * 1024  # 200MB
     MAX_BATCH_SIZE_ITEMS = batch_size

-
+    if not isinstance(prompts, list):
+        prompts = cast(Sequence[Prompt], [prompts])
+
+    prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))
+    assert isinstance(prompts, Sequence)
     if any(p is None for p in prompts):
         raise ValueError("All prompts must be valid.")

@@ -251,14 +260,18 @@ async def create_batch_files_oa(
 async def submit_batches_oa(
     model: str,
     sampling_params: SamplingParams,
-    prompts:
+    prompts: Prompt | Sequence[Prompt],
     batch_size: int = 50_000,
 ):
     """Write OpenAI batch requests to a file and submit."""
     MAX_BATCH_SIZE_BYTES = 200 * 1024 * 1024  # 200MB
     MAX_BATCH_SIZE_ITEMS = batch_size

-
+    if not isinstance(prompts, list):
+        prompts = prompts = cast(Sequence[Prompt], [prompts])
+
+    prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))
+    assert isinstance(prompts, Sequence)
     if any(p is None for p in prompts):
         raise ValueError("All prompts must be valid.")

@@ -342,7 +355,7 @@ async def submit_batches_oa(
 async def submit_batches_anthropic(
     model: str,
     sampling_params: SamplingParams,
-    prompts:
+    prompts: Prompt | Sequence[Prompt],
     *,
     cache: CachePattern | None = None,
     batch_size=100_000,
@@ -362,13 +375,16 @@ async def submit_batches_anthropic(
     MAX_BATCH_SIZE_ITEMS = batch_size

     # Convert prompts to Conversations
-
+    if not isinstance(prompts, list):
+        prompts = prompts = cast(Sequence[Prompt], [prompts])
+
+    prompts = prompts_to_conversations(cast(Sequence[Prompt], prompts))

     request_headers = None
     batch_tasks = []
     current_batch = []
     current_batch_size = 0
-
+    assert isinstance(prompts, Sequence)
     for idx, prompt in enumerate(prompts):
         assert isinstance(prompt, Conversation)
         context = RequestContext(
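With the signature change above, the batch helpers accept either a single prompt or a sequence of prompts. A hedged usage sketch follows; the model name is a placeholder that must exist in the lm_deluge model registry, and a default-constructible SamplingParams plus plain-string prompts are assumptions based on the Prompt union, not confirmed by this diff.

```python
# Usage sketch only -- the model name, default SamplingParams(), and string
# prompts are assumptions; check lm_deluge.models and lm_deluge.prompt.
import asyncio

from lm_deluge.batches import submit_batches_oa
from lm_deluge.config import SamplingParams


async def main() -> None:
    # A single prompt is wrapped in a list and run through
    # prompts_to_conversations() before batch files are written.
    await submit_batches_oa(
        model="gpt-4.1-mini",
        sampling_params=SamplingParams(),
        prompts="Summarize the plot of Hamlet in one sentence.",
    )

    # A sequence is split into batch files of at most `batch_size` items
    # (and at most 200MB each).
    await submit_batches_oa(
        model="gpt-4.1-mini",
        sampling_params=SamplingParams(),
        prompts=["First prompt.", "Second prompt."],
        batch_size=10_000,
    )


# asyncio.run(main())
```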