lm-deluge 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lm-deluge has been flagged as potentially problematic by the registry.
- lm_deluge/api_requests/anthropic.py +24 -9
- lm_deluge/api_requests/base.py +40 -16
- lm_deluge/api_requests/bedrock.py +26 -13
- lm_deluge/api_requests/mistral.py +14 -7
- lm_deluge/api_requests/openai.py +13 -7
- lm_deluge/client.py +10 -1
- lm_deluge/prompt.py +87 -1
- lm_deluge/usage.py +114 -0
- {lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/METADATA +74 -16
- {lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/RECORD +13 -12
- {lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/top_level.txt +0 -0
lm_deluge/api_requests/anthropic.py
CHANGED

@@ -6,7 +6,15 @@ import warnings
 from tqdm import tqdm
 from typing import Callable

-from lm_deluge.prompt import …
+from lm_deluge.prompt import (
+    Conversation,
+    Message,
+    Text,
+    ToolCall,
+    Thinking,
+    CachePattern,
+)
+from lm_deluge.usage import Usage
 from .base import APIRequestBase, APIResponse

 from ..tracker import StatusTracker
@@ -35,6 +43,7 @@ class AnthropicRequest(APIRequestBase):
         all_model_names: list[str] | None = None,
         all_sampling_params: list[SamplingParams] | None = None,
         tools: list | None = None,
+        cache: CachePattern | None = None,
     ):
         super().__init__(
             task_id=task_id,
@@ -52,11 +61,16 @@ class AnthropicRequest(APIRequestBase):
             all_model_names=all_model_names,
             all_sampling_params=all_sampling_params,
             tools=tools,
+            cache=cache,
         )
         self.model = APIModel.from_registry(model_name)
         self.url = f"{self.model.api_base}/messages"

-        …
+        # Lock images as bytes if caching is enabled
+        if cache is not None:
+            prompt.lock_images_as_bytes()
+
+        self.system_message, messages = prompt.to_anthropic(cache_pattern=cache)
         self.request_header = {
             "x-api-key": os.getenv(self.model.api_key_env_var),
             "anthropic-version": "2023-06-01",
@@ -97,15 +111,18 @@ class AnthropicRequest(APIRequestBase):
         if self.system_message is not None:
             self.request_json["system"] = self.system_message
         if tools:
-            …
+            tool_definitions = [tool.dump_for("anthropic") for tool in tools]
+            # Add cache control to last tool if tools_only caching is specified
+            if cache == "tools_only" and tool_definitions:
+                tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
+            self.request_json["tools"] = tool_definitions

     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
         is_error = False
         error_message = None
         thinking = None
         content = None
-        …
-        output_tokens = None
+        usage = None
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)
         rate_limits = {}
@@ -143,8 +160,7 @@ class AnthropicRequest(APIRequestBase):
             )

             content = Message("assistant", parts)
-            …
-            output_tokens = data["usage"]["output_tokens"]
+            usage = Usage.from_anthropic_usage(data["usage"])
         except Exception as e:
             is_error = True
             error_message = (
@@ -182,6 +198,5 @@ class AnthropicRequest(APIRequestBase):
             thinking=thinking,
             model_internal=self.model_name,
             sampling_params=self.sampling_params,
-            …
-            output_tokens=output_tokens,
+            usage=usage,
         )
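For reference, the wiring above maps Anthropic's usage payload onto the shared `Usage` object rather than tracking a bare `output_tokens` count. A minimal sketch of that mapping, using only the `Usage.from_anthropic_usage` constructor added in `lm_deluge/usage.py` further down this diff (the payload values are made up for illustration):

```python
from lm_deluge.usage import Usage

# Shape of the "usage" block in an Anthropic /messages response (illustrative values)
anthropic_usage = {
    "input_tokens": 12,
    "output_tokens": 250,
    "cache_read_input_tokens": 2048,   # tokens served from the prompt cache
    "cache_creation_input_tokens": 0,  # tokens newly written to the cache
}

usage = Usage.from_anthropic_usage(anthropic_usage)
print(usage.cache_read_tokens)   # 2048
print(usage.has_cache_hit)       # True
print(usage.total_input_tokens)  # 12 + 2048 + 0 = 2060
```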
lm_deluge/api_requests/base.py
CHANGED

@@ -7,7 +7,8 @@ from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from typing import Callable

-from lm_deluge.prompt import Conversation, Message
+from lm_deluge.prompt import Conversation, Message, CachePattern
+from lm_deluge.usage import Usage

 from ..tracker import StatusTracker
 from ..sampling_params import SamplingParams
@@ -29,9 +30,8 @@ class APIResponse:
     is_error: bool | None
     error_message: str | None

-    # completion information
-    …
-    output_tokens: int | None
+    # completion information - unified usage tracking
+    usage: Usage | None = None

     # response content - structured format
     content: Message | None = None
@@ -56,6 +56,26 @@ class APIResponse:
             return self.content.completion
         return None

+    @property
+    def input_tokens(self) -> int | None:
+        """Get input tokens from usage object."""
+        return self.usage.input_tokens if self.usage else None
+
+    @property
+    def output_tokens(self) -> int | None:
+        """Get output tokens from usage object."""
+        return self.usage.output_tokens if self.usage else None
+
+    @property
+    def cache_read_tokens(self) -> int | None:
+        """Get cache read tokens from usage object."""
+        return self.usage.cache_read_tokens if self.usage else None
+
+    @property
+    def cache_write_tokens(self) -> int | None:
+        """Get cache write tokens from usage object."""
+        return self.usage.cache_write_tokens if self.usage else None
+
     def __post_init__(self):
         # calculate cost & get external model name
         self.id = int(self.id)
@@ -63,14 +83,13 @@ class APIResponse:
         self.model_external = api_model.name
         self.cost = None
         if (
-            self.…
-            and self.output_tokens is not None
+            self.usage is not None
             and api_model.input_cost is not None
             and api_model.output_cost is not None
         ):
             self.cost = (
-                self.input_tokens * api_model.input_cost / 1e6
-                + self.output_tokens * api_model.output_cost / 1e6
+                self.usage.input_tokens * api_model.input_cost / 1e6
+                + self.usage.output_tokens * api_model.output_cost / 1e6
             )
         elif self.content is not None and self.completion is not None:
             print(
@@ -90,8 +109,7 @@ class APIResponse:
             "error_message": self.error_message,
             "completion": self.completion,  # computed property
             "content": self.content.to_log() if self.content else None,
-            "…
-            "output_tokens": self.output_tokens,
+            "usage": self.usage.to_dict() if self.usage else None,
             "finish_reason": self.finish_reason,
             "cost": self.cost,
         }
@@ -107,6 +125,10 @@ class APIResponse:
             # Backward compatibility: create a Message with just text
             content = Message.ai(data["completion"])

+        usage = None
+        if "usage" in data and data["usage"] is not None:
+            usage = Usage.from_dict(data["usage"])
+
         return cls(
             id=data.get("id", random.randint(0, 1_000_000_000)),
             model_internal=data["model_internal"],
@@ -115,8 +137,7 @@ class APIResponse:
             status_code=data["status_code"],
             is_error=data["is_error"],
             error_message=data["error_message"],
-            …
-            output_tokens=data["output_tokens"],
+            usage=usage,
             content=content,
             thinking=data.get("thinking"),
             model_external=data.get("model_external"),
@@ -168,6 +189,7 @@ class APIRequestBase(ABC):
         all_model_names: list[str] | None = None,
         all_sampling_params: list[SamplingParams] | None = None,
         tools: list | None = None,
+        cache: CachePattern | None = None,
     ):
         if all_model_names is None:
             raise ValueError("all_model_names must be provided.")
@@ -190,6 +212,7 @@ class APIRequestBase(ABC):
         self.all_model_names = all_model_names
         self.all_sampling_params = all_sampling_params
         self.tools = tools
+        self.cache: CachePattern | None = cache
         self.result = []  # list of APIResponse objects from each attempt

         # these should be set in the __init__ of the subclass
@@ -280,6 +303,7 @@ class APIRequestBase(ABC):
             all_model_names=self.all_model_names,
             all_sampling_params=self.all_sampling_params,
             tools=self.tools,
+            cache=self.cache,
         )
         # PROBLEM: new request is never put into results array, so we can't get the result.
         self.retry_queue.put_nowait(new_request)
@@ -323,8 +347,7 @@ class APIRequestBase(ABC):
                 is_error=True,
                 error_message="Request timed out (terminated by client).",
                 content=None,
-                …
-                output_tokens=None,
+                usage=None,
             )
         )
         self.handle_error(create_new_request=False)
@@ -341,8 +364,7 @@ class APIRequestBase(ABC):
                 is_error=True,
                 error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
                 content=None,
-                …
-                output_tokens=None,
+                usage=None,
             )
         )
         # maybe consider making True?
@@ -370,6 +392,7 @@ def create_api_request(
     all_model_names: list[str] | None = None,
     all_sampling_params: list[SamplingParams] | None = None,
     tools: list | None = None,
+    cache: CachePattern | None = None,
 ) -> APIRequestBase:
     from .common import CLASSES  # circular import so made it lazy, does this work?

@@ -395,5 +418,6 @@ def create_api_request(
         all_model_names=all_model_names,
         all_sampling_params=all_sampling_params,
         tools=tools,
+        cache=cache,
         **kwargs,
     )
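With `APIResponse.usage` in place, the old `input_tokens`/`output_tokens` fields survive as read-only properties, so existing callers keep working. A rough sketch of reading them off a response returned by the client, following the README examples later in this diff (the prompt text is illustrative):

```python
from lm_deluge import LLMClient

client = LLMClient.basic("claude-3-haiku")
resps = client.process_prompts_sync(["Summarize Hamlet in one sentence."])

resp = resps[0]
if resp is not None and not resp.is_error:
    # backward-compatible properties, now derived from resp.usage
    print(resp.input_tokens, resp.output_tokens)
    # new cache fields; None for providers without prompt caching
    print(resp.cache_read_tokens, resp.cache_write_tokens)
    # cost is computed in __post_init__ from usage and per-million-token prices
    print(resp.cost)
```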
lm_deluge/api_requests/bedrock.py
CHANGED

@@ -12,7 +12,15 @@ except ImportError:
         "aws4auth is required for bedrock support. Install with: pip install requests-aws4auth"
     )

-from lm_deluge.prompt import …
+from lm_deluge.prompt import (
+    Conversation,
+    Message,
+    Text,
+    ToolCall,
+    Thinking,
+    CachePattern,
+)
+from lm_deluge.usage import Usage
 from .base import APIRequestBase, APIResponse

 from ..tracker import StatusTracker
@@ -38,6 +46,7 @@ class BedrockRequest(APIRequestBase):
         all_model_names: list[str] | None = None,
         all_sampling_params: list[SamplingParams] | None = None,
         tools: list | None = None,
+        cache: CachePattern | None = None,
     ):
         super().__init__(
             task_id=task_id,
@@ -55,8 +64,13 @@ class BedrockRequest(APIRequestBase):
             all_model_names=all_model_names,
             all_sampling_params=all_sampling_params,
             tools=tools,
+            cache=cache,
         )

+        # Lock images as bytes if caching is enabled
+        if cache is not None:
+            prompt.lock_images_as_bytes()
+
         self.model = APIModel.from_registry(model_name)

         # Get AWS credentials from environment
@@ -87,7 +101,7 @@ class BedrockRequest(APIRequestBase):
         self.url = f"https://bedrock-runtime.{self.region}.amazonaws.com/model/{self.model.name}/invoke"

         # Convert prompt to Anthropic format for bedrock
-        self.system_message, messages = prompt.to_anthropic()
+        self.system_message, messages = prompt.to_anthropic(cache_pattern=cache)

         # Prepare request body in Anthropic's bedrock format
         self.request_json = {
@@ -102,7 +116,11 @@ class BedrockRequest(APIRequestBase):
             self.request_json["system"] = self.system_message

         if tools:
-            …
+            tool_definitions = [tool.dump_for("anthropic") for tool in tools]
+            # Add cache control to last tool if tools_only caching is specified
+            if cache == "tools_only" and tool_definitions:
+                tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
+            self.request_json["tools"] = tool_definitions

         # Setup AWS4Auth for signing
         self.auth = AWS4Auth(
@@ -179,8 +197,7 @@ class BedrockRequest(APIRequestBase):
                 is_error=True,
                 error_message="Request timed out (terminated by client).",
                 content=None,
-                …
-                output_tokens=None,
+                usage=None,
             )
         )
         self.handle_error(create_new_request=False)
@@ -199,8 +216,7 @@ class BedrockRequest(APIRequestBase):
                 is_error=True,
                 error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
                 content=None,
-                …
-                output_tokens=None,
+                usage=None,
             )
         )
         self.handle_error(create_new_request=False)
@@ -210,8 +226,7 @@ class BedrockRequest(APIRequestBase):
         error_message = None
         thinking = None
         content = None
-        …
-        output_tokens = None
+        usage = None
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)

@@ -238,8 +253,7 @@ class BedrockRequest(APIRequestBase):
             )

             content = Message("assistant", parts)
-            …
-            output_tokens = data["usage"]["output_tokens"]
+            usage = Usage.from_anthropic_usage(data["usage"])
         except Exception as e:
             is_error = True
             error_message = (
@@ -278,6 +292,5 @@ class BedrockRequest(APIRequestBase):
             model_internal=self.model_name,
             region=self.region,
             sampling_params=self.sampling_params,
-            …
-            output_tokens=output_tokens,
+            usage=usage,
         )
lm_deluge/api_requests/mistral.py
CHANGED

@@ -7,7 +7,8 @@ from tqdm.auto import tqdm
 from typing import Callable

 from .base import APIRequestBase, APIResponse
-from ..prompt import Conversation, Message
+from ..prompt import Conversation, Message, CachePattern
+from ..usage import Usage
 from ..tracker import StatusTracker
 from ..sampling_params import SamplingParams
 from ..models import APIModel
@@ -35,6 +36,7 @@ class MistralRequest(APIRequestBase):
         all_model_names: list[str] | None = None,
         all_sampling_params: list[SamplingParams] | None = None,
         tools: list | None = None,
+        cache: CachePattern | None = None,
     ):
         super().__init__(
             task_id=task_id,
@@ -53,7 +55,15 @@ class MistralRequest(APIRequestBase):
             debug=debug,
             all_model_names=all_model_names,
             all_sampling_params=all_sampling_params,
+            tools=tools,
+            cache=cache,
         )
+
+        # Warn if cache is specified for non-Anthropic model
+        if cache is not None:
+            warnings.warn(
+                f"Cache parameter '{cache}' is only supported for Anthropic models, ignoring for {model_name}"
+            )
         self.model = APIModel.from_registry(model_name)
         self.url = f"{self.model.api_base}/chat/completions"
         self.request_header = {
@@ -81,8 +91,7 @@ class MistralRequest(APIRequestBase):
         is_error = False
         error_message = None
         completion = None
-        …
-        output_tokens = None
+        usage = None
         logprobs = None
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)
@@ -99,8 +108,7 @@ class MistralRequest(APIRequestBase):
         assert data is not None, "data is None"
         try:
             completion = data["choices"][0]["message"]["content"]
-            …
-            output_tokens = data["usage"]["completion_tokens"]
+            usage = Usage.from_mistral_usage(data["usage"])
             if self.logprobs and "logprobs" in data["choices"][0]:
                 logprobs = data["choices"][0]["logprobs"]["content"]
         except Exception:
@@ -134,6 +142,5 @@ class MistralRequest(APIRequestBase):
             content=Message.ai(completion),
             model_internal=self.model_name,
             sampling_params=self.sampling_params,
-            …
-            output_tokens=output_tokens,
+            usage=usage,
         )
lm_deluge/api_requests/openai.py
CHANGED

@@ -7,7 +7,8 @@ from tqdm.auto import tqdm
 from typing import Callable

 from .base import APIRequestBase, APIResponse
-from ..prompt import Conversation, Message, Text, ToolCall, Thinking
+from ..prompt import Conversation, Message, Text, ToolCall, Thinking, CachePattern
+from ..usage import Usage
 from ..tracker import StatusTracker
 from ..sampling_params import SamplingParams
 from ..models import APIModel
@@ -35,6 +36,7 @@ class OpenAIRequest(APIRequestBase):
         all_model_names: list[str] | None = None,
         all_sampling_params: list[SamplingParams] | None = None,
         tools: list | None = None,
+        cache: CachePattern | None = None,
     ):
         super().__init__(
             task_id=task_id,
@@ -54,7 +56,14 @@ class OpenAIRequest(APIRequestBase):
             all_model_names=all_model_names,
             all_sampling_params=all_sampling_params,
             tools=tools,
+            cache=cache,
         )
+
+        # Warn if cache is specified for non-Anthropic model
+        if cache is not None:
+            warnings.warn(
+                f"Cache parameter '{cache}' is only supported for Anthropic models, ignoring for {model_name}"
+            )
         self.model = APIModel.from_registry(model_name)
         self.url = f"{self.model.api_base}/chat/completions"
         self.request_header = {
@@ -97,8 +106,7 @@ class OpenAIRequest(APIRequestBase):
         error_message = None
         thinking = None
         content = None
-        …
-        output_tokens = None
+        usage = None
         logprobs = None
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)
@@ -142,8 +150,7 @@ class OpenAIRequest(APIRequestBase):

             content = Message("assistant", parts)

-            …
-            output_tokens = data["usage"]["completion_tokens"]
+            usage = Usage.from_openai_usage(data["usage"])
             if self.logprobs and "logprobs" in data["choices"][0]:
                 logprobs = data["choices"][0]["logprobs"]["content"]
         except Exception:
@@ -178,6 +185,5 @@ class OpenAIRequest(APIRequestBase):
             content=content,
             model_internal=self.model_name,
             sampling_params=self.sampling_params,
-            …
-            output_tokens=output_tokens,
+            usage=usage,
         )
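Note that the OpenAI and Mistral request classes accept the new `cache` argument only so the call signature stays uniform; they emit a warning and ignore it. A small sketch of what that looks like from the caller's side (the model name comes from the README; the warning text is the one added above):

```python
from lm_deluge import LLMClient

# Cache patterns only take effect for Anthropic models. For an OpenAI-backed
# model the request class warns and sends the request without cache_control:
#   UserWarning: Cache parameter 'system_and_tools' is only supported for
#   Anthropic models, ignoring for gpt-4o-mini
client = LLMClient.basic("gpt-4o-mini")
resps = client.process_prompts_sync(["Hello!"], cache="system_and_tools")
print(resps[0].completion)
```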
lm_deluge/client.py
CHANGED

@@ -8,7 +8,7 @@ from dataclasses import dataclass
 from typing import Sequence, overload, Literal, Any
 from tqdm.auto import tqdm

-from lm_deluge.prompt import Conversation
+from lm_deluge.prompt import Conversation, CachePattern
 from lm_deluge.tool import Tool

 from .tracker import StatusTracker
@@ -232,6 +232,7 @@ class LLMClient:
         dry_run: Literal[True],
         verbose: bool = ...,
         tools: list[Tool] | None = ...,
+        cache: CachePattern | None = ...,
     ) -> dict[str, int]: ...

     @overload
@@ -244,6 +245,7 @@ class LLMClient:
         dry_run: bool = ...,
         verbose: bool = ...,
         tools: list[Tool] | None = ...,
+        cache: CachePattern | None = ...,
     ) -> list[str | None]: ...

     @overload
@@ -256,6 +258,7 @@ class LLMClient:
         dry_run: bool = ...,
         verbose: bool = ...,
         tools: list[Tool] | None = ...,
+        cache: CachePattern | None = ...,
     ) -> list[APIResponse | None]: ...

     async def process_prompts_async(
@@ -267,6 +270,7 @@ class LLMClient:
         dry_run: bool = False,
         verbose: bool = False,
         tools: list[Tool] | None = None,
+        cache: CachePattern | None = None,
     ) -> list[APIResponse | None] | list[str | None] | dict[str, int]:
         # if prompts are not Conversations, convert them.
         # can only handle strings for now
@@ -345,6 +349,7 @@ class LLMClient:
                 use_qps=self.use_qps,
                 verbose=verbose,
                 tools=tools,
+                cache=cache,
             )
         )
         api_results: list[APIResponse] = await api_task
@@ -373,6 +378,7 @@ class LLMClient:
         dry_run: bool = False,
         verbose: bool = False,
         tools: list[Tool] | None = None,
+        cache: CachePattern | None = None,
     ):
         return asyncio.run(
             self.process_prompts_async(
@@ -382,6 +388,7 @@ class LLMClient:
                 dry_run=dry_run,
                 verbose=verbose,
                 tools=tools,
+                cache=cache,
             )
         )

@@ -570,6 +577,7 @@ async def process_api_prompts_async(
     use_qps: bool = False,
     verbose: bool = False,
     tools: list[Tool] | None = None,
+    cache: CachePattern | None = None,
 ):
     """Processes API requests in parallel, throttling to stay under rate limits."""
     # change ids to integer list
@@ -654,6 +662,7 @@ async def process_api_prompts_async(
             all_model_names=models,
             all_sampling_params=sampling_params,
             tools=tools,
+            cache=cache,
         )
         status_tracker.num_tasks_started += 1
         status_tracker.num_tasks_in_progress += 1
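The `cache` argument is threaded from the public entry points down through `process_api_prompts_async` to `create_api_request`, so it can differ per call. A short sketch of the async path (the prompt text and repetition count are illustrative):

```python
import asyncio

from lm_deluge import LLMClient


async def main():
    client = LLMClient.basic("claude-3-5-sonnet")
    # cache is forwarded to every AnthropicRequest created for this batch
    resps = await client.process_prompts_async(
        ["Explain asyncio.gather in two sentences."] * 3,
        cache="last_user_message",
    )
    for resp in resps:
        if resp is not None:
            print(resp.completion, resp.usage)


asyncio.run(main())
```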
lm_deluge/prompt.py
CHANGED

@@ -8,6 +8,14 @@ from typing import Literal
 from lm_deluge.models import APIModel
 from lm_deluge.image import Image

+CachePattern = Literal[
+    "tools_only",
+    "system_and_tools",
+    "last_user_message",
+    "last_2_user_messages",
+    "last_3_user_messages",
+]
+
 ###############################################################################
 # 1. Low-level content blocks – either text or an image #
 ###############################################################################
@@ -516,7 +524,9 @@ class Conversation:
         # OpenAI Responses = single “input” array, role must be user/assistant
         return {"input": [m.oa_resp() for m in self.messages if m.role != "system"]}

-    def to_anthropic(…
+    def to_anthropic(
+        self, cache_pattern: CachePattern | None = None
+    ) -> tuple[str | list[dict] | None, list[dict]]:
         system_msg = next(
             (
                 m.parts[0].text
@@ -535,8 +545,84 @@ class Conversation:
                 other.append(user_msg.anthropic())
             else:
                 other.append(m.anthropic())
+
+        # Apply cache control if specified
+        if cache_pattern is not None:
+            system_msg, other = self._apply_cache_control(
+                system_msg, other, cache_pattern
+            )
+
         return system_msg, other

+    def _apply_cache_control(
+        self,
+        system_msg: str | None | list[dict],
+        messages: list[dict],
+        cache_pattern: CachePattern,
+    ) -> tuple[str | list[dict] | None, list[dict]]:
+        """Apply cache control to system message and/or messages based on the pattern."""
+
+        if cache_pattern == "system_and_tools" and system_msg is not None:
+            # Convert system message to structured format with cache control
+            # This caches tools+system prefix (since system comes after tools)
+            system_msg = [
+                {
+                    "type": "text",
+                    "text": system_msg,
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ]
+
+        if cache_pattern == "last_user_message":
+            # Cache the last user message
+            user_messages = [i for i, m in enumerate(messages) if m["role"] == "user"]
+            if user_messages:
+                last_user_idx = user_messages[-1]
+                self._add_cache_control_to_message(messages[last_user_idx])
+
+        elif cache_pattern == "last_2_user_messages":
+            # Cache the last 2 user messages
+            user_messages = [i for i, m in enumerate(messages) if m["role"] == "user"]
+            for idx in user_messages[-2:]:
+                self._add_cache_control_to_message(messages[idx])
+
+        elif cache_pattern == "last_3_user_messages":
+            # Cache the last 3 user messages
+            user_messages = [i for i, m in enumerate(messages) if m["role"] == "user"]
+            for idx in user_messages[-3:]:
+                self._add_cache_control_to_message(messages[idx])
+
+        return system_msg, messages
+
+    def lock_images_as_bytes(self) -> "Conversation":
+        """
+        Convert all images to bytes format to ensure they remain unchanged for caching.
+        This should be called when caching is enabled to prevent cache invalidation
+        from image reference changes.
+        """
+        for message in self.messages:
+            for part in message.parts:
+                if isinstance(part, Image):
+                    # Force conversion to bytes if not already
+                    part.data = part._bytes()
+        return self
+
+    def _add_cache_control_to_message(self, message: dict) -> None:
+        """Add cache control to a message's content."""
+        content = message["content"]
+        if isinstance(content, str):
+            # Convert string content to structured format with cache control
+            message["content"] = [
+                {
+                    "type": "text",
+                    "text": content,
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ]
+        elif isinstance(content, list) and content:
+            # Add cache control to the last content block
+            content[-1]["cache_control"] = {"type": "ephemeral"}
+
     def to_gemini(self) -> tuple[str | None, list[dict]]:
         system_msg = next(
             (
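The cache patterns are applied purely at serialization time, so their effect is easiest to see on the output of `Conversation.to_anthropic`. A minimal sketch, assuming the `Conversation.system(...).add(Message.user(...))` builder shown in the README later in this diff:

```python
from lm_deluge.prompt import Conversation, Message

conv = Conversation.system("You are a terse assistant.").add(
    Message.user("What is prompt caching?")
)

# Without a pattern, nothing is marked for caching.
system_msg, messages = conv.to_anthropic()

# With a pattern, the selected blocks gain cache_control markers: here the last
# user message's final content block ends up carrying
#   {"cache_control": {"type": "ephemeral"}}
# while cache_pattern="system_and_tools" instead turns the system prompt into a
# list of text blocks with the same marker.
system_msg, messages = conv.to_anthropic(cache_pattern="last_user_message")
```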
lm_deluge/usage.py
ADDED

@@ -0,0 +1,114 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class Usage:
+    """
+    Unified usage tracking for all API providers.
+
+    Tracks token usage including cache hits and writes for providers that support it.
+    For providers that don't support caching, cache_read and cache_write will be None.
+    """
+
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cache_read_tokens: Optional[int] = None  # Tokens read from cache (Anthropic)
+    cache_write_tokens: Optional[int] = None  # Tokens written to cache (Anthropic)
+
+    @property
+    def total_input_tokens(self) -> int:
+        """Total input tokens including both fresh input, cache writes, and cache reads."""
+        result = self.input_tokens
+        if self.cache_read_tokens is not None:
+            result += self.cache_read_tokens
+        if self.cache_write_tokens is not None:
+            result += self.cache_write_tokens
+        return result
+
+    @property
+    def total_tokens(self) -> int:
+        """Total tokens processed (input + output)."""
+        return self.total_input_tokens + self.output_tokens
+
+    @property
+    def has_cache_hit(self) -> bool:
+        """Whether this request had any cache hits."""
+        return self.cache_read_tokens is not None and self.cache_read_tokens > 0
+
+    @property
+    def has_cache_write(self) -> bool:
+        """Whether this request wrote to cache."""
+        return self.cache_write_tokens is not None and self.cache_write_tokens > 0
+
+    @classmethod
+    def from_anthropic_usage(cls, usage_data: dict) -> "Usage":
+        """Create Usage from Anthropic API response usage data."""
+        return cls(
+            input_tokens=usage_data.get("input_tokens", 0),
+            output_tokens=usage_data.get("output_tokens", 0),
+            cache_read_tokens=usage_data.get("cache_read_input_tokens"),
+            cache_write_tokens=usage_data.get("cache_creation_input_tokens"),
+        )
+
+    @classmethod
+    def from_openai_usage(cls, usage_data: dict) -> "Usage":
+        """Create Usage from OpenAI API response usage data."""
+        return cls(
+            input_tokens=usage_data.get("prompt_tokens", 0),
+            output_tokens=usage_data.get("completion_tokens", 0),
+            cache_read_tokens=None,  # OpenAI doesn't support caching yet
+            cache_write_tokens=None,
+        )
+
+    @classmethod
+    def from_mistral_usage(cls, usage_data: dict) -> "Usage":
+        """Create Usage from Mistral API response usage data."""
+        return cls(
+            input_tokens=usage_data.get("prompt_tokens", 0),
+            output_tokens=usage_data.get("completion_tokens", 0),
+            cache_read_tokens=None,  # Mistral doesn't support caching
+            cache_write_tokens=None,
+        )
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary for serialization."""
+        return {
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "cache_read_tokens": self.cache_read_tokens,
+            "cache_write_tokens": self.cache_write_tokens,
+            "total_input_tokens": self.total_input_tokens,
+            "total_tokens": self.total_tokens,
+            "has_cache_hit": self.has_cache_hit,
+            "has_cache_write": self.has_cache_write,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "Usage":
+        """Create Usage from dictionary."""
+        return cls(
+            input_tokens=data.get("input_tokens", 0),
+            output_tokens=data.get("output_tokens", 0),
+            cache_read_tokens=data.get("cache_read_tokens"),
+            cache_write_tokens=data.get("cache_write_tokens"),
+        )
+
+    def __add__(self, other: "Usage") -> "Usage":
+        """Add two Usage objects together."""
+        return Usage(
+            input_tokens=self.input_tokens + other.input_tokens,
+            output_tokens=self.output_tokens + other.output_tokens,
+            cache_read_tokens=(
+                (self.cache_read_tokens or 0) + (other.cache_read_tokens or 0)
+                if self.cache_read_tokens is not None
+                or other.cache_read_tokens is not None
+                else None
+            ),
+            cache_write_tokens=(
+                (self.cache_write_tokens or 0) + (other.cache_write_tokens or 0)
+                if self.cache_write_tokens is not None
+                or other.cache_write_tokens is not None
+                else None
+            ),
+        )
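Because `Usage` implements `__add__`, per-request usage can be rolled up into a batch total. A small self-contained sketch using only the methods defined above (all token counts are illustrative):

```python
from lm_deluge.usage import Usage

per_request = [
    Usage(input_tokens=100, output_tokens=40),
    Usage(input_tokens=10, output_tokens=35, cache_read_tokens=900),
    Usage(input_tokens=800, output_tokens=50, cache_write_tokens=800),
]

total = per_request[0]
for u in per_request[1:]:
    total = total + u

print(total.total_input_tokens)  # 100 + 10 + 800 + 900 + 800 = 2610
print(total.total_tokens)        # 2610 + 125 = 2735
print(total.has_cache_hit)       # True, since 900 cached tokens were read

# to_dict/from_dict round-trips the four stored fields, which is how the
# usage block is serialized on APIResponse
restored = Usage.from_dict(total.to_dict())
assert restored.cache_read_tokens == 900
```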
{lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.11
+Version: 0.0.12
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
@@ -47,7 +47,7 @@ Dynamic: license-file
 pip install lm-deluge
 ```

-The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

 ## Quickstart

@@ -63,7 +63,7 @@ print(resp[0].completion)

 ## Spraying Across Models

-To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

 ```python
 from lm_deluge import LLMClient
@@ -84,7 +84,7 @@ API calls can be customized in a few ways.

 1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
 2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
-3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

 Putting it all together:

@@ -123,7 +123,9 @@ resps = client.process_prompts_sync([prompt])

 This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

-…
+See a full multi-turn chat example in `examples/multiturn.md`.
+
+## Tool Use

 Define tools from Python functions and use them with any model:

@@ -135,27 +137,83 @@ def get_weather(city: str) -> str:

 tool = Tool.from_function(get_weather)
 client = LLMClient.basic("claude-3-haiku")
-…
-…
+resps = client.process_prompts_sync(
+    ["What's the weather in Paris?"],
+    tools=[tool]
+)

-…
+# you can iterate over the tool calls in the response automatically
+for tool_call in resps[0].tool_calls:
+    print(tool_call.name, tool_call.arguments)
+```

-…
+You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.

 ```python
 from lm_deluge import LLMClient, Tool

-# Connect to a local MCP server
-…
-…
-…
+# Connect to a local MCP server and get all of its tools
+filesystem_tools = Tool.from_mcp(
+    "filesystem",
+    command="npx",
+    args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+)
+
+# or load ALL the tools from a Claude Desktop like config
+config = {
+    "mcpServers": {
+        "exa": {
+            "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+        },
+        "zapier": {
+            "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+        }
+    }
+}
+all_tools = Tool.from_mcp_config(config)
+
+# let the model use the tools
+client = LLMClient.basic("gpt-4o-mini")
+resps = client.process_prompts_sync(
+    ["List the files in the current directory"],
+    tools=tools
+)
+
+# call the tools
+for tool_call in resps[0].tool_calls:
+    # this is dumb sorry will make it better
+    tool_to_call = [x for x in tools if x.name == tool_call.name][0]
+    tool_to_call.call(**tool_call.arguments)  # in async code, use .acall()
 ```

-…
+### Prompt Caching (Anthropic)
+
+For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
+
+```python
+from lm_deluge import LLMClient, Conversation, Message
+
+# Create a conversation with system message
+conv = (
+    Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+    .add(Message.user("How do I use asyncio.gather?"))
+)
+
+# Use prompt caching to cache system message and tools
+client = LLMClient.basic("claude-3-5-sonnet")
+resps = client.process_prompts_sync(
+    [conv],
+    cache="system_and_tools"  # Cache system message and any tools
+)
+```
+
+Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+## Local Caching

-`lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches.
+Besides caching from model providers (which provides cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.

-**IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+**IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

 ## Asynchronous Client
 Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
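One thing worth flagging about the README changes above: `cache` now means two different things depending on where it is passed. A rough sketch of the distinction, based only on the README text (the local cache object is a placeholder for one of the `lm_deluge.cache` implementations, whose exact class names are not shown in this diff):

```python
from lm_deluge import LLMClient

# 1) Local response cache: passed to the LLMClient constructor, e.g.
#    LLMClient(..., cache=my_local_cache), it re-uses results across separate
#    process_prompts_* calls with no provider involvement.
client = LLMClient.basic("claude-3-5-sonnet")

# 2) Provider-side prompt caching: passed per call, it adds Anthropic
#    cache_control markers so repeated context is billed at the cached rate.
resps = client.process_prompts_sync(
    ["Long shared context..."],
    cache="system_and_tools",
)
```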
{lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/RECORD
CHANGED

@@ -1,23 +1,24 @@
 lm_deluge/__init__.py,sha256=rndOr4Rcfnpttz-onWU3vVEm-MM0WDFgz6KexKPAx0k,222
 lm_deluge/cache.py,sha256=VB1kv8rM2t5XWPR60uhszFcxLDnVKOe1oA5hYjVDjIo,4375
-lm_deluge/client.py,sha256=…
+lm_deluge/client.py,sha256=yIC86BlH31oW1umrXnuOxaCGMGCbNVvGWXdz2yc6n1g,29296
 lm_deluge/embed.py,sha256=m-X8UK4gV9KKD7Wv3yarAceMQaj7gR1JwzD_sB0MOQY,13183
 lm_deluge/errors.py,sha256=oHjt7YnxWbh-eXMScIzov4NvpJMo0-2r5J6Wh5DQ1tk,209
 lm_deluge/gemini_limits.py,sha256=V9mpS9JtXYz7AY6OuKyQp5TuIMRH1BVv9YrSNmGmHNA,1569
 lm_deluge/image.py,sha256=hFbRajqEVQbkirAfOxsTPkeq-27Zl-so4AWBFeUbpBI,7161
 lm_deluge/models.py,sha256=oYrt0x0iVfTwoHjP-l1WWennzEDGwnZczj6ds6a6-xc,45406
-lm_deluge/prompt.py,sha256=…
+lm_deluge/prompt.py,sha256=O46eLd6e68_7mGHkCMwLitnsxE4NUvrd1X9vtb_Y7qc,28787
 lm_deluge/rerank.py,sha256=tW1c3gQCAqaF8Ez-r-4qxYAcdKqxnLMxwHApKOUKwk4,11289
 lm_deluge/sampling_params.py,sha256=E2kewh1vz-1Qcy5xNBCzihfGgT_GcHYMfzaWb3FLiXs,739
 lm_deluge/tool.py,sha256=5nFbHchv12C1jkL8nkEh6v9WfxpC0O6rALP25z60WsI,9476
 lm_deluge/tracker.py,sha256=Dk99scN_NeDEO0gkLO5efXiZq11Ga-k6cerUHWN7IWY,1292
+lm_deluge/usage.py,sha256=oS-rmF3ZJ1RMtR7WI6BB2uVOAjJg0scvGF3zZRahWVg,4449
 lm_deluge/api_requests/__init__.py,sha256=_aSpD6CJL9g6OpLPoChXiHjl4MH_OlGcKgfZaW8cgLM,71
-lm_deluge/api_requests/anthropic.py,sha256=…
-lm_deluge/api_requests/base.py,sha256=…
-lm_deluge/api_requests/bedrock.py,sha256=…
+lm_deluge/api_requests/anthropic.py,sha256=F1bDb1Pyzwq08LSd4K4Pu6bugi7IFpUf4tZNWySgdx0,7646
+lm_deluge/api_requests/base.py,sha256=h7AdWwd9sjZnBb2ETZmzC64E9fNYGp2vrOHGXXo8W2g,16803
+lm_deluge/api_requests/bedrock.py,sha256=vf1pkVv4wBqZX7iiBHqHzDEHTlYNEG07rfQr9XE8Pr0,10832
 lm_deluge/api_requests/common.py,sha256=U0mX_wC3Tzg2-1u9nYUCTQqYzuYJqvLrICCNW_dbbJM,287
-lm_deluge/api_requests/mistral.py,sha256=…
-lm_deluge/api_requests/openai.py,sha256=…
+lm_deluge/api_requests/mistral.py,sha256=DvyriHbUApNOEdyiyyn-_HJnTY3tpKty18uqJ8u6n5Y,5640
+lm_deluge/api_requests/openai.py,sha256=RjSq6LyDt9g7FASjBJd1JzyD09xz6D-abNO8wiELR1M,7552
 lm_deluge/api_requests/deprecated/bedrock.py,sha256=WrcIShCoO8JCUSlFOCHxg6KQCNTZfw3TpYTvSpYk4mA,11320
 lm_deluge/api_requests/deprecated/cohere.py,sha256=KgDScD6_bWhAzOY5BHZQKSA3kurt4KGENqC4wLsGmcU,5142
 lm_deluge/api_requests/deprecated/deepseek.py,sha256=FEApI93VAWDwuaqTooIyKMgONYqRhdUmiAPBRme-IYs,4582
@@ -31,8 +32,8 @@ lm_deluge/util/json.py,sha256=dCeG9j1D17rXmQJbKJH79X0CGof4Wlqd55TDg4D6ky8,5388
 lm_deluge/util/logprobs.py,sha256=UkBZakOxWluaLqHrjARu7xnJ0uCHVfLGHJdnYlEcutk,11768
 lm_deluge/util/validation.py,sha256=hz5dDb3ebvZrZhnaWxOxbNSVMI6nmaOODBkk0htAUhs,1575
 lm_deluge/util/xml.py,sha256=Ft4zajoYBJR3HHCt2oHwGfymGLdvp_gegVmJ-Wqk4Ck,10547
-lm_deluge-0.0.11.dist-info/…
-lm_deluge-0.0.11.dist-info/…
-lm_deluge-0.0.11.dist-info/…
-lm_deluge-0.0.11.dist-info/…
-lm_deluge-0.0.11.dist-info/…
+lm_deluge-0.0.12.dist-info/licenses/LICENSE,sha256=uNNXGXPCw2TC7CUs7SEBkA-Mz6QBQFWUUEWDMgEs1dU,1058
+lm_deluge-0.0.12.dist-info/METADATA,sha256=lpl7mGKp096-Ccp6kgd57vuvZOGXs8ska41z_RfvNls,11663
+lm_deluge-0.0.12.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+lm_deluge-0.0.12.dist-info/top_level.txt,sha256=hqU-TJX93yBwpgkDtYcXyLr3t7TLSCCZ_reytJjwBaE,10
+lm_deluge-0.0.12.dist-info/RECORD,,
{lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/WHEEL
File without changes

{lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/licenses/LICENSE
File without changes

{lm_deluge-0.0.11.dist-info → lm_deluge-0.0.12.dist-info}/top_level.txt
File without changes