lm-deluge 0.0.8__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lm-deluge might be problematic. Click here for more details.
- {lm_deluge-0.0.8/src/lm_deluge.egg-info → lm_deluge-0.0.12}/PKG-INFO +97 -8
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/README.md +95 -7
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/pyproject.toml +2 -1
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/anthropic.py +45 -14
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/base.py +77 -26
- lm_deluge-0.0.12/src/lm_deluge/api_requests/bedrock.py +296 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/common.py +2 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/mistral.py +16 -8
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/openai.py +49 -12
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/client.py +28 -4
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/models.py +125 -48
- lm_deluge-0.0.12/src/lm_deluge/prompt.py +779 -0
- lm_deluge-0.0.12/src/lm_deluge/tool.py +280 -0
- lm_deluge-0.0.12/src/lm_deluge/usage.py +114 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12/src/lm_deluge.egg-info}/PKG-INFO +97 -8
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/SOURCES.txt +9 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/requires.txt +1 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_all_models.py +8 -4
- lm_deluge-0.0.12/tests/test_bedrock_models.py +205 -0
- lm_deluge-0.0.12/tests/test_mcp_tools.py +221 -0
- lm_deluge-0.0.12/tests/test_prompt_caching.py +261 -0
- lm_deluge-0.0.12/tests/test_real_caching.py +305 -0
- lm_deluge-0.0.12/tests/test_real_caching_bedrock.py +307 -0
- lm_deluge-0.0.12/tests/test_tool_calls.py +401 -0
- lm_deluge-0.0.12/tests/test_tool_from_function.py +150 -0
- lm_deluge-0.0.8/src/lm_deluge/prompt.py +0 -357
- lm_deluge-0.0.8/src/lm_deluge/tool.py +0 -106
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/LICENSE +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/setup.cfg +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/__init__.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/gemini_limits.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/extract.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/sampling_params.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/tracker.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_cache.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_image_models.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_image_utils.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_json_utils.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_sampling_params.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_translate.py +0 -0
- {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_xml_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lm_deluge
|
|
3
|
-
Version: 0.0.8
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: Python utility for using LLM API models.
|
|
5
5
|
Author-email: Benjamin Anderson <ben@trytaylor.ai>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -21,6 +21,7 @@ Requires-Dist: bs4
|
|
|
21
21
|
Requires-Dist: lxml
|
|
22
22
|
Requires-Dist: pdf2image
|
|
23
23
|
Requires-Dist: pillow
|
|
24
|
+
Requires-Dist: fastmcp>=2.4
|
|
24
25
|
Requires-Dist: fasttext-wheel
|
|
25
26
|
Requires-Dist: fasttext-langdetect
|
|
26
27
|
Dynamic: license-file
|
|
@@ -32,6 +33,8 @@ Dynamic: license-file
|
|
|
32
33
|
- **Unified client** – Send prompts to all relevant models with a single client.
|
|
33
34
|
- **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
|
|
34
35
|
- **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
|
|
36
|
+
- **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from python functions.
|
|
37
|
+
- **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
|
|
35
38
|
- **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
|
|
36
39
|
- **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
|
|
37
40
|
- **Sync and async APIs** – Use the client from sync or async code.
|
|
@@ -44,7 +47,7 @@ Dynamic: license-file
|
|
|
44
47
|
pip install lm-deluge
|
|
45
48
|
```
|
|
46
49
|
|
|
47
|
-
The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
|
|
50
|
+
The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
|
|
48
51
|
|
|
49
52
|
## Quickstart
|
|
50
53
|
|
|
@@ -60,13 +63,13 @@ print(resp[0].completion)
|
|
|
60
63
|
|
|
61
64
|
## Spraying Across Models
|
|
62
65
|
|
|
63
|
-
To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
|
|
66
|
+
To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
|
|
64
67
|
|
|
65
68
|
```python
|
|
66
69
|
from lm_deluge import LLMClient
|
|
67
70
|
|
|
68
71
|
client = LLMClient.basic(
|
|
69
|
-
["gpt-4o-mini", "claude-haiku
|
|
72
|
+
["gpt-4o-mini", "claude-3-haiku"],
|
|
70
73
|
max_requests_per_minute=10_000
|
|
71
74
|
)
|
|
72
75
|
resps = client.process_prompts_sync(
|
|
@@ -81,7 +84,7 @@ API calls can be customized in a few ways.
|
|
|
81
84
|
|
|
82
85
|
1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
|
|
83
86
|
2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
|
|
84
|
-
3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
|
|
87
|
+
3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.
|
|
85
88
|
|
|
86
89
|
Putting it all together:
|
|
87
90
|
|
|
@@ -120,11 +123,97 @@ resps = client.process_prompts_sync([prompt])
|
|
|
120
123
|
|
|
121
124
|
This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.
|
|
122
125
|
|
|
123
|
-
|
|
126
|
+
See a full multi-turn chat example in `examples/multiturn.md`.
|
|
124
127
|
|
|
125
|
-
|
|
128
|
+
## Tool Use
|
|
126
129
|
|
|
127
|
-
|
|
130
|
+
Define tools from Python functions and use them with any model:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from lm_deluge import LLMClient, Tool
|
|
134
|
+
|
|
135
|
+
def get_weather(city: str) -> str:
|
|
136
|
+
return f"The weather in {city} is sunny and 72°F"
|
|
137
|
+
|
|
138
|
+
tool = Tool.from_function(get_weather)
|
|
139
|
+
client = LLMClient.basic("claude-3-haiku")
|
|
140
|
+
resps = client.process_prompts_sync(
|
|
141
|
+
["What's the weather in Paris?"],
|
|
142
|
+
tools=[tool]
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# you can iterate over the tool calls in the response automatically
|
|
146
|
+
for tool_call in resps[0].tool_calls:
|
|
147
|
+
print(tool_call.name, tool_call.arguments)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from lm_deluge import LLMClient, Tool
|
|
154
|
+
|
|
155
|
+
# Connect to a local MCP server and get all of its tools
|
|
156
|
+
filesystem_tools = Tool.from_mcp(
|
|
157
|
+
"filesystem",
|
|
158
|
+
command="npx",
|
|
159
|
+
args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# or load ALL the tools from a Claude Desktop like config
|
|
163
|
+
config = {
|
|
164
|
+
"mcpServers": {
|
|
165
|
+
"exa": {
|
|
166
|
+
"url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
|
|
167
|
+
},
|
|
168
|
+
"zapier": {
|
|
169
|
+
"url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
all_tools = Tool.from_mcp_config(config)
|
|
174
|
+
|
|
175
|
+
# let the model use the tools
|
|
176
|
+
client = LLMClient.basic("gpt-4o-mini")
|
|
177
|
+
resps = client.process_prompts_sync(
|
|
178
|
+
["List the files in the current directory"],
|
|
179
|
+
tools=tools
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
# call the tools
|
|
183
|
+
for tool_call in resps[0].tool_calls:
|
|
184
|
+
# this is dumb sorry will make it better
|
|
185
|
+
tool_to_call = [x for x in tools if x.name == tool_call.name][0]
|
|
186
|
+
tool_to_call.call(**tool_call.arguments) # in async code, use .acall()
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Prompt Caching (Anthropic)
|
|
190
|
+
|
|
191
|
+
For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from lm_deluge import LLMClient, Conversation, Message
|
|
195
|
+
|
|
196
|
+
# Create a conversation with system message
|
|
197
|
+
conv = (
|
|
198
|
+
Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
|
|
199
|
+
.add(Message.user("How do I use asyncio.gather?"))
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# Use prompt caching to cache system message and tools
|
|
203
|
+
client = LLMClient.basic("claude-3-5-sonnet")
|
|
204
|
+
resps = client.process_prompts_sync(
|
|
205
|
+
[conv],
|
|
206
|
+
cache="system_and_tools" # Cache system message and any tools
|
|
207
|
+
)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
|
|
211
|
+
|
|
212
|
+
## Local Caching
|
|
213
|
+
|
|
214
|
+
Besides caching from model providers (which provide cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite, and custom dictionary-based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
|
|
215
|
+
|
|
216
|
+
**IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
|
|
128
217
|
|
|
129
218
|
## Asynchronous Client
|
|
130
219
|
Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
|
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
- **Unified client** – Send prompts to all relevant models with a single client.
|
|
6
6
|
- **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
|
|
7
7
|
- **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
|
|
8
|
+
- **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from python functions.
|
|
9
|
+
- **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
|
|
8
10
|
- **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
|
|
9
11
|
- **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
|
|
10
12
|
- **Sync and async APIs** – Use the client from sync or async code.
|
|
@@ -17,7 +19,7 @@
|
|
|
17
19
|
pip install lm-deluge
|
|
18
20
|
```
|
|
19
21
|
|
|
20
|
-
The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
|
|
22
|
+
The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
|
|
21
23
|
|
|
22
24
|
## Quickstart
|
|
23
25
|
|
|
@@ -33,13 +35,13 @@ print(resp[0].completion)
|
|
|
33
35
|
|
|
34
36
|
## Spraying Across Models
|
|
35
37
|
|
|
36
|
-
To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
|
|
38
|
+
To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
|
|
37
39
|
|
|
38
40
|
```python
|
|
39
41
|
from lm_deluge import LLMClient
|
|
40
42
|
|
|
41
43
|
client = LLMClient.basic(
|
|
42
|
-
["gpt-4o-mini", "claude-haiku
|
|
44
|
+
["gpt-4o-mini", "claude-3-haiku"],
|
|
43
45
|
max_requests_per_minute=10_000
|
|
44
46
|
)
|
|
45
47
|
resps = client.process_prompts_sync(
|
|
@@ -54,7 +56,7 @@ API calls can be customized in a few ways.
|
|
|
54
56
|
|
|
55
57
|
1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
|
|
56
58
|
2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
|
|
57
|
-
3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
|
|
59
|
+
3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.
|
|
58
60
|
|
|
59
61
|
Putting it all together:
|
|
60
62
|
|
|
@@ -93,11 +95,97 @@ resps = client.process_prompts_sync([prompt])
|
|
|
93
95
|
|
|
94
96
|
This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.
|
|
95
97
|
|
|
96
|
-
|
|
98
|
+
See a full multi-turn chat example in `examples/multiturn.md`.
|
|
97
99
|
|
|
98
|
-
|
|
100
|
+
## Tool Use
|
|
99
101
|
|
|
100
|
-
|
|
102
|
+
Define tools from Python functions and use them with any model:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from lm_deluge import LLMClient, Tool
|
|
106
|
+
|
|
107
|
+
def get_weather(city: str) -> str:
|
|
108
|
+
return f"The weather in {city} is sunny and 72°F"
|
|
109
|
+
|
|
110
|
+
tool = Tool.from_function(get_weather)
|
|
111
|
+
client = LLMClient.basic("claude-3-haiku")
|
|
112
|
+
resps = client.process_prompts_sync(
|
|
113
|
+
["What's the weather in Paris?"],
|
|
114
|
+
tools=[tool]
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# you can iterate over the tool calls in the response automatically
|
|
118
|
+
for tool_call in resps[0].tool_calls:
|
|
119
|
+
print(tool_call.name, tool_call.arguments)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from lm_deluge import LLMClient, Tool
|
|
126
|
+
|
|
127
|
+
# Connect to a local MCP server and get all of its tools
|
|
128
|
+
filesystem_tools = Tool.from_mcp(
|
|
129
|
+
"filesystem",
|
|
130
|
+
command="npx",
|
|
131
|
+
args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
# or load ALL the tools from a Claude Desktop like config
|
|
135
|
+
config = {
|
|
136
|
+
"mcpServers": {
|
|
137
|
+
"exa": {
|
|
138
|
+
"url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
|
|
139
|
+
},
|
|
140
|
+
"zapier": {
|
|
141
|
+
"url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
all_tools = Tool.from_mcp_config(config)
|
|
146
|
+
|
|
147
|
+
# let the model use the tools
|
|
148
|
+
client = LLMClient.basic("gpt-4o-mini")
|
|
149
|
+
resps = client.process_prompts_sync(
|
|
150
|
+
["List the files in the current directory"],
|
|
151
|
+
tools=tools
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# call the tools
|
|
155
|
+
for tool_call in resps[0].tool_calls:
|
|
156
|
+
# this is dumb sorry will make it better
|
|
157
|
+
tool_to_call = [x for x in tools if x.name == tool_call.name][0]
|
|
158
|
+
tool_to_call.call(**tool_call.arguments) # in async code, use .acall()
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Prompt Caching (Anthropic)
|
|
162
|
+
|
|
163
|
+
For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from lm_deluge import LLMClient, Conversation, Message
|
|
167
|
+
|
|
168
|
+
# Create a conversation with system message
|
|
169
|
+
conv = (
|
|
170
|
+
Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
|
|
171
|
+
.add(Message.user("How do I use asyncio.gather?"))
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
# Use prompt caching to cache system message and tools
|
|
175
|
+
client = LLMClient.basic("claude-3-5-sonnet")
|
|
176
|
+
resps = client.process_prompts_sync(
|
|
177
|
+
[conv],
|
|
178
|
+
cache="system_and_tools" # Cache system message and any tools
|
|
179
|
+
)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
|
|
183
|
+
|
|
184
|
+
## Local Caching
|
|
185
|
+
|
|
186
|
+
Besides caching from model providers (which provide cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite, and custom dictionary-based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
|
|
187
|
+
|
|
188
|
+
**IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
|
|
101
189
|
|
|
102
190
|
## Asynchronous Client
|
|
103
191
|
Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
|
|
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
|
|
|
3
3
|
|
|
4
4
|
[project]
|
|
5
5
|
name = "lm_deluge"
|
|
6
|
-
version = "0.0.8"
|
|
6
|
+
version = "0.0.12"
|
|
7
7
|
authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
|
|
8
8
|
description = "Python utility for using LLM API models."
|
|
9
9
|
readme = "README.md"
|
|
@@ -27,6 +27,7 @@ dependencies = [
|
|
|
27
27
|
"lxml",
|
|
28
28
|
"pdf2image",
|
|
29
29
|
"pillow",
|
|
30
|
+
"fastmcp>=2.4",
|
|
30
31
|
"fasttext-wheel",
|
|
31
32
|
"fasttext-langdetect",
|
|
32
33
|
]
|
|
@@ -6,7 +6,15 @@ import warnings
|
|
|
6
6
|
from tqdm import tqdm
|
|
7
7
|
from typing import Callable
|
|
8
8
|
|
|
9
|
-
from lm_deluge.prompt import
|
|
9
|
+
from lm_deluge.prompt import (
|
|
10
|
+
Conversation,
|
|
11
|
+
Message,
|
|
12
|
+
Text,
|
|
13
|
+
ToolCall,
|
|
14
|
+
Thinking,
|
|
15
|
+
CachePattern,
|
|
16
|
+
)
|
|
17
|
+
from lm_deluge.usage import Usage
|
|
10
18
|
from .base import APIRequestBase, APIResponse
|
|
11
19
|
|
|
12
20
|
from ..tracker import StatusTracker
|
|
@@ -34,6 +42,8 @@ class AnthropicRequest(APIRequestBase):
|
|
|
34
42
|
# for retries
|
|
35
43
|
all_model_names: list[str] | None = None,
|
|
36
44
|
all_sampling_params: list[SamplingParams] | None = None,
|
|
45
|
+
tools: list | None = None,
|
|
46
|
+
cache: CachePattern | None = None,
|
|
37
47
|
):
|
|
38
48
|
super().__init__(
|
|
39
49
|
task_id=task_id,
|
|
@@ -50,11 +60,17 @@ class AnthropicRequest(APIRequestBase):
|
|
|
50
60
|
debug=debug,
|
|
51
61
|
all_model_names=all_model_names,
|
|
52
62
|
all_sampling_params=all_sampling_params,
|
|
63
|
+
tools=tools,
|
|
64
|
+
cache=cache,
|
|
53
65
|
)
|
|
54
66
|
self.model = APIModel.from_registry(model_name)
|
|
55
67
|
self.url = f"{self.model.api_base}/messages"
|
|
56
68
|
|
|
57
|
-
|
|
69
|
+
# Lock images as bytes if caching is enabled
|
|
70
|
+
if cache is not None:
|
|
71
|
+
prompt.lock_images_as_bytes()
|
|
72
|
+
|
|
73
|
+
self.system_message, messages = prompt.to_anthropic(cache_pattern=cache)
|
|
58
74
|
self.request_header = {
|
|
59
75
|
"x-api-key": os.getenv(self.model.api_key_env_var),
|
|
60
76
|
"anthropic-version": "2023-06-01",
|
|
@@ -94,14 +110,19 @@ class AnthropicRequest(APIRequestBase):
|
|
|
94
110
|
)
|
|
95
111
|
if self.system_message is not None:
|
|
96
112
|
self.request_json["system"] = self.system_message
|
|
113
|
+
if tools:
|
|
114
|
+
tool_definitions = [tool.dump_for("anthropic") for tool in tools]
|
|
115
|
+
# Add cache control to last tool if tools_only caching is specified
|
|
116
|
+
if cache == "tools_only" and tool_definitions:
|
|
117
|
+
tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
|
|
118
|
+
self.request_json["tools"] = tool_definitions
|
|
97
119
|
|
|
98
120
|
async def handle_response(self, http_response: ClientResponse) -> APIResponse:
|
|
99
121
|
is_error = False
|
|
100
122
|
error_message = None
|
|
101
123
|
thinking = None
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
output_tokens = None
|
|
124
|
+
content = None
|
|
125
|
+
usage = None
|
|
105
126
|
status_code = http_response.status
|
|
106
127
|
mimetype = http_response.headers.get("Content-Type", None)
|
|
107
128
|
rate_limits = {}
|
|
@@ -119,16 +140,27 @@ class AnthropicRequest(APIRequestBase):
|
|
|
119
140
|
if status_code >= 200 and status_code < 300:
|
|
120
141
|
try:
|
|
121
142
|
data = await http_response.json()
|
|
122
|
-
|
|
123
|
-
|
|
143
|
+
response_content = data["content"]
|
|
144
|
+
|
|
145
|
+
# Parse response into Message with parts
|
|
146
|
+
parts = []
|
|
147
|
+
for item in response_content:
|
|
124
148
|
if item["type"] == "text":
|
|
125
|
-
|
|
149
|
+
parts.append(Text(item["text"]))
|
|
126
150
|
elif item["type"] == "thinking":
|
|
127
151
|
thinking = item["thinking"]
|
|
152
|
+
parts.append(Thinking(item["thinking"]))
|
|
128
153
|
elif item["type"] == "tool_use":
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
154
|
+
parts.append(
|
|
155
|
+
ToolCall(
|
|
156
|
+
id=item["id"],
|
|
157
|
+
name=item["name"],
|
|
158
|
+
arguments=item["input"],
|
|
159
|
+
)
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
content = Message("assistant", parts)
|
|
163
|
+
usage = Usage.from_anthropic_usage(data["usage"])
|
|
132
164
|
except Exception as e:
|
|
133
165
|
is_error = True
|
|
134
166
|
error_message = (
|
|
@@ -162,10 +194,9 @@ class AnthropicRequest(APIRequestBase):
|
|
|
162
194
|
is_error=is_error,
|
|
163
195
|
error_message=error_message,
|
|
164
196
|
prompt=self.prompt,
|
|
165
|
-
|
|
197
|
+
content=content,
|
|
166
198
|
thinking=thinking,
|
|
167
199
|
model_internal=self.model_name,
|
|
168
200
|
sampling_params=self.sampling_params,
|
|
169
|
-
|
|
170
|
-
output_tokens=output_tokens,
|
|
201
|
+
usage=usage,
|
|
171
202
|
)
|
|
@@ -7,7 +7,8 @@ from dataclasses import dataclass
|
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
8
|
from typing import Callable
|
|
9
9
|
|
|
10
|
-
from lm_deluge.prompt import Conversation
|
|
10
|
+
from lm_deluge.prompt import Conversation, Message, CachePattern
|
|
11
|
+
from lm_deluge.usage import Usage
|
|
11
12
|
|
|
12
13
|
from ..tracker import StatusTracker
|
|
13
14
|
from ..sampling_params import SamplingParams
|
|
@@ -29,10 +30,11 @@ class APIResponse:
|
|
|
29
30
|
is_error: bool | None
|
|
30
31
|
error_message: str | None
|
|
31
32
|
|
|
32
|
-
# completion information
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
# completion information - unified usage tracking
|
|
34
|
+
usage: Usage | None = None
|
|
35
|
+
|
|
36
|
+
# response content - structured format
|
|
37
|
+
content: Message | None = None
|
|
36
38
|
|
|
37
39
|
# optional or calculated automatically
|
|
38
40
|
thinking: str | None = None # if model shows thinking tokens
|
|
@@ -47,6 +49,33 @@ class APIResponse:
|
|
|
47
49
|
# set to true if should NOT retry with the same model (unrecoverable error)
|
|
48
50
|
give_up_if_no_other_models: bool | None = False
|
|
49
51
|
|
|
52
|
+
@property
|
|
53
|
+
def completion(self) -> str | None:
|
|
54
|
+
"""Backward compatibility: extract text from content Message."""
|
|
55
|
+
if self.content is not None:
|
|
56
|
+
return self.content.completion
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
@property
|
|
60
|
+
def input_tokens(self) -> int | None:
|
|
61
|
+
"""Get input tokens from usage object."""
|
|
62
|
+
return self.usage.input_tokens if self.usage else None
|
|
63
|
+
|
|
64
|
+
@property
|
|
65
|
+
def output_tokens(self) -> int | None:
|
|
66
|
+
"""Get output tokens from usage object."""
|
|
67
|
+
return self.usage.output_tokens if self.usage else None
|
|
68
|
+
|
|
69
|
+
@property
|
|
70
|
+
def cache_read_tokens(self) -> int | None:
|
|
71
|
+
"""Get cache read tokens from usage object."""
|
|
72
|
+
return self.usage.cache_read_tokens if self.usage else None
|
|
73
|
+
|
|
74
|
+
@property
|
|
75
|
+
def cache_write_tokens(self) -> int | None:
|
|
76
|
+
"""Get cache write tokens from usage object."""
|
|
77
|
+
return self.usage.cache_write_tokens if self.usage else None
|
|
78
|
+
|
|
50
79
|
def __post_init__(self):
|
|
51
80
|
# calculate cost & get external model name
|
|
52
81
|
self.id = int(self.id)
|
|
@@ -54,16 +83,15 @@ class APIResponse:
|
|
|
54
83
|
self.model_external = api_model.name
|
|
55
84
|
self.cost = None
|
|
56
85
|
if (
|
|
57
|
-
self.input_tokens is not None
|
|
58
|
-
and self.output_tokens is not None
|
|
86
|
+
self.usage is not None
|
|
59
87
|
and api_model.input_cost is not None
|
|
60
88
|
and api_model.output_cost is not None
|
|
61
89
|
):
|
|
62
90
|
self.cost = (
|
|
63
|
-
self.input_tokens * api_model.input_cost / 1e6
|
|
64
|
-
+ self.output_tokens * api_model.output_cost / 1e6
|
|
91
|
+
self.usage.input_tokens * api_model.input_cost / 1e6
|
|
92
|
+
+ self.usage.output_tokens * api_model.output_cost / 1e6
|
|
65
93
|
)
|
|
66
|
-
elif self.completion is not None:
|
|
94
|
+
elif self.content is not None and self.completion is not None:
|
|
67
95
|
print(
|
|
68
96
|
f"Warning: Completion provided without token counts for model {self.model_internal}."
|
|
69
97
|
)
|
|
@@ -79,30 +107,45 @@ class APIResponse:
|
|
|
79
107
|
"status_code": self.status_code,
|
|
80
108
|
"is_error": self.is_error,
|
|
81
109
|
"error_message": self.error_message,
|
|
82
|
-
"completion": self.completion,
|
|
83
|
-
"input_tokens": self.input_tokens,
|
|
84
|
-
"output_tokens": self.output_tokens,
|
|
110
|
+
"completion": self.completion, # computed property
|
|
111
|
+
"content": self.content.to_log() if self.content else None,
|
|
112
|
+
"usage": self.usage.to_dict() if self.usage else None,
|
|
85
113
|
"finish_reason": self.finish_reason,
|
|
86
114
|
"cost": self.cost,
|
|
87
115
|
}
|
|
88
116
|
|
|
89
117
|
@classmethod
|
|
90
118
|
def from_dict(cls, data: dict):
|
|
119
|
+
# Handle backward compatibility for content/completion
|
|
120
|
+
content = None
|
|
121
|
+
if "content" in data and data["content"] is not None:
|
|
122
|
+
# Reconstruct message from log format
|
|
123
|
+
content = Message.from_log(data["content"])
|
|
124
|
+
elif "completion" in data and data["completion"] is not None:
|
|
125
|
+
# Backward compatibility: create a Message with just text
|
|
126
|
+
content = Message.ai(data["completion"])
|
|
127
|
+
|
|
128
|
+
usage = None
|
|
129
|
+
if "usage" in data and data["usage"] is not None:
|
|
130
|
+
usage = Usage.from_dict(data["usage"])
|
|
131
|
+
|
|
91
132
|
return cls(
|
|
92
133
|
id=data.get("id", random.randint(0, 1_000_000_000)),
|
|
93
134
|
model_internal=data["model_internal"],
|
|
94
|
-
model_external=data["model_external"],
|
|
95
|
-
region=data["region"],
|
|
96
135
|
prompt=Conversation.from_log(data["prompt"]),
|
|
97
136
|
sampling_params=SamplingParams(**data["sampling_params"]),
|
|
98
137
|
status_code=data["status_code"],
|
|
99
138
|
is_error=data["is_error"],
|
|
100
139
|
error_message=data["error_message"],
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
140
|
+
usage=usage,
|
|
141
|
+
content=content,
|
|
142
|
+
thinking=data.get("thinking"),
|
|
143
|
+
model_external=data.get("model_external"),
|
|
144
|
+
region=data.get("region"),
|
|
145
|
+
logprobs=data.get("logprobs"),
|
|
146
|
+
finish_reason=data.get("finish_reason"),
|
|
147
|
+
cost=data.get("cost"),
|
|
148
|
+
cache_hit=data.get("cache_hit", False),
|
|
106
149
|
)
|
|
107
150
|
|
|
108
151
|
def write_to_file(self, filename):
|
|
@@ -145,6 +188,8 @@ class APIRequestBase(ABC):
|
|
|
145
188
|
debug: bool = False,
|
|
146
189
|
all_model_names: list[str] | None = None,
|
|
147
190
|
all_sampling_params: list[SamplingParams] | None = None,
|
|
191
|
+
tools: list | None = None,
|
|
192
|
+
cache: CachePattern | None = None,
|
|
148
193
|
):
|
|
149
194
|
if all_model_names is None:
|
|
150
195
|
raise ValueError("all_model_names must be provided.")
|
|
@@ -166,6 +211,8 @@ class APIRequestBase(ABC):
|
|
|
166
211
|
self.debug = debug
|
|
167
212
|
self.all_model_names = all_model_names
|
|
168
213
|
self.all_sampling_params = all_sampling_params
|
|
214
|
+
self.tools = tools
|
|
215
|
+
self.cache: CachePattern | None = cache
|
|
169
216
|
self.result = [] # list of APIResponse objects from each attempt
|
|
170
217
|
|
|
171
218
|
# these should be set in the __init__ of the subclass
|
|
@@ -255,6 +302,8 @@ class APIRequestBase(ABC):
|
|
|
255
302
|
callback=self.callback,
|
|
256
303
|
all_model_names=self.all_model_names,
|
|
257
304
|
all_sampling_params=self.all_sampling_params,
|
|
305
|
+
tools=self.tools,
|
|
306
|
+
cache=self.cache,
|
|
258
307
|
)
|
|
259
308
|
# PROBLEM: new request is never put into results array, so we can't get the result.
|
|
260
309
|
self.retry_queue.put_nowait(new_request)
|
|
@@ -297,9 +346,8 @@ class APIRequestBase(ABC):
|
|
|
297
346
|
status_code=None,
|
|
298
347
|
is_error=True,
|
|
299
348
|
error_message="Request timed out (terminated by client).",
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
output_tokens=None,
|
|
349
|
+
content=None,
|
|
350
|
+
usage=None,
|
|
303
351
|
)
|
|
304
352
|
)
|
|
305
353
|
self.handle_error(create_new_request=False)
|
|
@@ -315,9 +363,8 @@ class APIRequestBase(ABC):
|
|
|
315
363
|
status_code=None,
|
|
316
364
|
is_error=True,
|
|
317
365
|
error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
output_tokens=None,
|
|
366
|
+
content=None,
|
|
367
|
+
usage=None,
|
|
321
368
|
)
|
|
322
369
|
)
|
|
323
370
|
# maybe consider making True?
|
|
@@ -344,6 +391,8 @@ def create_api_request(
|
|
|
344
391
|
callback: Callable | None = None,
|
|
345
392
|
all_model_names: list[str] | None = None,
|
|
346
393
|
all_sampling_params: list[SamplingParams] | None = None,
|
|
394
|
+
tools: list | None = None,
|
|
395
|
+
cache: CachePattern | None = None,
|
|
347
396
|
) -> APIRequestBase:
|
|
348
397
|
from .common import CLASSES # circular import so made it lazy, does this work?
|
|
349
398
|
|
|
@@ -368,5 +417,7 @@ def create_api_request(
|
|
|
368
417
|
callback=callback,
|
|
369
418
|
all_model_names=all_model_names,
|
|
370
419
|
all_sampling_params=all_sampling_params,
|
|
420
|
+
tools=tools,
|
|
421
|
+
cache=cache,
|
|
371
422
|
**kwargs,
|
|
372
423
|
)
|