lm-deluge 0.0.9__tar.gz → 0.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (60)
  1. {lm_deluge-0.0.9/src/lm_deluge.egg-info → lm_deluge-0.0.12}/PKG-INFO +97 -8
  2. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/README.md +95 -7
  3. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/pyproject.toml +2 -1
  4. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/anthropic.py +24 -9
  5. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/base.py +40 -16
  6. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/bedrock.py +26 -13
  7. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/mistral.py +15 -7
  8. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/openai.py +13 -7
  9. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/client.py +17 -8
  10. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/models.py +45 -33
  11. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/prompt.py +87 -1
  12. lm_deluge-0.0.12/src/lm_deluge/tool.py +280 -0
  13. lm_deluge-0.0.12/src/lm_deluge/usage.py +114 -0
  14. {lm_deluge-0.0.9 → lm_deluge-0.0.12/src/lm_deluge.egg-info}/PKG-INFO +97 -8
  15. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/SOURCES.txt +6 -0
  16. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/requires.txt +1 -0
  17. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_all_models.py +7 -7
  18. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_bedrock_models.py +19 -66
  19. lm_deluge-0.0.12/tests/test_mcp_tools.py +221 -0
  20. lm_deluge-0.0.12/tests/test_prompt_caching.py +261 -0
  21. lm_deluge-0.0.12/tests/test_real_caching.py +305 -0
  22. lm_deluge-0.0.12/tests/test_real_caching_bedrock.py +307 -0
  23. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_tool_calls.py +3 -3
  24. lm_deluge-0.0.12/tests/test_tool_from_function.py +150 -0
  25. lm_deluge-0.0.9/src/lm_deluge/tool.py +0 -87
  26. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/LICENSE +0 -0
  27. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/setup.cfg +0 -0
  28. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/__init__.py +0 -0
  29. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/__init__.py +0 -0
  30. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/common.py +0 -0
  31. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  32. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  33. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  34. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  35. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  36. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/cache.py +0 -0
  37. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/embed.py +0 -0
  38. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/errors.py +0 -0
  39. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/gemini_limits.py +0 -0
  40. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/image.py +0 -0
  41. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/__init__.py +0 -0
  42. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/extract.py +0 -0
  43. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/score.py +0 -0
  44. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/translate.py +0 -0
  45. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/rerank.py +0 -0
  46. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/sampling_params.py +0 -0
  47. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/tracker.py +0 -0
  48. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/util/json.py +0 -0
  49. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/util/logprobs.py +0 -0
  50. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/util/validation.py +0 -0
  51. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/util/xml.py +0 -0
  52. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  53. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/top_level.txt +0 -0
  54. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_cache.py +0 -0
  55. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_image_models.py +0 -0
  56. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_image_utils.py +0 -0
  57. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_json_utils.py +0 -0
  58. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_sampling_params.py +0 -0
  59. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_translate.py +0 -0
  60. {lm_deluge-0.0.9 → lm_deluge-0.0.12}/tests/test_xml_utils.py +0 -0

{lm_deluge-0.0.9/src/lm_deluge.egg-info → lm_deluge-0.0.12}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lm_deluge
- Version: 0.0.9
+ Version: 0.0.12
  Summary: Python utility for using LLM API models.
  Author-email: Benjamin Anderson <ben@trytaylor.ai>
  Requires-Python: >=3.10
@@ -21,6 +21,7 @@ Requires-Dist: bs4
  Requires-Dist: lxml
  Requires-Dist: pdf2image
  Requires-Dist: pillow
+ Requires-Dist: fastmcp>=2.4
  Requires-Dist: fasttext-wheel
  Requires-Dist: fasttext-langdetect
  Dynamic: license-file
@@ -32,6 +33,8 @@ Dynamic: license-file
  - **Unified client** – Send prompts to all relevant models with a single client.
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
+ - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from python functions.
+ - **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
  - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
  - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
  - **Sync and async APIs** – Use the client from sync or async code.
@@ -44,7 +47,7 @@ Dynamic: license-file
  pip install lm-deluge
  ```

- The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

  ## Quickstart

@@ -60,13 +63,13 @@ print(resp[0].completion)

  ## Spraying Across Models

- To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+ To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

  ```python
  from lm_deluge import LLMClient

  client = LLMClient.basic(
- ["gpt-4o-mini", "claude-haiku-anthropic"],
+ ["gpt-4o-mini", "claude-3-haiku"],
  max_requests_per_minute=10_000
  )
  resps = client.process_prompts_sync(
@@ -81,7 +84,7 @@ API calls can be customized in a few ways.

  1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
  2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
- 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

  Putting it all together:

@@ -120,11 +123,97 @@ resps = client.process_prompts_sync([prompt])

  This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

- ## Caching
+ See a full multi-turn chat example in `examples/multiturn.md`.

- `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+ ## Tool Use

- **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+ Define tools from Python functions and use them with any model:
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ def get_weather(city: str) -> str:
+ return f"The weather in {city} is sunny and 72°F"
+
+ tool = Tool.from_function(get_weather)
+ client = LLMClient.basic("claude-3-haiku")
+ resps = client.process_prompts_sync(
+ ["What's the weather in Paris?"],
+ tools=[tool]
+ )
+
+ # you can iterate over the tool calls in the response automatically
+ for tool_call in resps[0].tool_calls:
+ print(tool_call.name, tool_call.arguments)
+ ```
+
+ You can also automatically instantiate tools from MCP servers. Under the hood, the the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ # Connect to a local MCP server and get all of its tools
+ filesystem_tools = Tool.from_mcp(
+ "filesystem",
+ command="npx",
+ args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+ )
+
+ # or load ALL the tools from a Claude Desktop like config
+ config = {
+ "mcpServers": {
+ "exa": {
+ "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+ },
+ "zapier": {
+ "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+ }
+ }
+ }
+ all_tools = Tool.from_mcp_config(config)
+
+ # let the model use the tools
+ client = LLMClient.basic("gpt-4o-mini")
+ resps = client.process_prompts_sync(
+ ["List the files in the current directory"],
+ tools=tools
+ )
+
+ # call the tools
+ for tool_call in resps[0].tool_calls:
+ # this is dumb sorry will make it better
+ tool_to_call = [x for x in tools if x.name == tool_call.name][0]
+ tool_to_call.call(**tool_call.arguments) # in async code, use .acall()
+ ```
+
+ ### Prompt Caching (Anthropic)
+
+ For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
+
+ ```python
+ from lm_deluge import LLMClient, Conversation, Message
+
+ # Create a conversation with system message
+ conv = (
+ Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+ .add(Message.user("How do I use asyncio.gather?"))
+ )
+
+ # Use prompt caching to cache system message and tools
+ client = LLMClient.basic("claude-3-5-sonnet")
+ resps = client.process_prompts_sync(
+ [conv],
+ cache="system_and_tools" # Cache system message and any tools
+ )
+ ```
+
+ Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+ ## Local Caching
+
+ Besides caching from model providers (which provides cache reads at a discount, but not for free) `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

  ## Asynchronous Client
  Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
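
The hunk above stops at the README's Asynchronous Client section without showing its example. For orientation, here is a minimal sketch of what async usage presumably looks like, based only on the `process_prompts_async` name mentioned in the prompt-caching paragraph above; the exact signature is not part of this diff and the argument style simply mirrors the sync examples.

```python
import asyncio

from lm_deluge import LLMClient

async def main():
    client = LLMClient.basic("gpt-4o-mini")
    # process_prompts_async is named in the README above; passing a list of
    # prompt strings like the sync examples do is an assumption.
    resps = await client.process_prompts_async(["Hello, world!"])
    print(resps[0].completion)

asyncio.run(main())
```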

{lm_deluge-0.0.9 → lm_deluge-0.0.12}/README.md

@@ -5,6 +5,8 @@
  - **Unified client** – Send prompts to all relevant models with a single client.
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
+ - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from python functions.
+ - **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
  - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
  - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
  - **Sync and async APIs** – Use the client from sync or async code.
@@ -17,7 +19,7 @@
  pip install lm-deluge
  ```

- The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

  ## Quickstart

@@ -33,13 +35,13 @@ print(resp[0].completion)

  ## Spraying Across Models

- To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+ To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

  ```python
  from lm_deluge import LLMClient

  client = LLMClient.basic(
- ["gpt-4o-mini", "claude-haiku-anthropic"],
+ ["gpt-4o-mini", "claude-3-haiku"],
  max_requests_per_minute=10_000
  )
  resps = client.process_prompts_sync(
@@ -54,7 +56,7 @@ API calls can be customized in a few ways.

  1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
  2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
- 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

  Putting it all together:

@@ -93,11 +95,97 @@ resps = client.process_prompts_sync([prompt])

  This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

- ## Caching
+ See a full multi-turn chat example in `examples/multiturn.md`.

- `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+ ## Tool Use

- **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+ Define tools from Python functions and use them with any model:
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ def get_weather(city: str) -> str:
+ return f"The weather in {city} is sunny and 72°F"
+
+ tool = Tool.from_function(get_weather)
+ client = LLMClient.basic("claude-3-haiku")
+ resps = client.process_prompts_sync(
+ ["What's the weather in Paris?"],
+ tools=[tool]
+ )
+
+ # you can iterate over the tool calls in the response automatically
+ for tool_call in resps[0].tool_calls:
+ print(tool_call.name, tool_call.arguments)
+ ```
+
+ You can also automatically instantiate tools from MCP servers. Under the hood, the the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ # Connect to a local MCP server and get all of its tools
+ filesystem_tools = Tool.from_mcp(
+ "filesystem",
+ command="npx",
+ args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+ )
+
+ # or load ALL the tools from a Claude Desktop like config
+ config = {
+ "mcpServers": {
+ "exa": {
+ "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+ },
+ "zapier": {
+ "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+ }
+ }
+ }
+ all_tools = Tool.from_mcp_config(config)
+
+ # let the model use the tools
+ client = LLMClient.basic("gpt-4o-mini")
+ resps = client.process_prompts_sync(
+ ["List the files in the current directory"],
+ tools=tools
+ )
+
+ # call the tools
+ for tool_call in resps[0].tool_calls:
+ # this is dumb sorry will make it better
+ tool_to_call = [x for x in tools if x.name == tool_call.name][0]
+ tool_to_call.call(**tool_call.arguments) # in async code, use .acall()
+ ```
+
+ ### Prompt Caching (Anthropic)
+
+ For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
+
+ ```python
+ from lm_deluge import LLMClient, Conversation, Message
+
+ # Create a conversation with system message
+ conv = (
+ Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+ .add(Message.user("How do I use asyncio.gather?"))
+ )
+
+ # Use prompt caching to cache system message and tools
+ client = LLMClient.basic("claude-3-5-sonnet")
+ resps = client.process_prompts_sync(
+ [conv],
+ cache="system_and_tools" # Cache system message and any tools
+ )
+ ```
+
+ Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+ ## Local Caching
+
+ Besides caching from model providers (which provides cache reads at a discount, but not for free) `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

  ## Asynchronous Client
  Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
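
One note on the Tool Use hunk above: the example dispatches each tool call with a list comprehension that the README itself flags as temporary ("this is dumb sorry will make it better"). A slightly tidier dispatch, continuing from the variables in that example and using only the attributes it already relies on (`Tool.name`, `Tool.call`/`Tool.acall`, `ToolCall.name`, `ToolCall.arguments`), could look like this:

```python
# Build a name -> Tool index once instead of scanning the tool list per call.
tool_index = {tool.name: tool for tool in all_tools}

for tool_call in resps[0].tool_calls:
    result = tool_index[tool_call.name].call(**tool_call.arguments)  # or .acall() in async code
    print(tool_call.name, "->", result)
```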

{lm_deluge-0.0.9 → lm_deluge-0.0.12}/pyproject.toml

@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]

  [project]
  name = "lm_deluge"
- version = "0.0.9"
+ version = "0.0.12"
  authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
  description = "Python utility for using LLM API models."
  readme = "README.md"
@@ -27,6 +27,7 @@ dependencies = [
  "lxml",
  "pdf2image",
  "pillow",
+ "fastmcp>=2.4",
  "fasttext-wheel",
  "fasttext-langdetect",
  ]
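
The only dependency change is `fastmcp>=2.4`, which presumably backs the new `Tool.from_mcp` / `Tool.from_mcp_config` constructors. The rest of the code changes below revolve around prompt caching: each api_requests module now threads a `cache: CachePattern | None` parameter through to `prompt.to_anthropic(cache_pattern=cache)`. `CachePattern` itself is defined in `src/lm_deluge/prompt.py` (+87 lines, not shown in this diff); given the five pattern names the README lists, it is presumably a `Literal` along these lines:

```python
from typing import Literal

# Assumed definition -- the real one lives in src/lm_deluge/prompt.py, which
# this diff does not show. The five values come from the README above.
CachePattern = Literal[
    "system_and_tools",
    "tools_only",
    "last_user_message",
    "last_2_user_messages",
    "last_3_user_messages",
]
```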

{lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/anthropic.py

@@ -6,7 +6,15 @@ import warnings
  from tqdm import tqdm
  from typing import Callable

- from lm_deluge.prompt import Conversation, Message, Text, ToolCall, Thinking
+ from lm_deluge.prompt import (
+ Conversation,
+ Message,
+ Text,
+ ToolCall,
+ Thinking,
+ CachePattern,
+ )
+ from lm_deluge.usage import Usage
  from .base import APIRequestBase, APIResponse

  from ..tracker import StatusTracker
@@ -35,6 +43,7 @@ class AnthropicRequest(APIRequestBase):
  all_model_names: list[str] | None = None,
  all_sampling_params: list[SamplingParams] | None = None,
  tools: list | None = None,
+ cache: CachePattern | None = None,
  ):
  super().__init__(
  task_id=task_id,
@@ -52,11 +61,16 @@ class AnthropicRequest(APIRequestBase):
  all_model_names=all_model_names,
  all_sampling_params=all_sampling_params,
  tools=tools,
+ cache=cache,
  )
  self.model = APIModel.from_registry(model_name)
  self.url = f"{self.model.api_base}/messages"

- self.system_message, messages = prompt.to_anthropic()
+ # Lock images as bytes if caching is enabled
+ if cache is not None:
+ prompt.lock_images_as_bytes()
+
+ self.system_message, messages = prompt.to_anthropic(cache_pattern=cache)
  self.request_header = {
  "x-api-key": os.getenv(self.model.api_key_env_var),
  "anthropic-version": "2023-06-01",
@@ -97,15 +111,18 @@ class AnthropicRequest(APIRequestBase):
  if self.system_message is not None:
  self.request_json["system"] = self.system_message
  if tools:
- self.request_json["tools"] = [tool.dump_for("anthropic") for tool in tools]
+ tool_definitions = [tool.dump_for("anthropic") for tool in tools]
+ # Add cache control to last tool if tools_only caching is specified
+ if cache == "tools_only" and tool_definitions:
+ tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
+ self.request_json["tools"] = tool_definitions

  async def handle_response(self, http_response: ClientResponse) -> APIResponse:
  is_error = False
  error_message = None
  thinking = None
  content = None
- input_tokens = None
- output_tokens = None
+ usage = None
  status_code = http_response.status
  mimetype = http_response.headers.get("Content-Type", None)
  rate_limits = {}
@@ -143,8 +160,7 @@ class AnthropicRequest(APIRequestBase):
  )

  content = Message("assistant", parts)
- input_tokens = data["usage"]["input_tokens"]
- output_tokens = data["usage"]["output_tokens"]
+ usage = Usage.from_anthropic_usage(data["usage"])
  except Exception as e:
  is_error = True
  error_message = (
@@ -182,6 +198,5 @@ class AnthropicRequest(APIRequestBase):
  thinking=thinking,
  model_internal=self.model_name,
  sampling_params=self.sampling_params,
- input_tokens=input_tokens,
- output_tokens=output_tokens,
+ usage=usage,
  )
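
To make the new `tools_only` branch concrete, here is a small standalone sketch of what it does to the tools array; the tool schemas are illustrative and not taken from the package.

```python
# Reproduces the logic added above: when cache == "tools_only", the last tool
# definition gets an ephemeral cache_control marker, which is how Anthropic's
# prompt caching marks the end of the cacheable prefix.
cache = "tools_only"

tool_definitions = [
    {
        "name": "get_weather",
        "description": "Get the weather for a city",
        "input_schema": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
    {
        "name": "get_time",
        "description": "Get the current time",
        "input_schema": {"type": "object", "properties": {}},
    },
]

if cache == "tools_only" and tool_definitions:
    tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}

print(tool_definitions[-1]["cache_control"])  # {'type': 'ephemeral'}
```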

{lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/base.py

@@ -7,7 +7,8 @@ from dataclasses import dataclass
  from abc import ABC, abstractmethod
  from typing import Callable

- from lm_deluge.prompt import Conversation, Message
+ from lm_deluge.prompt import Conversation, Message, CachePattern
+ from lm_deluge.usage import Usage

  from ..tracker import StatusTracker
  from ..sampling_params import SamplingParams
@@ -29,9 +30,8 @@ class APIResponse:
  is_error: bool | None
  error_message: str | None

- # completion information
- input_tokens: int | None
- output_tokens: int | None
+ # completion information - unified usage tracking
+ usage: Usage | None = None

  # response content - structured format
  content: Message | None = None
@@ -56,6 +56,26 @@ class APIResponse:
  return self.content.completion
  return None

+ @property
+ def input_tokens(self) -> int | None:
+ """Get input tokens from usage object."""
+ return self.usage.input_tokens if self.usage else None
+
+ @property
+ def output_tokens(self) -> int | None:
+ """Get output tokens from usage object."""
+ return self.usage.output_tokens if self.usage else None
+
+ @property
+ def cache_read_tokens(self) -> int | None:
+ """Get cache read tokens from usage object."""
+ return self.usage.cache_read_tokens if self.usage else None
+
+ @property
+ def cache_write_tokens(self) -> int | None:
+ """Get cache write tokens from usage object."""
+ return self.usage.cache_write_tokens if self.usage else None
+
  def __post_init__(self):
  # calculate cost & get external model name
  self.id = int(self.id)
@@ -63,14 +83,13 @@ class APIResponse:
  self.model_external = api_model.name
  self.cost = None
  if (
- self.input_tokens is not None
- and self.output_tokens is not None
+ self.usage is not None
  and api_model.input_cost is not None
  and api_model.output_cost is not None
  ):
  self.cost = (
- self.input_tokens * api_model.input_cost / 1e6
- + self.output_tokens * api_model.output_cost / 1e6
+ self.usage.input_tokens * api_model.input_cost / 1e6
+ + self.usage.output_tokens * api_model.output_cost / 1e6
  )
  elif self.content is not None and self.completion is not None:
  print(
@@ -90,8 +109,7 @@ class APIResponse:
  "error_message": self.error_message,
  "completion": self.completion, # computed property
  "content": self.content.to_log() if self.content else None,
- "input_tokens": self.input_tokens,
- "output_tokens": self.output_tokens,
+ "usage": self.usage.to_dict() if self.usage else None,
  "finish_reason": self.finish_reason,
  "cost": self.cost,
  }
@@ -107,6 +125,10 @@ class APIResponse:
  # Backward compatibility: create a Message with just text
  content = Message.ai(data["completion"])

+ usage = None
+ if "usage" in data and data["usage"] is not None:
+ usage = Usage.from_dict(data["usage"])
+
  return cls(
  id=data.get("id", random.randint(0, 1_000_000_000)),
  model_internal=data["model_internal"],
@@ -115,8 +137,7 @@ class APIResponse:
  status_code=data["status_code"],
  is_error=data["is_error"],
  error_message=data["error_message"],
- input_tokens=data["input_tokens"],
- output_tokens=data["output_tokens"],
+ usage=usage,
  content=content,
  thinking=data.get("thinking"),
  model_external=data.get("model_external"),
@@ -168,6 +189,7 @@ class APIRequestBase(ABC):
  all_model_names: list[str] | None = None,
  all_sampling_params: list[SamplingParams] | None = None,
  tools: list | None = None,
+ cache: CachePattern | None = None,
  ):
  if all_model_names is None:
  raise ValueError("all_model_names must be provided.")
@@ -190,6 +212,7 @@ class APIRequestBase(ABC):
  self.all_model_names = all_model_names
  self.all_sampling_params = all_sampling_params
  self.tools = tools
+ self.cache: CachePattern | None = cache
  self.result = [] # list of APIResponse objects from each attempt

  # these should be set in the __init__ of the subclass
@@ -280,6 +303,7 @@ class APIRequestBase(ABC):
  all_model_names=self.all_model_names,
  all_sampling_params=self.all_sampling_params,
  tools=self.tools,
+ cache=self.cache,
  )
  # PROBLEM: new request is never put into results array, so we can't get the result.
  self.retry_queue.put_nowait(new_request)
@@ -323,8 +347,7 @@ class APIRequestBase(ABC):
  is_error=True,
  error_message="Request timed out (terminated by client).",
  content=None,
- input_tokens=None,
- output_tokens=None,
+ usage=None,
  )
  )
  self.handle_error(create_new_request=False)
@@ -341,8 +364,7 @@ class APIRequestBase(ABC):
  is_error=True,
  error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
  content=None,
- input_tokens=None,
- output_tokens=None,
+ usage=None,
  )
  )
  # maybe consider making True?
@@ -370,6 +392,7 @@ def create_api_request(
  all_model_names: list[str] | None = None,
  all_sampling_params: list[SamplingParams] | None = None,
  tools: list | None = None,
+ cache: CachePattern | None = None,
  ) -> APIRequestBase:
  from .common import CLASSES # circular import so made it lazy, does this work?

@@ -395,5 +418,6 @@ def create_api_request(
  all_model_names=all_model_names,
  all_sampling_params=all_sampling_params,
  tools=tools,
+ cache=cache,
  **kwargs,
  )
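
The new `src/lm_deluge/usage.py` (+114 lines) is not included in this diff, but the call sites above pin down most of its surface: four token counters, `from_anthropic_usage`, `to_dict`, and `from_dict`. A rough sketch consistent with those call sites follows; the field defaults and the exact Anthropic key handling are assumptions, not the package's actual implementation.

```python
from dataclasses import dataclass, asdict

@dataclass
class Usage:
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0

    @classmethod
    def from_anthropic_usage(cls, usage: dict) -> "Usage":
        # Anthropic's usage block reports cache activity as
        # cache_read_input_tokens / cache_creation_input_tokens.
        return cls(
            input_tokens=usage.get("input_tokens", 0),
            output_tokens=usage.get("output_tokens", 0),
            cache_read_tokens=usage.get("cache_read_input_tokens") or 0,
            cache_write_tokens=usage.get("cache_creation_input_tokens") or 0,
        )

    def to_dict(self) -> dict:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> "Usage":
        return cls(**data)
```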

{lm_deluge-0.0.9 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/bedrock.py

@@ -12,7 +12,15 @@ except ImportError:
  "aws4auth is required for bedrock support. Install with: pip install requests-aws4auth"
  )

- from lm_deluge.prompt import Conversation, Message, Text, ToolCall, Thinking
+ from lm_deluge.prompt import (
+ Conversation,
+ Message,
+ Text,
+ ToolCall,
+ Thinking,
+ CachePattern,
+ )
+ from lm_deluge.usage import Usage
  from .base import APIRequestBase, APIResponse

  from ..tracker import StatusTracker
@@ -38,6 +46,7 @@ class BedrockRequest(APIRequestBase):
  all_model_names: list[str] | None = None,
  all_sampling_params: list[SamplingParams] | None = None,
  tools: list | None = None,
+ cache: CachePattern | None = None,
  ):
  super().__init__(
  task_id=task_id,
@@ -55,8 +64,13 @@ class BedrockRequest(APIRequestBase):
  all_model_names=all_model_names,
  all_sampling_params=all_sampling_params,
  tools=tools,
+ cache=cache,
  )

+ # Lock images as bytes if caching is enabled
+ if cache is not None:
+ prompt.lock_images_as_bytes()
+
  self.model = APIModel.from_registry(model_name)

  # Get AWS credentials from environment
@@ -87,7 +101,7 @@ class BedrockRequest(APIRequestBase):
  self.url = f"https://bedrock-runtime.{self.region}.amazonaws.com/model/{self.model.name}/invoke"

  # Convert prompt to Anthropic format for bedrock
- self.system_message, messages = prompt.to_anthropic()
+ self.system_message, messages = prompt.to_anthropic(cache_pattern=cache)

  # Prepare request body in Anthropic's bedrock format
  self.request_json = {
@@ -102,7 +116,11 @@ class BedrockRequest(APIRequestBase):
  self.request_json["system"] = self.system_message

  if tools:
- self.request_json["tools"] = [tool.dump_for("anthropic") for tool in tools]
+ tool_definitions = [tool.dump_for("anthropic") for tool in tools]
+ # Add cache control to last tool if tools_only caching is specified
+ if cache == "tools_only" and tool_definitions:
+ tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
+ self.request_json["tools"] = tool_definitions

  # Setup AWS4Auth for signing
  self.auth = AWS4Auth(
@@ -179,8 +197,7 @@ class BedrockRequest(APIRequestBase):
  is_error=True,
  error_message="Request timed out (terminated by client).",
  content=None,
- input_tokens=None,
- output_tokens=None,
+ usage=None,
  )
  )
  self.handle_error(create_new_request=False)
@@ -199,8 +216,7 @@ class BedrockRequest(APIRequestBase):
  is_error=True,
  error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
  content=None,
- input_tokens=None,
- output_tokens=None,
+ usage=None,
  )
  )
  self.handle_error(create_new_request=False)
@@ -210,8 +226,7 @@ class BedrockRequest(APIRequestBase):
  error_message = None
  thinking = None
  content = None
- input_tokens = None
- output_tokens = None
+ usage = None
  status_code = http_response.status
  mimetype = http_response.headers.get("Content-Type", None)

@@ -238,8 +253,7 @@ class BedrockRequest(APIRequestBase):
  )

  content = Message("assistant", parts)
- input_tokens = data["usage"]["input_tokens"]
- output_tokens = data["usage"]["output_tokens"]
+ usage = Usage.from_anthropic_usage(data["usage"])
  except Exception as e:
  is_error = True
  error_message = (
@@ -278,6 +292,5 @@ class BedrockRequest(APIRequestBase):
  model_internal=self.model_name,
  region=self.region,
  sampling_params=self.sampling_params,
- input_tokens=input_tokens,
- output_tokens=output_tokens,
+ usage=usage,
  )
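
For context on the Bedrock path: the truncated `AWS4Auth(` call above is the standard requests-aws4auth SigV4 setup, and the URL above is the plain HTTPS Bedrock runtime invoke endpoint. A hedged, standalone sketch of that general pattern follows; the model id, request body, and the use of the `requests` library are illustrative and not taken from the package.

```python
import os

import requests
from requests_aws4auth import AWS4Auth

region = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
auth = AWS4Auth(
    os.environ["AWS_ACCESS_KEY_ID"],
    os.environ["AWS_SECRET_ACCESS_KEY"],
    region,
    "bedrock",  # SigV4 service name used for the Bedrock runtime
)

model_id = "anthropic.claude-3-haiku-20240307-v1:0"  # illustrative model id
url = f"https://bedrock-runtime.{region}.amazonaws.com/model/{model_id}/invoke"

resp = requests.post(
    url,
    auth=auth,
    json={
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
# The response carries the same Anthropic-style "usage" dict that the new
# Usage.from_anthropic_usage call above consumes.
print(resp.json().get("usage"))
```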