lm-deluge 0.0.8__tar.gz → 0.0.12__tar.gz

This diff reflects the changes between two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.


Files changed (61)
  1. {lm_deluge-0.0.8/src/lm_deluge.egg-info → lm_deluge-0.0.12}/PKG-INFO +97 -8
  2. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/README.md +95 -7
  3. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/pyproject.toml +2 -1
  4. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/anthropic.py +45 -14
  5. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/base.py +77 -26
  6. lm_deluge-0.0.12/src/lm_deluge/api_requests/bedrock.py +296 -0
  7. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/common.py +2 -0
  8. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/mistral.py +16 -8
  9. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/openai.py +49 -12
  10. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/client.py +28 -4
  11. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/models.py +125 -48
  12. lm_deluge-0.0.12/src/lm_deluge/prompt.py +779 -0
  13. lm_deluge-0.0.12/src/lm_deluge/tool.py +280 -0
  14. lm_deluge-0.0.12/src/lm_deluge/usage.py +114 -0
  15. {lm_deluge-0.0.8 → lm_deluge-0.0.12/src/lm_deluge.egg-info}/PKG-INFO +97 -8
  16. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/SOURCES.txt +9 -0
  17. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/requires.txt +1 -0
  18. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_all_models.py +8 -4
  19. lm_deluge-0.0.12/tests/test_bedrock_models.py +205 -0
  20. lm_deluge-0.0.12/tests/test_mcp_tools.py +221 -0
  21. lm_deluge-0.0.12/tests/test_prompt_caching.py +261 -0
  22. lm_deluge-0.0.12/tests/test_real_caching.py +305 -0
  23. lm_deluge-0.0.12/tests/test_real_caching_bedrock.py +307 -0
  24. lm_deluge-0.0.12/tests/test_tool_calls.py +401 -0
  25. lm_deluge-0.0.12/tests/test_tool_from_function.py +150 -0
  26. lm_deluge-0.0.8/src/lm_deluge/prompt.py +0 -357
  27. lm_deluge-0.0.8/src/lm_deluge/tool.py +0 -106
  28. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/LICENSE +0 -0
  29. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/setup.cfg +0 -0
  30. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/__init__.py +0 -0
  31. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/__init__.py +0 -0
  32. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  33. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  34. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  35. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  36. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  37. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/cache.py +0 -0
  38. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/embed.py +0 -0
  39. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/errors.py +0 -0
  40. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/gemini_limits.py +0 -0
  41. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/image.py +0 -0
  42. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/__init__.py +0 -0
  43. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/extract.py +0 -0
  44. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/score.py +0 -0
  45. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/llm_tools/translate.py +0 -0
  46. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/rerank.py +0 -0
  47. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/sampling_params.py +0 -0
  48. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/tracker.py +0 -0
  49. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/json.py +0 -0
  50. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/logprobs.py +0 -0
  51. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/validation.py +0 -0
  52. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/util/xml.py +0 -0
  53. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  54. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge.egg-info/top_level.txt +0 -0
  55. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_cache.py +0 -0
  56. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_image_models.py +0 -0
  57. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_image_utils.py +0 -0
  58. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_json_utils.py +0 -0
  59. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_sampling_params.py +0 -0
  60. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_translate.py +0 -0
  61. {lm_deluge-0.0.8 → lm_deluge-0.0.12}/tests/test_xml_utils.py +0 -0
{lm_deluge-0.0.8/src/lm_deluge.egg-info → lm_deluge-0.0.12}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lm_deluge
- Version: 0.0.8
+ Version: 0.0.12
  Summary: Python utility for using LLM API models.
  Author-email: Benjamin Anderson <ben@trytaylor.ai>
  Requires-Python: >=3.10
@@ -21,6 +21,7 @@ Requires-Dist: bs4
  Requires-Dist: lxml
  Requires-Dist: pdf2image
  Requires-Dist: pillow
+ Requires-Dist: fastmcp>=2.4
  Requires-Dist: fasttext-wheel
  Requires-Dist: fasttext-langdetect
  Dynamic: license-file
@@ -32,6 +33,8 @@ Dynamic: license-file
  - **Unified client** – Send prompts to all relevant models with a single client.
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
+ - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from Python functions.
+ - **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
  - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
  - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
  - **Sync and async APIs** – Use the client from sync or async code.
@@ -44,7 +47,7 @@ Dynamic: license-file
  pip install lm-deluge
  ```

- The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

  ## Quickstart

@@ -60,13 +63,13 @@ print(resp[0].completion)

  ## Spraying Across Models

- To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+ To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

  ```python
  from lm_deluge import LLMClient

  client = LLMClient.basic(
-     ["gpt-4o-mini", "claude-haiku-anthropic"],
+     ["gpt-4o-mini", "claude-3-haiku"],
      max_requests_per_minute=10_000
  )
  resps = client.process_prompts_sync(
@@ -81,7 +84,7 @@ API calls can be customized in a few ways.

  1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
  2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
- 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

  Putting it all together:

@@ -120,11 +123,97 @@ resps = client.process_prompts_sync([prompt])

  This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

- ## Caching
+ See a full multi-turn chat example in `examples/multiturn.md`.

- `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+ ## Tool Use

- **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+ Define tools from Python functions and use them with any model:
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ def get_weather(city: str) -> str:
+     return f"The weather in {city} is sunny and 72°F"
+
+ tool = Tool.from_function(get_weather)
+ client = LLMClient.basic("claude-3-haiku")
+ resps = client.process_prompts_sync(
+     ["What's the weather in Paris?"],
+     tools=[tool]
+ )
+
+ # you can iterate over the tool calls in the response automatically
+ for tool_call in resps[0].tool_calls:
+     print(tool_call.name, tool_call.arguments)
+ ```
+
+ You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ # Connect to a local MCP server and get all of its tools
+ filesystem_tools = Tool.from_mcp(
+     "filesystem",
+     command="npx",
+     args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+ )
+
+ # or load ALL the tools from a Claude Desktop-style config
+ config = {
+     "mcpServers": {
+         "exa": {
+             "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+         },
+         "zapier": {
+             "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+         }
+     }
+ }
+ all_tools = Tool.from_mcp_config(config)
+
+ # let the model use the tools
+ client = LLMClient.basic("gpt-4o-mini")
+ resps = client.process_prompts_sync(
+     ["List the files in the current directory"],
+     tools=all_tools
+ )
+
+ # call the tools
+ for tool_call in resps[0].tool_calls:
+     # this is dumb sorry will make it better
+     tool_to_call = [x for x in all_tools if x.name == tool_call.name][0]
+     tool_to_call.call(**tool_call.arguments) # in async code, use .acall()
+ ```
+
+ ### Prompt Caching (Anthropic)
+
+ For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
+
+ ```python
+ from lm_deluge import LLMClient, Conversation, Message
+
+ # Create a conversation with system message
+ conv = (
+     Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+     .add(Message.user("How do I use asyncio.gather?"))
+ )
+
+ # Use prompt caching to cache system message and tools
+ client = LLMClient.basic("claude-3-5-sonnet")
+ resps = client.process_prompts_sync(
+     [conv],
+     cache="system_and_tools" # Cache system message and any tools
+ )
+ ```
+
+ Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+ ## Local Caching
+
+ Besides caching from model providers (which provide cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite, and custom dictionary-based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

  ## Asynchronous Client
  Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
{lm_deluge-0.0.8 → lm_deluge-0.0.12}/README.md

@@ -5,6 +5,8 @@
  - **Unified client** – Send prompts to all relevant models with a single client.
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
+ - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from Python functions.
+ - **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
  - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
  - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
  - **Sync and async APIs** – Use the client from sync or async code.
@@ -17,7 +19,7 @@
  pip install lm-deluge
  ```

- The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

  ## Quickstart

@@ -33,13 +35,13 @@ print(resp[0].completion)

  ## Spraying Across Models

- To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+ To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

  ```python
  from lm_deluge import LLMClient

  client = LLMClient.basic(
-     ["gpt-4o-mini", "claude-haiku-anthropic"],
+     ["gpt-4o-mini", "claude-3-haiku"],
      max_requests_per_minute=10_000
  )
  resps = client.process_prompts_sync(
@@ -54,7 +56,7 @@ API calls can be customized in a few ways.

  1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
  2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
- 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

  Putting it all together:

@@ -93,11 +95,97 @@ resps = client.process_prompts_sync([prompt])

  This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

- ## Caching
+ See a full multi-turn chat example in `examples/multiturn.md`.

- `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+ ## Tool Use

- **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+ Define tools from Python functions and use them with any model:
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ def get_weather(city: str) -> str:
+     return f"The weather in {city} is sunny and 72°F"
+
+ tool = Tool.from_function(get_weather)
+ client = LLMClient.basic("claude-3-haiku")
+ resps = client.process_prompts_sync(
+     ["What's the weather in Paris?"],
+     tools=[tool]
+ )
+
+ # you can iterate over the tool calls in the response automatically
+ for tool_call in resps[0].tool_calls:
+     print(tool_call.name, tool_call.arguments)
+ ```
+
+ You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ # Connect to a local MCP server and get all of its tools
+ filesystem_tools = Tool.from_mcp(
+     "filesystem",
+     command="npx",
+     args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+ )
+
+ # or load ALL the tools from a Claude Desktop-style config
+ config = {
+     "mcpServers": {
+         "exa": {
+             "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+         },
+         "zapier": {
+             "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+         }
+     }
+ }
+ all_tools = Tool.from_mcp_config(config)
+
+ # let the model use the tools
+ client = LLMClient.basic("gpt-4o-mini")
+ resps = client.process_prompts_sync(
+     ["List the files in the current directory"],
+     tools=all_tools
+ )
+
+ # call the tools
+ for tool_call in resps[0].tool_calls:
+     # this is dumb sorry will make it better
+     tool_to_call = [x for x in all_tools if x.name == tool_call.name][0]
+     tool_to_call.call(**tool_call.arguments) # in async code, use .acall()
+ ```
+
+ ### Prompt Caching (Anthropic)
+
+ For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple "cache" argument to `process_prompts_sync` or `process_prompts_async`:
+
+ ```python
+ from lm_deluge import LLMClient, Conversation, Message
+
+ # Create a conversation with system message
+ conv = (
+     Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+     .add(Message.user("How do I use asyncio.gather?"))
+ )
+
+ # Use prompt caching to cache system message and tools
+ client = LLMClient.basic("claude-3-5-sonnet")
+ resps = client.process_prompts_sync(
+     [conv],
+     cache="system_and_tools" # Cache system message and any tools
+ )
+ ```
+
+ Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+ ## Local Caching
+
+ Besides caching from model providers (which provide cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite, and custom dictionary-based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

  ## Asynchronous Client
  Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
{lm_deluge-0.0.8 → lm_deluge-0.0.12}/pyproject.toml

@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]

  [project]
  name = "lm_deluge"
- version = "0.0.8"
+ version = "0.0.12"
  authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
  description = "Python utility for using LLM API models."
  readme = "README.md"
@@ -27,6 +27,7 @@ dependencies = [
      "lxml",
      "pdf2image",
      "pillow",
+     "fastmcp>=2.4",
      "fasttext-wheel",
      "fasttext-langdetect",
  ]
{lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/anthropic.py

@@ -6,7 +6,15 @@ import warnings
  from tqdm import tqdm
  from typing import Callable

- from lm_deluge.prompt import Conversation
+ from lm_deluge.prompt import (
+     Conversation,
+     Message,
+     Text,
+     ToolCall,
+     Thinking,
+     CachePattern,
+ )
+ from lm_deluge.usage import Usage
  from .base import APIRequestBase, APIResponse

  from ..tracker import StatusTracker
@@ -34,6 +42,8 @@ class AnthropicRequest(APIRequestBase):
          # for retries
          all_model_names: list[str] | None = None,
          all_sampling_params: list[SamplingParams] | None = None,
+         tools: list | None = None,
+         cache: CachePattern | None = None,
      ):
          super().__init__(
              task_id=task_id,
@@ -50,11 +60,17 @@ class AnthropicRequest(APIRequestBase):
              debug=debug,
              all_model_names=all_model_names,
              all_sampling_params=all_sampling_params,
+             tools=tools,
+             cache=cache,
          )
          self.model = APIModel.from_registry(model_name)
         self.url = f"{self.model.api_base}/messages"

-         self.system_message, messages = prompt.to_anthropic()
+         # Lock images as bytes if caching is enabled
+         if cache is not None:
+             prompt.lock_images_as_bytes()
+
+         self.system_message, messages = prompt.to_anthropic(cache_pattern=cache)
          self.request_header = {
              "x-api-key": os.getenv(self.model.api_key_env_var),
              "anthropic-version": "2023-06-01",
@@ -94,14 +110,19 @@ class AnthropicRequest(APIRequestBase):
          )
          if self.system_message is not None:
              self.request_json["system"] = self.system_message
+         if tools:
+             tool_definitions = [tool.dump_for("anthropic") for tool in tools]
+             # Add cache control to last tool if tools_only caching is specified
+             if cache == "tools_only" and tool_definitions:
+                 tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
+             self.request_json["tools"] = tool_definitions

      async def handle_response(self, http_response: ClientResponse) -> APIResponse:
          is_error = False
          error_message = None
          thinking = None
-         completion = None
-         input_tokens = None
-         output_tokens = None
+         content = None
+         usage = None
          status_code = http_response.status
          mimetype = http_response.headers.get("Content-Type", None)
          rate_limits = {}
@@ -119,16 +140,27 @@ class AnthropicRequest(APIRequestBase):
          if status_code >= 200 and status_code < 300:
              try:
                  data = await http_response.json()
-                 content = data["content"]  # [0]["text"]
-                 for item in content:
+                 response_content = data["content"]
+
+                 # Parse response into Message with parts
+                 parts = []
+                 for item in response_content:
                      if item["type"] == "text":
-                         completion = item["text"]
+                         parts.append(Text(item["text"]))
                      elif item["type"] == "thinking":
                          thinking = item["thinking"]
+                         parts.append(Thinking(item["thinking"]))
                      elif item["type"] == "tool_use":
-                         continue  # TODO: implement and report tool use
-                 input_tokens = data["usage"]["input_tokens"]
-                 output_tokens = data["usage"]["output_tokens"]
+                         parts.append(
+                             ToolCall(
+                                 id=item["id"],
+                                 name=item["name"],
+                                 arguments=item["input"],
+                             )
+                         )
+
+                 content = Message("assistant", parts)
+                 usage = Usage.from_anthropic_usage(data["usage"])
              except Exception as e:
                  is_error = True
                  error_message = (
@@ -162,10 +194,9 @@ class AnthropicRequest(APIRequestBase):
              is_error=is_error,
              error_message=error_message,
              prompt=self.prompt,
-             completion=completion,
+             content=content,
              thinking=thinking,
              model_internal=self.model_name,
              sampling_params=self.sampling_params,
-             input_tokens=input_tokens,
-             output_tokens=output_tokens,
+             usage=usage,
          )
{lm_deluge-0.0.8 → lm_deluge-0.0.12}/src/lm_deluge/api_requests/base.py

@@ -7,7 +7,8 @@ from dataclasses import dataclass
  from abc import ABC, abstractmethod
  from typing import Callable

- from lm_deluge.prompt import Conversation
+ from lm_deluge.prompt import Conversation, Message, CachePattern
+ from lm_deluge.usage import Usage

  from ..tracker import StatusTracker
  from ..sampling_params import SamplingParams
@@ -29,10 +30,11 @@ class APIResponse:
      is_error: bool | None
      error_message: str | None

-     # completion information
-     completion: str | None
-     input_tokens: int | None
-     output_tokens: int | None
+     # completion information - unified usage tracking
+     usage: Usage | None = None
+
+     # response content - structured format
+     content: Message | None = None

      # optional or calculated automatically
      thinking: str | None = None  # if model shows thinking tokens
@@ -47,6 +49,33 @@ class APIResponse:
      # set to true if should NOT retry with the same model (unrecoverable error)
      give_up_if_no_other_models: bool | None = False

+     @property
+     def completion(self) -> str | None:
+         """Backward compatibility: extract text from content Message."""
+         if self.content is not None:
+             return self.content.completion
+         return None
+
+     @property
+     def input_tokens(self) -> int | None:
+         """Get input tokens from usage object."""
+         return self.usage.input_tokens if self.usage else None
+
+     @property
+     def output_tokens(self) -> int | None:
+         """Get output tokens from usage object."""
+         return self.usage.output_tokens if self.usage else None
+
+     @property
+     def cache_read_tokens(self) -> int | None:
+         """Get cache read tokens from usage object."""
+         return self.usage.cache_read_tokens if self.usage else None
+
+     @property
+     def cache_write_tokens(self) -> int | None:
+         """Get cache write tokens from usage object."""
+         return self.usage.cache_write_tokens if self.usage else None
+
      def __post_init__(self):
          # calculate cost & get external model name
          self.id = int(self.id)
@@ -54,16 +83,15 @@ class APIResponse:
          self.model_external = api_model.name
          self.cost = None
          if (
-             self.input_tokens is not None
-             and self.output_tokens is not None
+             self.usage is not None
              and api_model.input_cost is not None
              and api_model.output_cost is not None
          ):
              self.cost = (
-                 self.input_tokens * api_model.input_cost / 1e6
-                 + self.output_tokens * api_model.output_cost / 1e6
+                 self.usage.input_tokens * api_model.input_cost / 1e6
+                 + self.usage.output_tokens * api_model.output_cost / 1e6
              )
-         elif self.completion is not None:
+         elif self.content is not None and self.completion is not None:
              print(
                  f"Warning: Completion provided without token counts for model {self.model_internal}."
              )
@@ -79,30 +107,45 @@ class APIResponse:
              "status_code": self.status_code,
              "is_error": self.is_error,
              "error_message": self.error_message,
-             "completion": self.completion,
-             "input_tokens": self.input_tokens,
-             "output_tokens": self.output_tokens,
+             "completion": self.completion,  # computed property
+             "content": self.content.to_log() if self.content else None,
+             "usage": self.usage.to_dict() if self.usage else None,
              "finish_reason": self.finish_reason,
              "cost": self.cost,
          }

      @classmethod
      def from_dict(cls, data: dict):
+         # Handle backward compatibility for content/completion
+         content = None
+         if "content" in data and data["content"] is not None:
+             # Reconstruct message from log format
+             content = Message.from_log(data["content"])
+         elif "completion" in data and data["completion"] is not None:
+             # Backward compatibility: create a Message with just text
+             content = Message.ai(data["completion"])
+
+         usage = None
+         if "usage" in data and data["usage"] is not None:
+             usage = Usage.from_dict(data["usage"])
+
          return cls(
              id=data.get("id", random.randint(0, 1_000_000_000)),
              model_internal=data["model_internal"],
-             model_external=data["model_external"],
-             region=data["region"],
              prompt=Conversation.from_log(data["prompt"]),
              sampling_params=SamplingParams(**data["sampling_params"]),
              status_code=data["status_code"],
              is_error=data["is_error"],
              error_message=data["error_message"],
-             input_tokens=data["input_tokens"],
-             output_tokens=data["output_tokens"],
-             completion=data["completion"],
-             finish_reason=data["finish_reason"],
-             cost=data["cost"],
+             usage=usage,
+             content=content,
+             thinking=data.get("thinking"),
+             model_external=data.get("model_external"),
+             region=data.get("region"),
+             logprobs=data.get("logprobs"),
+             finish_reason=data.get("finish_reason"),
+             cost=data.get("cost"),
+             cache_hit=data.get("cache_hit", False),
          )

      def write_to_file(self, filename):
@@ -145,6 +188,8 @@ class APIRequestBase(ABC):
          debug: bool = False,
          all_model_names: list[str] | None = None,
          all_sampling_params: list[SamplingParams] | None = None,
+         tools: list | None = None,
+         cache: CachePattern | None = None,
      ):
          if all_model_names is None:
              raise ValueError("all_model_names must be provided.")
@@ -166,6 +211,8 @@ class APIRequestBase(ABC):
          self.debug = debug
          self.all_model_names = all_model_names
          self.all_sampling_params = all_sampling_params
+         self.tools = tools
+         self.cache: CachePattern | None = cache
          self.result = []  # list of APIResponse objects from each attempt

          # these should be set in the __init__ of the subclass
@@ -255,6 +302,8 @@ class APIRequestBase(ABC):
              callback=self.callback,
              all_model_names=self.all_model_names,
              all_sampling_params=self.all_sampling_params,
+             tools=self.tools,
+             cache=self.cache,
          )
          # PROBLEM: new request is never put into results array, so we can't get the result.
          self.retry_queue.put_nowait(new_request)
@@ -297,9 +346,8 @@ class APIRequestBase(ABC):
                  status_code=None,
                  is_error=True,
                  error_message="Request timed out (terminated by client).",
-                 completion=None,
-                 input_tokens=None,
-                 output_tokens=None,
+                 content=None,
+                 usage=None,
              )
          )
          self.handle_error(create_new_request=False)
@@ -315,9 +363,8 @@ class APIRequestBase(ABC):
                  status_code=None,
                  is_error=True,
                  error_message=f"Unexpected {type(e).__name__}: {str(e) or 'No message.'}",
-                 completion=None,
-                 input_tokens=None,
-                 output_tokens=None,
+                 content=None,
+                 usage=None,
              )
          )
          # maybe consider making True?
@@ -344,6 +391,8 @@ def create_api_request(
      callback: Callable | None = None,
      all_model_names: list[str] | None = None,
      all_sampling_params: list[SamplingParams] | None = None,
+     tools: list | None = None,
+     cache: CachePattern | None = None,
  ) -> APIRequestBase:
      from .common import CLASSES  # circular import so made it lazy, does this work?

@@ -368,5 +417,7 @@ def create_api_request(
          callback=callback,
          all_model_names=all_model_names,
          all_sampling_params=all_sampling_params,
+         tools=tools,
+         cache=cache,
          **kwargs,
      )