lm-deluge 0.0.9__tar.gz → 0.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (79)
  1. {lm_deluge-0.0.9/src/lm_deluge.egg-info → lm_deluge-0.0.13}/PKG-INFO +101 -12
  2. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/README.md +98 -9
  3. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/pyproject.toml +3 -3
  4. lm_deluge-0.0.13/src/lm_deluge/__init__.py +15 -0
  5. lm_deluge-0.0.13/src/lm_deluge/agent.py +0 -0
  6. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/anthropic.py +107 -60
  7. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/base.py +107 -54
  8. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/bedrock.py +59 -22
  9. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/common.py +2 -1
  10. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/mistral.py +21 -22
  11. lm_deluge-0.0.13/src/lm_deluge/api_requests/openai.py +415 -0
  12. lm_deluge-0.0.13/src/lm_deluge/batches.py +498 -0
  13. lm_deluge-0.0.13/src/lm_deluge/client.py +501 -0
  14. lm_deluge-0.0.13/src/lm_deluge/computer_use/anthropic_tools.py +75 -0
  15. lm_deluge-0.0.9/src/lm_deluge/sampling_params.py → lm_deluge-0.0.13/src/lm_deluge/config.py +10 -3
  16. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/embed.py +17 -11
  17. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/models.py +78 -33
  18. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/prompt.py +173 -7
  19. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/rerank.py +18 -12
  20. lm_deluge-0.0.13/src/lm_deluge/tool.py +290 -0
  21. lm_deluge-0.0.13/src/lm_deluge/tracker.py +253 -0
  22. lm_deluge-0.0.13/src/lm_deluge/usage.py +114 -0
  23. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/util/json.py +18 -1
  24. {lm_deluge-0.0.9 → lm_deluge-0.0.13/src/lm_deluge.egg-info}/PKG-INFO +101 -12
  25. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge.egg-info/SOURCES.txt +22 -1
  26. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge.egg-info/requires.txt +2 -2
  27. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_all_models.py +7 -7
  28. lm_deluge-0.0.13/tests/test_batch_real.py +95 -0
  29. lm_deluge-0.0.13/tests/test_bedrock_computer_use.py +378 -0
  30. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_bedrock_models.py +19 -66
  31. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_cache.py +5 -4
  32. lm_deluge-0.0.13/tests/test_client_tracker_integration.py +43 -0
  33. lm_deluge-0.0.13/tests/test_computer_use.py +103 -0
  34. lm_deluge-0.0.13/tests/test_computer_use_integration.py +277 -0
  35. lm_deluge-0.0.13/tests/test_debug_format.py +47 -0
  36. lm_deluge-0.0.13/tests/test_logprobs_refactor.py +306 -0
  37. lm_deluge-0.0.13/tests/test_max_concurrent_requests.py +38 -0
  38. lm_deluge-0.0.13/tests/test_mcp_tools.py +221 -0
  39. lm_deluge-0.0.13/tests/test_openai_responses.py +356 -0
  40. lm_deluge-0.0.13/tests/test_prompt_caching.py +257 -0
  41. lm_deluge-0.0.13/tests/test_real_caching.py +305 -0
  42. lm_deluge-0.0.13/tests/test_real_caching_bedrock.py +307 -0
  43. lm_deluge-0.0.13/tests/test_rich_display.py +114 -0
  44. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_tool_calls.py +3 -3
  45. lm_deluge-0.0.13/tests/test_tool_from_function.py +150 -0
  46. lm_deluge-0.0.13/tests/test_tool_validation.py +36 -0
  47. lm_deluge-0.0.13/tests/test_tracker_refactor.py +99 -0
  48. lm_deluge-0.0.9/src/lm_deluge/__init__.py +0 -7
  49. lm_deluge-0.0.9/src/lm_deluge/api_requests/openai.py +0 -183
  50. lm_deluge-0.0.9/src/lm_deluge/client.py +0 -762
  51. lm_deluge-0.0.9/src/lm_deluge/tool.py +0 -87
  52. lm_deluge-0.0.9/src/lm_deluge/tracker.py +0 -43
  53. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/LICENSE +0 -0
  54. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/setup.cfg +0 -0
  55. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/__init__.py +0 -0
  56. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  57. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  58. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  59. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  60. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  61. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/cache.py +0 -0
  62. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/errors.py +0 -0
  63. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/gemini_limits.py +0 -0
  64. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/image.py +0 -0
  65. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/llm_tools/__init__.py +0 -0
  66. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/llm_tools/extract.py +0 -0
  67. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/llm_tools/score.py +0 -0
  68. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/llm_tools/translate.py +0 -0
  69. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/util/logprobs.py +0 -0
  70. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/util/validation.py +0 -0
  71. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge/util/xml.py +0 -0
  72. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  73. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/src/lm_deluge.egg-info/top_level.txt +0 -0
  74. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_image_models.py +0 -0
  75. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_image_utils.py +0 -0
  76. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_json_utils.py +0 -0
  77. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_sampling_params.py +0 -0
  78. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_translate.py +0 -0
  79. {lm_deluge-0.0.9 → lm_deluge-0.0.13}/tests/test_xml_utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lm_deluge
- Version: 0.0.9
+ Version: 0.0.13
  Summary: Python utility for using LLM API models.
  Author-email: Benjamin Anderson <ben@trytaylor.ai>
  Requires-Python: >=3.10
@@ -21,8 +21,8 @@ Requires-Dist: bs4
  Requires-Dist: lxml
  Requires-Dist: pdf2image
  Requires-Dist: pillow
- Requires-Dist: fasttext-wheel
- Requires-Dist: fasttext-langdetect
+ Requires-Dist: fastmcp>=2.4
+ Requires-Dist: rich
  Dynamic: license-file

  # lm-deluge
@@ -32,6 +32,9 @@ Dynamic: license-file
  - **Unified client** – Send prompts to all relevant models with a single client.
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
+ - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from Python functions.
+ - **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
+ - **Computer Use** – We support Claude Computer Use via the `computer_use` argument to `process_prompts_sync`/`async`. It works with Anthropic's API; Bedrock's API currently rejects the tool definitions, but in principle it will work there too once Bedrock sorts that out.
  - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
  - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
  - **Sync and async APIs** – Use the client from sync or async code.
@@ -44,7 +47,7 @@ Dynamic: license-file
  pip install lm-deluge
  ```

- The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

  ## Quickstart

@@ -60,13 +63,13 @@ print(resp[0].completion)

  ## Spraying Across Models

- To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+ To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

  ```python
  from lm_deluge import LLMClient

  client = LLMClient.basic(
-     ["gpt-4o-mini", "claude-haiku-anthropic"],
+     ["gpt-4o-mini", "claude-3-haiku"],
      max_requests_per_minute=10_000
  )
  resps = client.process_prompts_sync(
@@ -81,7 +84,7 @@ API calls can be customized in a few ways.

  1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
  2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
- 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

  Putting it all together:

@@ -120,11 +123,97 @@ resps = client.process_prompts_sync([prompt])

  This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

- ## Caching
+ See a full multi-turn chat example in `examples/multiturn.md`.

- `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+ ## Tool Use

- **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+ Define tools from Python functions and use them with any model:
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ def get_weather(city: str) -> str:
+     return f"The weather in {city} is sunny and 72°F"
+
+ tool = Tool.from_function(get_weather)
+ client = LLMClient.basic("claude-3-haiku")
+ resps = client.process_prompts_sync(
+     ["What's the weather in Paris?"],
+     tools=[tool]
+ )
+
+ # you can iterate over the tool calls in the response automatically
+ for tool_call in resps[0].tool_calls:
+     print(tool_call.name, tool_call.arguments)
+ ```
+
+ You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
+
+ ```python
+ import os
+
+ from lm_deluge import LLMClient, Tool
+
+ # Connect to a local MCP server and get all of its tools
+ filesystem_tools = Tool.from_mcp(
+     "filesystem",
+     command="npx",
+     args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+ )
+
+ # or load ALL the tools from a Claude Desktop-style config
+ config = {
+     "mcpServers": {
+         "exa": {
+             "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+         },
+         "zapier": {
+             "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+         }
+     }
+ }
+ all_tools = Tool.from_mcp_config(config)
+
+ # let the model use the tools
+ tools = filesystem_tools + all_tools
+ client = LLMClient.basic("gpt-4o-mini")
+ resps = client.process_prompts_sync(
+     ["List the files in the current directory"],
+     tools=tools
+ )
+
+ # call the tools
+ for tool_call in resps[0].tool_calls:
+     # look up the matching Tool by name
+     tool_to_call = [x for x in tools if x.name == tool_call.name][0]
+     tool_to_call.call(**tool_call.arguments)  # in async code, use .acall()
+ ```
+
+ ### Prompt Caching (Anthropic)
+
+ For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple `cache` argument to `process_prompts_sync` or `process_prompts_async`:
+
+ ```python
+ from lm_deluge import LLMClient, Conversation, Message
+
+ # Create a conversation with a system message
+ conv = (
+     Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+     .add(Message.user("How do I use asyncio.gather?"))
+ )
+
+ # Use prompt caching to cache system message and tools
+ client = LLMClient.basic("claude-3-5-sonnet")
+ resps = client.process_prompts_sync(
+     [conv],
+     cache="system_and_tools"  # Cache system message and any tools
+ )
+ ```
+
+ Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+ ## Local Caching
+
+ Besides caching from model providers (which provides cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary-based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

  ## Asynchronous Client
  Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
@@ -144,11 +233,11 @@ asyncio.run(main())

  ## Available Models

- We support all models in `src/lm_deluge/models.py`. An older version of this client supported Bedrock and Vertex. We plan to re-implement Bedrock support (our previous support was spotty and we need to figure out cross-region inference in order to support the newest Claude models). Vertex support is not currently planned, since Google allows you to connect your Vertex account to AI Studio, and Vertex authentication is a huge pain (requires service account credentials, etc.)
+ We support all models in `src/lm_deluge/models.py`. Vertex support is not planned in the short term, since Google allows you to connect your Vertex account to AI Studio, and Vertex authentication is a huge pain (requires service account credentials, etc.).

  ## Feature Support

- We support structured outputs via `json_mode` parameter provided to `SamplingParams`. Structured outputs with a schema are planned. Reasoning models are supported via the `reasoning_effort` parameter, which is translated to a thinking budget for Claude/Gemini. Image models are supported. We don't support tool use yet, but support is planned (keep an eye out for a unified tool definition spec that works for all models!). We support logprobs for OpenAI models that return them via the `logprobs` argument to the `LLMClient`.
+ We support structured outputs via the `json_mode` parameter provided to `SamplingParams`. Structured outputs with a schema are planned. Reasoning models are supported via the `reasoning_effort` parameter, which is translated to a thinking budget for Claude/Gemini. Image models are supported. We support tool use as documented above. We support logprobs for OpenAI models that return them.

  ## Built‑in tools

@@ -5,6 +5,9 @@
  - **Unified client** – Send prompts to all relevant models with a single client.
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
+ - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from Python functions.
+ - **MCP Support** – Instantiate a `Tool` from a local or remote MCP server so that any LLM can use it, whether or not that provider natively supports MCP.
+ - **Computer Use** – We support Claude Computer Use via the `computer_use` argument to `process_prompts_sync`/`async`. It works with Anthropic's API; Bedrock's API currently rejects the tool definitions, but in principle it will work there too once Bedrock sorts that out.
  - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
  - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
  - **Sync and async APIs** – Use the client from sync or async code.
@@ -17,7 +20,7 @@
  pip install lm-deluge
  ```

- The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

  ## Quickstart

@@ -33,13 +36,13 @@ print(resp[0].completion)

  ## Spraying Across Models

- To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
+ To distribute your requests across models, just provide a list of more than one model to the constructor. See all available models in `models.py`. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:

  ```python
  from lm_deluge import LLMClient

  client = LLMClient.basic(
-     ["gpt-4o-mini", "claude-haiku-anthropic"],
+     ["gpt-4o-mini", "claude-3-haiku"],
      max_requests_per_minute=10_000
  )
  resps = client.process_prompts_sync(
@@ -54,7 +57,7 @@ API calls can be customized in a few ways.

  1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
  2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
- 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.

  Putting it all together:
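A hedged sketch of how these three levels might combine in practice follows. This is not the README's own example (which falls outside this hunk); keyword names flagged as assumed are illustrative guesses, not verified parts of the package API.

```python
from lm_deluge import LLMClient

# Client-level settings: models, rate limits, and sampling arguments passed to .basic()
client = LLMClient.basic(
    ["gpt-4o-mini", "claude-3-haiku"],
    max_requests_per_minute=5_000,
    max_tokens_per_minute=500_000,
    temperature=0.2,      # sampling args accepted directly by .basic(), per the prose above
    max_new_tokens=512,
)

# Per-call settings: progress display and tools
resps = client.process_prompts_sync(
    ["Explain asyncio.gather in one paragraph."],
    show_progress=True,   # flag name assumed; the README only says progress display is per-call
    tools=None,
)
print(resps[0].completion)
```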
 
@@ -93,11 +96,97 @@ resps = client.process_prompts_sync([prompt])

  This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.

- ## Caching
+ See a full multi-turn chat example in `examples/multiturn.md`.

- `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+ ## Tool Use

- **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
+ Define tools from Python functions and use them with any model:
+
+ ```python
+ from lm_deluge import LLMClient, Tool
+
+ def get_weather(city: str) -> str:
+     return f"The weather in {city} is sunny and 72°F"
+
+ tool = Tool.from_function(get_weather)
+ client = LLMClient.basic("claude-3-haiku")
+ resps = client.process_prompts_sync(
+     ["What's the weather in Paris?"],
+     tools=[tool]
+ )
+
+ # you can iterate over the tool calls in the response automatically
+ for tool_call in resps[0].tool_calls:
+     print(tool_call.name, tool_call.arguments)
+ ```
+
+ You can also automatically instantiate tools from MCP servers. Under the hood, the constructor connects to the server, asks it what tools it has, and then creates a `Tool` from each of them, *with a built-in `call` and `acall` interface*.
+
+ ```python
+ import os
+
+ from lm_deluge import LLMClient, Tool
+
+ # Connect to a local MCP server and get all of its tools
+ filesystem_tools = Tool.from_mcp(
+     "filesystem",
+     command="npx",
+     args=["-y", "@modelcontextprotocol/server-filesystem", "/path/to/directory"]
+ )
+
+ # or load ALL the tools from a Claude Desktop-style config
+ config = {
+     "mcpServers": {
+         "exa": {
+             "url": f"https://mcp.exa.ai/mcp?exaApiKey={os.getenv('EXA_API_KEY')}"
+         },
+         "zapier": {
+             "url": f"https://mcp.zapier.com/api/mcp/s/{os.getenv('ZAPIER_MCP_SECRET')}/mcp"
+         }
+     }
+ }
+ all_tools = Tool.from_mcp_config(config)
+
+ # let the model use the tools
+ tools = filesystem_tools + all_tools
+ client = LLMClient.basic("gpt-4o-mini")
+ resps = client.process_prompts_sync(
+     ["List the files in the current directory"],
+     tools=tools
+ )
+
+ # call the tools
+ for tool_call in resps[0].tool_calls:
+     # look up the matching Tool by name
+     tool_to_call = [x for x in tools if x.name == tool_call.name][0]
+     tool_to_call.call(**tool_call.arguments)  # in async code, use .acall()
+ ```
+
+ ### Prompt Caching (Anthropic)
+
+ For Anthropic models, you can use prompt caching to reduce costs and latency for repeated context. This uses Anthropic's server-side prompt caching. Other providers like OpenAI and Google do this automatically, but Anthropic requires you to manually set cache-control on messages. You can do this in lm-deluge with a simple `cache` argument to `process_prompts_sync` or `process_prompts_async`:
+
+ ```python
+ from lm_deluge import LLMClient, Conversation, Message
+
+ # Create a conversation with a system message
+ conv = (
+     Conversation.system("You are an expert Python developer with deep knowledge of async programming.")
+     .add(Message.user("How do I use asyncio.gather?"))
+ )
+
+ # Use prompt caching to cache system message and tools
+ client = LLMClient.basic("claude-3-5-sonnet")
+ resps = client.process_prompts_sync(
+     [conv],
+     cache="system_and_tools"  # Cache system message and any tools
+ )
+ ```
+
+ Available cache patterns: `"system_and_tools"`, `"tools_only"`, `"last_user_message"`, `"last_2_user_messages"`, `"last_3_user_messages"`.
+
+ ## Local Caching
+
+ Besides caching from model providers (which provides cache reads at a discount, but not for free), `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary-based caches to cache prompts locally. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
+
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The local cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).

  ## Asynchronous Client
  Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
@@ -117,11 +206,11 @@ asyncio.run(main())

  ## Available Models

- We support all models in `src/lm_deluge/models.py`. An older version of this client supported Bedrock and Vertex. We plan to re-implement Bedrock support (our previous support was spotty and we need to figure out cross-region inference in order to support the newest Claude models). Vertex support is not currently planned, since Google allows you to connect your Vertex account to AI Studio, and Vertex authentication is a huge pain (requires service account credentials, etc.)
+ We support all models in `src/lm_deluge/models.py`. Vertex support is not planned in the short term, since Google allows you to connect your Vertex account to AI Studio, and Vertex authentication is a huge pain (requires service account credentials, etc.).

  ## Feature Support

- We support structured outputs via `json_mode` parameter provided to `SamplingParams`. Structured outputs with a schema are planned. Reasoning models are supported via the `reasoning_effort` parameter, which is translated to a thinking budget for Claude/Gemini. Image models are supported. We don't support tool use yet, but support is planned (keep an eye out for a unified tool definition spec that works for all models!). We support logprobs for OpenAI models that return them via the `logprobs` argument to the `LLMClient`.
+ We support structured outputs via the `json_mode` parameter provided to `SamplingParams`. Structured outputs with a schema are planned. Reasoning models are supported via the `reasoning_effort` parameter, which is translated to a thinking budget for Claude/Gemini. Image models are supported. We support tool use as documented above. We support logprobs for OpenAI models that return them.

  ## Built‑in tools
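As a minimal sketch of the Local Caching flow described in the README diff above, assuming a SQLite-backed cache class exposed as `lm_deluge.cache.SqliteCache` (the class name and constructor here are guesses; the README only names the available backends):

```python
from lm_deluge import LLMClient
from lm_deluge.cache import SqliteCache  # NOTE: hypothetical class name, not verified

cache = SqliteCache("completions.db")    # hypothetical constructor
client = LLMClient("gpt-4o-mini", cache=cache)

# First invocation hits the API and stores the completion locally.
resps = client.process_prompts_sync(["Summarize Hamlet in one sentence."])

# A *later* call with the same prompt is served from the local cache;
# hits apply across calls, not within a single batch (see the IMPORTANT note above).
resps_again = client.process_prompts_sync(["Summarize Hamlet in one sentence."])
```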
 
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]

  [project]
  name = "lm_deluge"
- version = "0.0.9"
+ version = "0.0.13"
  authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
  description = "Python utility for using LLM API models."
  readme = "README.md"
@@ -27,6 +27,6 @@ dependencies = [
      "lxml",
      "pdf2image",
      "pillow",
-     "fasttext-wheel",
-     "fasttext-langdetect",
+     "fastmcp>=2.4",
+     "rich"
  ]
@@ -0,0 +1,15 @@
+ from .client import LLMClient, SamplingParams, APIResponse
+ from .prompt import Conversation, Message
+ from .tool import Tool
+ import dotenv
+
+ dotenv.load_dotenv()
+
+ __all__ = [
+     "LLMClient",
+     "SamplingParams",
+     "APIResponse",
+     "Conversation",
+     "Message",
+     "Tool",
+ ]
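The new top-level `__init__.py` re-exports the package's main entry points and loads `.env` on import; a small usage sketch that combines them, based on the README examples earlier in this diff:

```python
from lm_deluge import LLMClient, Conversation, Message

# Build a conversation with the exported message classes, then send it with the client.
conv = Conversation.system("You are terse.").add(Message.user("Name three prime numbers."))
client = LLMClient.basic("gpt-4o-mini")  # API keys come from the environment / .env, loaded on import
resps = client.process_prompts_sync([conv])
print(resps[0].completion)
```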
@@ -1,17 +1,94 @@
- import asyncio
  from aiohttp import ClientResponse
  import json
  import os
- import warnings
- from tqdm import tqdm
  from typing import Callable

- from lm_deluge.prompt import Conversation, Message, Text, ToolCall, Thinking
+ from lm_deluge.prompt import (
+     Conversation,
+     Message,
+     Text,
+     ToolCall,
+     Thinking,
+     CachePattern,
+ )
+ from lm_deluge.tool import Tool
+ from lm_deluge.usage import Usage
  from .base import APIRequestBase, APIResponse

  from ..tracker import StatusTracker
- from ..sampling_params import SamplingParams
+ from ..config import SamplingParams
  from ..models import APIModel
+ from ..computer_use.anthropic_tools import get_anthropic_cu_tools
+
+
+ def _build_anthropic_request(
+     model: APIModel,
+     prompt: Conversation,
+     tools: list[Tool] | None,
+     sampling_params: SamplingParams,
+     cache_pattern: CachePattern | None = None,
+     computer_use: bool = False,
+     display_width: int = 1024,
+     display_height: int = 768,
+ ):
+     system_message, messages = prompt.to_anthropic(cache_pattern=cache_pattern)
+     request_header = {
+         "x-api-key": os.getenv(model.api_key_env_var),
+         "anthropic-version": "2023-06-01",
+         "content-type": "application/json",
+     }
+
+     # Add beta header for Computer Use
+     if computer_use:
+         request_header["anthropic-beta"] = "computer-use-2025-01-24"
+
+     request_json = {
+         "model": model.name,
+         "messages": messages,
+         "temperature": sampling_params.temperature,
+         "top_p": sampling_params.top_p,
+         "max_tokens": sampling_params.max_new_tokens,
+     }
+
+     # handle thinking
+     if model.reasoning_model and sampling_params.reasoning_effort:
+         # translate reasoning effort of low, medium, high to budget tokens
+         budget = {"low": 1024, "medium": 4096, "high": 16384}.get(
+             sampling_params.reasoning_effort
+         )
+         request_json["thinking"] = {
+             "type": "enabled",
+             "budget_tokens": budget,
+         }
+         request_json.pop("top_p")
+         request_json["temperature"] = 1.0
+         request_json["max_tokens"] += budget
+     else:
+         request_json["thinking"] = {"type": "disabled"}
+         if sampling_params.reasoning_effort:
+             print("ignoring reasoning_effort for non-reasoning model")
+     if system_message is not None:
+         request_json["system"] = system_message
+     if tools or computer_use:
+         tool_definitions = []
+         if tools:
+             tool_definitions.extend([tool.dump_for("anthropic") for tool in tools])
+         # Add Computer Use tools
+         if computer_use:
+             cu_tools = get_anthropic_cu_tools(
+                 model=model.id,
+                 display_width=display_width,  # todo: set from ComputerUseParams
+                 display_height=display_height,
+             )
+             tool_definitions.extend(cu_tools)
+
+         # Add cache control to last tool if tools_only caching is specified
+         if cache_pattern == "tools_only" and tool_definitions:
+             tool_definitions[-1]["cache_control"] = {"type": "ephemeral"}
+
+         request_json["tools"] = tool_definitions
+
+     return request_json, request_header


  class AnthropicRequest(APIRequestBase):
@@ -24,17 +101,19 @@ class AnthropicRequest(APIRequestBase):
          prompt: Conversation,
          attempts_left: int,
          status_tracker: StatusTracker,
-         retry_queue: asyncio.Queue,
          results_arr: list,
          request_timeout: int = 30,
          sampling_params: SamplingParams = SamplingParams(),
-         pbar: tqdm | None = None,
          callback: Callable | None = None,
-         debug: bool = False,
          # for retries
          all_model_names: list[str] | None = None,
          all_sampling_params: list[SamplingParams] | None = None,
          tools: list | None = None,
+         cache: CachePattern | None = None,
+         # Computer Use support
+         computer_use: bool = False,
+         display_width: int = 1024,
+         display_height: int = 768,
      ):
          super().__init__(
              task_id=task_id,
@@ -42,70 +121,42 @@ class AnthropicRequest(APIRequestBase):
              prompt=prompt,
              attempts_left=attempts_left,
              status_tracker=status_tracker,
-             retry_queue=retry_queue,
              results_arr=results_arr,
              request_timeout=request_timeout,
              sampling_params=sampling_params,
-             pbar=pbar,
              callback=callback,
-             debug=debug,
              all_model_names=all_model_names,
              all_sampling_params=all_sampling_params,
              tools=tools,
+             cache=cache,
          )
+         self.computer_use = computer_use
+         self.display_width = display_width
+         self.display_height = display_height
          self.model = APIModel.from_registry(model_name)
          self.url = f"{self.model.api_base}/messages"

-         self.system_message, messages = prompt.to_anthropic()
-         self.request_header = {
-             "x-api-key": os.getenv(self.model.api_key_env_var),
-             "anthropic-version": "2023-06-01",
-             "content-type": "application/json",
-         }
+         # Lock images as bytes if caching is enabled
+         if cache is not None:
+             prompt.lock_images_as_bytes()

-         self.request_json = {
-             "model": self.model.name,
-             "messages": messages,
-             "temperature": self.sampling_params.temperature,
-             "top_p": self.sampling_params.top_p,
-             "max_tokens": self.sampling_params.max_new_tokens,
-         }
-         # handle thinking
-         if self.model.reasoning_model:
-             if sampling_params.reasoning_effort:
-                 # translate reasoning effort of low, medium, high to budget tokens
-                 budget = {"low": 1024, "medium": 4096, "high": 16384}.get(
-                     sampling_params.reasoning_effort
-                 )
-                 self.request_json["thinking"] = {
-                     "type": "enabled",
-                     "budget_tokens": budget,
-                 }
-                 self.request_json.pop("top_p")
-                 self.request_json["temperature"] = 1.0
-                 self.request_json["max_tokens"] += (
-                     budget  # assume max tokens is max completion tokens
-                 )
-             else:
-                 # no thinking
-                 self.request_json["thinking"] = {"type": "disabled"}
-         else:
-             if sampling_params.reasoning_effort:
-                 warnings.warn(
-                     f"Ignoring reasoning_effort param for non-reasoning model: {model_name}"
-                 )
-         if self.system_message is not None:
-             self.request_json["system"] = self.system_message
-         if tools:
-             self.request_json["tools"] = [tool.dump_for("anthropic") for tool in tools]
+         self.request_json, self.request_header = _build_anthropic_request(
+             self.model,
+             prompt,
+             tools,
+             sampling_params,
+             cache,
+             computer_use,
+             display_width,
+             display_height,
+         )

      async def handle_response(self, http_response: ClientResponse) -> APIResponse:
          is_error = False
          error_message = None
          thinking = None
          content = None
-         input_tokens = None
-         output_tokens = None
+         usage = None
          status_code = http_response.status
          mimetype = http_response.headers.get("Content-Type", None)
          rate_limits = {}
@@ -118,8 +169,6 @@ class AnthropicRequest(APIRequestBase):
              "anthropic-ratelimit-tokens-reset",
          ]:
              rate_limits[header] = http_response.headers.get(header, None)
-         if self.debug:
-             print(f"Rate limits: {rate_limits}")
          if status_code >= 200 and status_code < 300:
              try:
                  data = await http_response.json()
@@ -143,8 +192,7 @@ class AnthropicRequest(APIRequestBase):
                      )

                  content = Message("assistant", parts)
-                 input_tokens = data["usage"]["input_tokens"]
-                 output_tokens = data["usage"]["output_tokens"]
+                 usage = Usage.from_anthropic_usage(data["usage"])
              except Exception as e:
                  is_error = True
                  error_message = (
@@ -182,6 +230,5 @@ class AnthropicRequest(APIRequestBase):
              thinking=thinking,
              model_internal=self.model_name,
              sampling_params=self.sampling_params,
-             input_tokens=input_tokens,
-             output_tokens=output_tokens,
+             usage=usage,
          )
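To make the refactor above concrete, here is a rough sketch of exercising the new `_build_anthropic_request` helper directly, using only the types and call signatures shown in this diff (illustrative only; the client builds these requests internally, and the registry name is taken from the README examples):

```python
from lm_deluge.api_requests.anthropic import _build_anthropic_request
from lm_deluge.config import SamplingParams
from lm_deluge.models import APIModel
from lm_deluge.prompt import Conversation, Message
from lm_deluge.tool import Tool

def get_weather(city: str) -> str:
    return f"The weather in {city} is sunny."

model = APIModel.from_registry("claude-3-haiku")
prompt = Conversation.system("You are helpful.").add(Message.user("Weather in Paris?"))

request_json, request_header = _build_anthropic_request(
    model,
    prompt,
    tools=[Tool.from_function(get_weather)],
    sampling_params=SamplingParams(),
    cache_pattern="tools_only",  # per the diff, this adds cache_control to the last tool definition
)

# request_json carries model, messages, temperature/top_p/max_tokens, a "thinking" block,
# and the serialized tool definitions; request_header carries x-api-key and anthropic-version.
```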