lm-deluge 0.0.32__tar.gz → 0.0.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (62)
  1. {lm_deluge-0.0.32/src/lm_deluge.egg-info → lm_deluge-0.0.33}/PKG-INFO +25 -12
  2. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/README.md +24 -11
  3. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/pyproject.toml +1 -1
  4. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/client.py +95 -11
  5. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/models.py +32 -2
  6. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/tracker.py +108 -47
  7. {lm_deluge-0.0.32 → lm_deluge-0.0.33/src/lm_deluge.egg-info}/PKG-INFO +25 -12
  8. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/LICENSE +0 -0
  9. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/setup.cfg +0 -0
  10. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/__init__.py +0 -0
  11. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/agent.py +0 -0
  12. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/__init__.py +0 -0
  13. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/anthropic.py +0 -0
  14. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/base.py +0 -0
  15. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/bedrock.py +0 -0
  16. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/common.py +0 -0
  17. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  18. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  19. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  20. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  21. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  22. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/gemini.py +0 -0
  23. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/mistral.py +0 -0
  24. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/openai.py +0 -0
  25. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/api_requests/response.py +0 -0
  26. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/batches.py +0 -0
  27. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
  28. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
  29. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
  30. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
  31. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/built_in_tools/base.py +0 -0
  32. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/built_in_tools/openai.py +0 -0
  33. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/cache.py +0 -0
  34. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/config.py +0 -0
  35. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/embed.py +0 -0
  36. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/errors.py +0 -0
  37. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/file.py +0 -0
  38. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/gemini_limits.py +0 -0
  39. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/image.py +0 -0
  40. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/__init__.py +0 -0
  41. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/classify.py +0 -0
  42. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/extract.py +0 -0
  43. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/locate.py +0 -0
  44. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/ocr.py +0 -0
  45. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/score.py +0 -0
  46. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/llm_tools/translate.py +0 -0
  47. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/prompt.py +0 -0
  48. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/request_context.py +0 -0
  49. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/rerank.py +0 -0
  50. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/tool.py +0 -0
  51. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/usage.py +0 -0
  52. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/util/json.py +0 -0
  53. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/util/logprobs.py +0 -0
  54. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/util/spatial.py +0 -0
  55. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/util/validation.py +0 -0
  56. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge/util/xml.py +0 -0
  57. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge.egg-info/SOURCES.txt +0 -0
  58. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  59. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge.egg-info/requires.txt +0 -0
  60. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/src/lm_deluge.egg-info/top_level.txt +0 -0
  61. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/tests/test_builtin_tools.py +0 -0
  62. {lm_deluge-0.0.32 → lm_deluge-0.0.33}/tests/test_native_mcp_server.py +0 -0
PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.32
+Version: 0.0.33
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
@@ -54,12 +54,12 @@ The package relies on environment variables for API keys. Typical variables incl
 
 ## Quickstart
 
-The easiest way to get started is with the `.basic` constructor. This uses sensible default arguments for rate limits and sampling parameters so that you don't have to provide a ton of arguments.
+`LLMClient` uses sensible default arguments for rate limits and sampling parameters so that you don't have to provide a ton of arguments.
 
 ```python
 from lm_deluge import LLMClient
 
-client = LLMClient.basic("gpt-4o-mini")
+client = LLMClient("gpt-4o-mini")
 resps = client.process_prompts_sync(["Hello, world!"])
 print(resp[0].completion)
 ```
@@ -71,7 +71,7 @@ To distribute your requests across models, just provide a list of more than one
 ```python
 from lm_deluge import LLMClient
 
-client = LLMClient.basic(
+client = LLMClient(
 ["gpt-4o-mini", "claude-3-haiku"],
 max_requests_per_minute=10_000
 )
@@ -85,8 +85,8 @@ print(resp[0].completion)
 
 API calls can be customized in a few ways.
 
-1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
-2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
+1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models.
+2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, caching, **and progress display style**. Set `progress="rich"` (default), `"tqdm"`, or `"manual"` to choose how progress is reported. The manual option prints an update every 30 seconds.
 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.
 
 Putting it all together:
@@ -109,6 +109,19 @@ await client.process_prompts_async(
 )
 ```
 
+### Queueing individual prompts
+
+You can queue prompts one at a time and track progress explicitly:
+
+```python
+client = LLMClient("gpt-4.1-mini", progress="tqdm")
+client.open()
+task_id = client.start_nowait("hello there")
+# ... queue more tasks ...
+results = await client.wait_for_all()
+client.close()
+```
+
 ## Multi-Turn Conversations
 
 Constructing conversations to pass to models is notoriously annoying. Each provider has a slightly different way of defining a list of messages, and with the introduction of images/multi-part messages it's only gotten worse. We provide convenience constructors so you don't have to remember all that stuff.
@@ -120,7 +133,7 @@ prompt = Conversation.system("You are a helpful assistant.").add(
 Message.user("What's in this image?").add_image("tests/image.jpg")
 )
 
-client = LLMClient.basic("gpt-4.1-mini")
+client = LLMClient("gpt-4.1-mini")
 resps = client.process_prompts_sync([prompt])
 ```
 
@@ -136,9 +149,9 @@ For models that support file uploads (OpenAI, Anthropic, and Gemini), you can ea
 from lm_deluge import LLMClient, Conversation
 
 # Simple file upload
-client = LLMClient.basic("gpt-4.1-mini")
+client = LLMClient("gpt-4.1-mini")
 conversation = Conversation.user(
-"Please summarize this document",
+"Please summarize this document",
 file="path/to/document.pdf"
 )
 resps = client.process_prompts_sync([conversation])
@@ -163,7 +176,7 @@ def get_weather(city: str) -> str:
 return f"The weather in {city} is sunny and 72°F"
 
 tool = Tool.from_function(get_weather)
-client = LLMClient.basic("claude-3-haiku")
+client = LLMClient("claude-3-haiku")
 resps = client.process_prompts_sync(
 ["What's the weather in Paris?"],
 tools=[tool]
@@ -200,7 +213,7 @@ config = {
 all_tools = Tool.from_mcp_config(config)
 
 # let the model use the tools
-client = LLMClient.basic("gpt-4o-mini")
+client = LLMClient("gpt-4o-mini")
 resps = client.process_prompts_sync(
 ["List the files in the current directory"],
 tools=tools
@@ -237,7 +250,7 @@ conv = (
 )
 
 # Use prompt caching to cache system message and tools
-client = LLMClient.basic("claude-3-5-sonnet")
+client = LLMClient("claude-3-5-sonnet")
 resps = client.process_prompts_sync(
 [conv],
 cache="system_and_tools" # Cache system message and any tools
README.md:
@@ -27,12 +27,12 @@ The package relies on environment variables for API keys. Typical variables incl
 
 ## Quickstart
 
-The easiest way to get started is with the `.basic` constructor. This uses sensible default arguments for rate limits and sampling parameters so that you don't have to provide a ton of arguments.
+`LLMClient` uses sensible default arguments for rate limits and sampling parameters so that you don't have to provide a ton of arguments.
 
 ```python
 from lm_deluge import LLMClient
 
-client = LLMClient.basic("gpt-4o-mini")
+client = LLMClient("gpt-4o-mini")
 resps = client.process_prompts_sync(["Hello, world!"])
 print(resp[0].completion)
 ```
@@ -44,7 +44,7 @@ To distribute your requests across models, just provide a list of more than one
 ```python
 from lm_deluge import LLMClient
 
-client = LLMClient.basic(
+client = LLMClient(
 ["gpt-4o-mini", "claude-3-haiku"],
 max_requests_per_minute=10_000
 )
@@ -58,8 +58,8 @@ print(resp[0].completion)
 
 API calls can be customized in a few ways.
 
-1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
-2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
+1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort. You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models.
+2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, caching, **and progress display style**. Set `progress="rich"` (default), `"tqdm"`, or `"manual"` to choose how progress is reported. The manual option prints an update every 30 seconds.
 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object). This is also where you provide tools.
 
 Putting it all together:
@@ -82,6 +82,19 @@ await client.process_prompts_async(
 )
 ```
 
+### Queueing individual prompts
+
+You can queue prompts one at a time and track progress explicitly:
+
+```python
+client = LLMClient("gpt-4.1-mini", progress="tqdm")
+client.open()
+task_id = client.start_nowait("hello there")
+# ... queue more tasks ...
+results = await client.wait_for_all()
+client.close()
+```
+
 ## Multi-Turn Conversations
 
 Constructing conversations to pass to models is notoriously annoying. Each provider has a slightly different way of defining a list of messages, and with the introduction of images/multi-part messages it's only gotten worse. We provide convenience constructors so you don't have to remember all that stuff.
@@ -93,7 +106,7 @@ prompt = Conversation.system("You are a helpful assistant.").add(
 Message.user("What's in this image?").add_image("tests/image.jpg")
 )
 
-client = LLMClient.basic("gpt-4.1-mini")
+client = LLMClient("gpt-4.1-mini")
 resps = client.process_prompts_sync([prompt])
 ```
 
@@ -109,9 +122,9 @@ For models that support file uploads (OpenAI, Anthropic, and Gemini), you can ea
 from lm_deluge import LLMClient, Conversation
 
 # Simple file upload
-client = LLMClient.basic("gpt-4.1-mini")
+client = LLMClient("gpt-4.1-mini")
 conversation = Conversation.user(
-"Please summarize this document",
+"Please summarize this document",
 file="path/to/document.pdf"
 )
 resps = client.process_prompts_sync([conversation])
@@ -136,7 +149,7 @@ def get_weather(city: str) -> str:
 return f"The weather in {city} is sunny and 72°F"
 
 tool = Tool.from_function(get_weather)
-client = LLMClient.basic("claude-3-haiku")
+client = LLMClient("claude-3-haiku")
 resps = client.process_prompts_sync(
 ["What's the weather in Paris?"],
 tools=[tool]
@@ -173,7 +186,7 @@ config = {
 all_tools = Tool.from_mcp_config(config)
 
 # let the model use the tools
-client = LLMClient.basic("gpt-4o-mini")
+client = LLMClient("gpt-4o-mini")
 resps = client.process_prompts_sync(
 ["List the files in the current directory"],
 tools=tools
@@ -210,7 +223,7 @@ conv = (
 )
 
 # Use prompt caching to cache system message and tools
-client = LLMClient.basic("claude-3-5-sonnet")
+client = LLMClient("claude-3-5-sonnet")
 resps = client.process_prompts_sync(
 [conv],
 cache="system_and_tools" # Cache system message and any tools
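
The README diff above replaces every `LLMClient.basic(...)` call with a direct `LLMClient(...)` call. As a quick illustration, a hedged migration sketch is shown below; whether `.basic` is still available as an alias in 0.0.33 is not shown by this diff, so treat the old form as removed until confirmed.

```python
# Migration sketch based on the README changes above (not an authoritative guide).
# 0.0.32 style:
#     client = LLMClient.basic("gpt-4o-mini", max_new_tokens=512)
# 0.0.33 style: call LLMClient directly; sampling arguments stay keyword-only.
from lm_deluge import LLMClient

client = LLMClient("gpt-4o-mini", max_new_tokens=512)
resps = client.process_prompts_sync(["Hello, world!"])
print(resps[0].completion)  # the README snippet prints resp[0]; resps is used here
```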
pyproject.toml:
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "lm_deluge"
-version = "0.0.32"
+version = "0.0.33"
 authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
 description = "Python utility for using LLM API models."
 readme = "README.md"
src/lm_deluge/client.py:
@@ -25,11 +25,10 @@ from .tracker import StatusTracker
 
 # TODO: get completions as they finish, not all at once at the end.
 # TODO: add optional max_input_tokens to client so we can reject long prompts to prevent abuse
-class LLMClient(BaseModel):
+class _LLMClient(BaseModel):
 """
-LLMClient abstracts all the fixed arguments to process_prompts_async, so you can create it
-once and use it for more stuff without having to configure all the arguments.
-Handles models, sampling params for each model, model weights, rate limits, etc.
+Internal LLMClient implementation using Pydantic.
+Keeps all validation, serialization, and existing functionality.
 """
 
 model_names: str | list[str] = ["gpt-4.1-mini"]
@@ -53,6 +52,9 @@ class LLMClient(BaseModel):
 top_logprobs: int | None = None
 force_local_mcp: bool = False
 
+# Progress configuration
+progress: Literal["rich", "tqdm", "manual"] = "rich"
+
 # Internal state for async task handling
 _next_task_id: int = PrivateAttr(default=0)
 _tasks: dict[int, asyncio.Task] = PrivateAttr(default_factory=dict)
@@ -60,6 +62,23 @@ class LLMClient(BaseModel):
 _tracker: StatusTracker | None = PrivateAttr(default=None)
 _capacity_lock: asyncio.Lock = PrivateAttr(default_factory=asyncio.Lock)
 
+# Progress management for queueing API
+def open(self, total: int | None = None, show_progress: bool = True):
+self._tracker = StatusTracker(
+max_requests_per_minute=self.max_requests_per_minute,
+max_tokens_per_minute=self.max_tokens_per_minute,
+max_concurrent_requests=self.max_concurrent_requests,
+progress_style=self.progress,
+use_progress_bar=show_progress,
+)
+self._tracker.init_progress_bar(total)
+return self
+
+def close(self):
+if self._tracker:
+self._tracker.log_final_status()
+self._tracker = None
+
 # NEW! Builder methods
 def with_model(self, model: str):
 self.model_names = [model]
@@ -90,7 +109,7 @@ class LLMClient(BaseModel):
 max_concurrent_requests=self.max_concurrent_requests,
 use_progress_bar=False,
 progress_bar_disable=True,
-use_rich=False,
+progress_style=self.progress,
 )
 return self._tracker
 
@@ -100,7 +119,7 @@ class LLMClient(BaseModel):
 
 @model_validator(mode="before")
 @classmethod
-def fix_lists(cls, data) -> "LLMClient":
+def fix_lists(cls, data) -> "_LLMClient":
 if isinstance(data.get("model_names"), str):
 data["model_names"] = [data["model_names"]]
 if not isinstance(data.get("sampling_params", []), list):
@@ -343,13 +362,10 @@ class LLMClient(BaseModel):
 max_requests_per_minute=self.max_requests_per_minute,
 max_tokens_per_minute=self.max_tokens_per_minute,
 max_concurrent_requests=self.max_concurrent_requests,
+progress_style=self.progress,
 use_progress_bar=show_progress,
-progress_bar_total=len(prompts),
-progress_bar_disable=not show_progress,
-use_rich=show_progress,
 )
-
-tracker.init_progress_bar()
+tracker.init_progress_bar(total=len(prompts), disable=not show_progress)
 
 # Create retry queue for failed requests
 retry_queue: asyncio.Queue[RequestContext] = asyncio.Queue()
@@ -510,6 +526,7 @@ class LLMClient(BaseModel):
 )
 task = asyncio.create_task(self._run_context(context))
 self._tasks[task_id] = task
+tracker.add_to_total(1)
 return task_id
 
 async def start(
@@ -752,3 +769,70 @@ class LLMClient(BaseModel):
 # combined_results["limiting_factor"] = limiting_factor
 
 # return combined_results
+
+
+# Clean factory function with perfect IDE support
+@overload
+def LLMClient(model_names: str, **kwargs) -> _LLMClient: ...
+
+@overload
+def LLMClient(model_names: list[str], **kwargs) -> _LLMClient: ...
+
+def LLMClient(
+model_names: str | list[str] = "gpt-4.1-mini",
+*,
+max_requests_per_minute: int = 1_000,
+max_tokens_per_minute: int = 100_000,
+max_concurrent_requests: int = 225,
+sampling_params: list[SamplingParams] | None = None,
+model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+max_attempts: int = 5,
+request_timeout: int = 30,
+cache: Any = None,
+extra_headers: dict[str, str] | None = None,
+temperature: float = 0.75,
+top_p: float = 1.0,
+json_mode: bool = False,
+max_new_tokens: int = 512,
+reasoning_effort: Literal["low", "medium", "high", None] = None,
+logprobs: bool = False,
+top_logprobs: int | None = None,
+force_local_mcp: bool = False,
+progress: Literal["rich", "tqdm", "manual"] = "rich",
+) -> _LLMClient:
+"""
+Create an LLMClient with model_names as a positional argument.
+
+Args:
+model_names: Model name(s) to use - can be a single string or list of strings
+**kwargs: All other LLMClient configuration options (keyword-only)
+
+Returns:
+Configured LLMClient instance
+"""
+# Handle default for mutable argument
+if sampling_params is None:
+sampling_params = []
+
+# Simply pass everything to the Pydantic constructor
+return _LLMClient(
+model_names=model_names,
+max_requests_per_minute=max_requests_per_minute,
+max_tokens_per_minute=max_tokens_per_minute,
+max_concurrent_requests=max_concurrent_requests,
+sampling_params=sampling_params,
+model_weights=model_weights,
+max_attempts=max_attempts,
+request_timeout=request_timeout,
+cache=cache,
+extra_headers=extra_headers,
+temperature=temperature,
+top_p=top_p,
+json_mode=json_mode,
+max_new_tokens=max_new_tokens,
+reasoning_effort=reasoning_effort,
+logprobs=logprobs,
+top_logprobs=top_logprobs,
+force_local_mcp=force_local_mcp,
+progress=progress,
+)
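
The client.py changes above turn `LLMClient` into a factory function over the internal `_LLMClient` model, add a `progress` option, add `open()`/`close()` helpers, and have `start_nowait` grow the tracker total. A minimal end-to-end sketch of that queueing workflow follows, pieced together from this diff and the README snippet; it assumes an OpenAI API key is configured, and the shape of `wait_for_all()`'s return value (APIResponse objects with `.completion`, in submission order) is an assumption, not something the diff confirms.

```python
# Sketch of the 0.0.33 queueing workflow, adapted from the client.py and README
# changes in this diff. Assumes OPENAI_API_KEY is set; the ordering and element
# type of wait_for_all()'s results are assumptions.
import asyncio

from lm_deluge import LLMClient


async def main():
    # LLMClient is now a factory returning _LLMClient: model name is positional,
    # everything else keyword-only, including the new progress style.
    client = LLMClient("gpt-4.1-mini", progress="manual", max_new_tokens=128)

    client.open()  # builds a StatusTracker using the configured progress style
    task_ids = [client.start_nowait(f"Give me fun fact #{i}") for i in range(5)]

    results = await client.wait_for_all()  # drain all queued tasks
    client.close()  # logs final status and clears the tracker

    for task_id, resp in zip(task_ids, results):
        print(task_id, resp.completion)


asyncio.run(main())
```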
src/lm_deluge/models.py:
@@ -1261,9 +1261,39 @@ class APIModel:
 registry: dict[str, APIModel] = {}
 
 
-def register_model(**kwargs) -> APIModel:
+def register_model(
+id: str,
+name: str,
+api_base: str,
+api_key_env_var: str,
+api_spec: str,
+input_cost: float | None = 0, # $ per million input tokens
+output_cost: float | None = 0, # $ per million output tokens
+supports_json: bool = False,
+supports_logprobs: bool = False,
+supports_responses: bool = False,
+reasoning_model: bool = False,
+regions: list[str] | dict[str, int] = field(default_factory=list),
+tokens_per_minute: int | None = None,
+requests_per_minute: int | None = None
+) -> APIModel:
 """Register a model configuration and return the created APIModel."""
-model = APIModel(**kwargs)
+model = APIModel(
+id=id,
+name=name,
+api_base=api_base,
+api_key_env_var=api_key_env_var,
+api_spec=api_spec,
+input_cost=input_cost,
+output_cost=output_cost,
+supports_json=supports_json,
+supports_logprobs=supports_logprobs,
+supports_responses=supports_responses,
+reasoning_model=reasoning_model,
+regions=regions,
+tokens_per_minute=tokens_per_minute,
+requests_per_minute=requests_per_minute
+)
 registry[model.id] = model
 return model
 
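
The models.py change replaces `register_model(**kwargs)` with an explicit signature. Below is a hypothetical registration call using that signature; the import path is assumed from the file location, and every value (model id, endpoint, env var, `api_spec` string, costs) is made up for illustration — only the parameter names come from the diff.

```python
# Hypothetical use of the explicit register_model signature shown above.
# All values are illustrative; only the parameter names are taken from the diff.
from lm_deluge.models import register_model  # assumed import path

register_model(
    id="my-local-llama",
    name="Local Llama 3 70B",
    api_base="http://localhost:8000/v1",
    api_key_env_var="LOCAL_LLAMA_API_KEY",
    api_spec="openai",        # assumed to be a valid spec name
    input_cost=0.0,           # $ per million input tokens
    output_cost=0.0,          # $ per million output tokens
    supports_json=True,
    supports_logprobs=False,
    tokens_per_minute=200_000,
    requests_per_minute=1_000,
)
```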
src/lm_deluge/tracker.py:
@@ -1,6 +1,7 @@
 import asyncio
 import time
 from dataclasses import dataclass, field
+from typing import Literal
 
 from rich.console import Console, Group
 from rich.live import Live
@@ -35,17 +36,22 @@ class StatusTracker:
 use_progress_bar: bool = True
 progress_bar_total: int | None = None
 progress_bar_disable: bool = False
+progress_style: Literal["rich", "tqdm", "manual"] = "rich"
+progress_print_interval: float = 30.0
 _pbar: tqdm | None = None
 
 # Rich display configuration
-use_rich: bool = True
 _rich_console: Console | None = None
 _rich_live: object | None = None
-_rich_progress: object | None = None
-_rich_task_id: object | None = None
+_rich_progress: Progress | None = None
+_rich_task_id: int | None = None
 _rich_display_task: asyncio.Task | None = None
 _rich_stop_event: asyncio.Event | None = None
 
+# Manual print configuration
+_manual_display_task: asyncio.Task | None = None
+_manual_stop_event: asyncio.Event | None = None
+
 def __post_init__(self):
 self.available_request_capacity = self.max_requests_per_minute
 self.available_token_capacity = self.max_tokens_per_minute
@@ -147,69 +153,75 @@ class StatusTracker:
 if not self.use_progress_bar:
 return
 
-if self.use_rich:
-self._init_rich_display(total, disable)
-else:
-# Use provided values or fall back to instance defaults
-pbar_total = total if total is not None else self.progress_bar_total
-pbar_disable = disable if disable is not None else self.progress_bar_disable
+pbar_total = total if total is not None else self.progress_bar_total
+pbar_disable = disable if disable is not None else self.progress_bar_disable
+if pbar_total is None:
+pbar_total = 0
+self.progress_bar_total = pbar_total
+
+if self.progress_style == "rich":
+if pbar_disable:
+return
+self._init_rich_display(pbar_total)
+elif self.progress_style == "tqdm":
 self._pbar = tqdm(total=pbar_total, disable=pbar_disable)
+elif self.progress_style == "manual":
+self._init_manual_display(pbar_total)
+
 self.update_pbar()
 
 def close_progress_bar(self):
 """Close progress bar if it exists."""
-if self.use_rich and self._rich_stop_event:
-self._close_rich_display()
-elif self._pbar is not None:
-self._pbar.close()
-self._pbar = None
-
-def _init_rich_display(self, total: int | None = None, disable: bool | None = None):
-"""Initialize Rich display components."""
-if disable:
+if not self.use_progress_bar:
 return
-
-pbar_total = total if total is not None else self.progress_bar_total
-if pbar_total is None:
-pbar_total = 100 # Default fallback
-
+if self.progress_style == "rich":
+if self._rich_stop_event:
+self._close_rich_display()
+elif self.progress_style == "tqdm":
+if self._pbar is not None:
+self._pbar.close()
+self._pbar = None
+elif self.progress_style == "manual":
+self._close_manual_display()
+
+def _init_rich_display(self, total: int):
+"""Initialize Rich display components."""
 self._rich_console = Console()
-self._rich_stop_event = asyncio.Event()
-
-# Start the display updater task
-self._rich_display_task = asyncio.create_task(
-self._rich_display_updater(pbar_total)
-)
-
-async def _rich_display_updater(self, total: int):
-"""Update Rich display independently."""
-if not self._rich_console or self._rich_stop_event is None:
-return
-
-# Create progress bar without console so we can use it in Live
-progress = Progress(
+self._rich_progress = Progress(
 SpinnerColumn(),
 TextColumn("Processing requests..."),
 BarColumn(),
 MofNCompleteColumn(),
 )
-main_task = progress.add_task("requests", total=total)
+self._rich_task_id = self._rich_progress.add_task("requests", total=total)
+self._rich_stop_event = asyncio.Event()
+self._rich_display_task = asyncio.create_task(self._rich_display_updater())
 
-# Use Live to combine progress + text
+async def _rich_display_updater(self):
+"""Update Rich display independently."""
+if (
+not self._rich_console
+or self._rich_progress is None
+or self._rich_task_id is None
+or self._rich_stop_event is None
+):
+return
 
 with Live(console=self._rich_console, refresh_per_second=10) as live:
 while not self._rich_stop_event.is_set():
 completed = self.num_tasks_succeeded
-progress.update(main_task, completed=completed)
+self._rich_progress.update(
+self._rich_task_id,
+completed=completed,
+total=self.progress_bar_total,
+)
 
-# Create capacity info text
 tokens_info = f"TPM Capacity: {self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k"
 reqs_info = f"RPM Capacity: {int(self.available_request_capacity)}/{self.max_requests_per_minute}"
 in_progress = f"In Progress: {int(self.num_tasks_in_progress)}"
 capacity_text = Text(f"{in_progress} • {tokens_info} • {reqs_info}")
 
-# Group progress bar and text
-display = Group(progress, capacity_text)
+display = Group(self._rich_progress, capacity_text)
 live.update(display)
 
 await asyncio.sleep(0.1)
@@ -223,15 +235,45 @@
 
 self._rich_console = None
 self._rich_live = None
+self._rich_progress = None
+self._rich_task_id = None
 self._rich_display_task = None
 self._rich_stop_event = None
 
+def _init_manual_display(self, total: int):
+"""Initialize manual progress printer."""
+self.progress_bar_total = total
+self._manual_stop_event = asyncio.Event()
+self._manual_display_task = asyncio.create_task(
+self._manual_display_updater()
+)
+
+async def _manual_display_updater(self):
+if self._manual_stop_event is None:
+return
+while not self._manual_stop_event.is_set():
+print(
+f"Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
+)
+await asyncio.sleep(self.progress_print_interval)
+
+def _close_manual_display(self):
+if self._manual_stop_event:
+self._manual_stop_event.set()
+if self._manual_display_task and not self._manual_display_task.done():
+self._manual_display_task.cancel()
+self._manual_display_task = None
+self._manual_stop_event = None
+
 def update_pbar(self, n: int = 0):
 """Update progress bar status and optionally increment.
 
 Args:
 n: Number of items to increment (0 means just update postfix)
 """
+if self.progress_style != "tqdm":
+return
+
 current_time = time.time()
 if self._pbar and (current_time - self.last_pbar_update_time > 1):
 self.last_pbar_update_time = current_time
@@ -249,8 +291,27 @@
 
 def increment_pbar(self):
 """Increment progress bar by 1."""
-if self.use_rich:
-# Rich display is updated automatically by the display updater
-pass
-elif self._pbar:
+if not self.use_progress_bar:
+return
+if self.progress_style == "tqdm" and self._pbar:
 self._pbar.update(1)
+# rich and manual are updated elsewhere
+
+def add_to_total(self, n: int = 1):
+"""Increase the total number of tasks being tracked."""
+if self.progress_bar_total is None:
+self.progress_bar_total = 0
+self.progress_bar_total += n
+if not self.use_progress_bar:
+return
+if self.progress_style == "tqdm" and self._pbar:
+self._pbar.total = self.progress_bar_total
+self._pbar.refresh()
+elif (
+self.progress_style == "rich"
+and self._rich_progress
+and self._rich_task_id is not None
+):
+self._rich_progress.update(
+self._rich_task_id, total=self.progress_bar_total
+)
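
The tracker.py changes above replace the boolean `use_rich` with a three-way `progress_style` and add a "manual" mode that prints a status line every `progress_print_interval` seconds from a background asyncio task. For illustration, here is that pattern in isolation; this is a standalone sketch, not the library's StatusTracker API, and the names in it are made up.

```python
# Standalone illustration of the "manual" progress pattern tracker.py now uses:
# a background task prints a status line every interval until an Event is set.
import asyncio


async def manual_progress(get_done, total: int, stop: asyncio.Event, interval: float = 30.0):
    while not stop.is_set():
        print(f"Completed {get_done()}/{total} requests")
        await asyncio.sleep(interval)


async def main():
    done = 0
    stop = asyncio.Event()
    reporter = asyncio.create_task(manual_progress(lambda: done, 10, stop, interval=1.0))

    for _ in range(10):
        await asyncio.sleep(0.3)  # stand-in for an API call
        done += 1

    stop.set()
    reporter.cancel()  # mirrors _close_manual_display(): set the event, then cancel
    try:
        await reporter
    except asyncio.CancelledError:
        pass


asyncio.run(main())
```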
src/lm_deluge.egg-info/PKG-INFO:
(Same changes as the PKG-INFO diff above: version bumped to 0.0.33, plus the identical README updates.)