lm-deluge 0.0.13__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


lm_deluge/__init__.py CHANGED
@@ -1,6 +1,7 @@
  from .client import LLMClient, SamplingParams, APIResponse
  from .prompt import Conversation, Message
  from .tool import Tool
+ from .file import File
  import dotenv

  dotenv.load_dotenv()
@@ -12,4 +13,5 @@ __all__ = [
  "Conversation",
  "Message",
  "Tool",
+ "File",
  ]
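For quick orientation, a minimal sketch of the resulting 0.0.14 import surface (nothing here beyond what the updated `__all__` above lists; assumes the package is installed):

```python
# Everything in the updated __all__ is importable from the package root.
from lm_deluge import (
    LLMClient,
    SamplingParams,
    APIResponse,
    Conversation,
    Message,
    Tool,
    File,  # new in 0.0.14
)
```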
lm_deluge/api_requests/base.py CHANGED
@@ -1,165 +1,19 @@
  import asyncio
- import json
  import random
  import traceback
  from abc import ABC, abstractmethod
- from dataclasses import dataclass
  from typing import Callable

  import aiohttp
  from aiohttp import ClientResponse

- from lm_deluge.prompt import CachePattern, Conversation, Message
- from lm_deluge.usage import Usage
+ from lm_deluge.prompt import CachePattern, Conversation

  from ..config import SamplingParams
  from ..errors import raise_if_modal_exception
  from ..models import APIModel
  from ..tracker import StatusTracker
-
-
- @dataclass
- class APIResponse:
- # request information
- id: int # should be unique to the request within a given prompt-processing call
- model_internal: str # our internal model tag
- prompt: Conversation
- sampling_params: SamplingParams
-
- # http response information
- status_code: int | None
- is_error: bool | None
- error_message: str | None
-
- # completion information - unified usage tracking
- usage: Usage | None = None
-
- # response content - structured format
- content: Message | None = None
-
- # optional or calculated automatically
- thinking: str | None = None # if model shows thinking tokens
- model_external: str | None = None # the model tag used by the API
- region: str | None = None
- logprobs: list | None = None
- finish_reason: str | None = None # make required later
- cost: float | None = None # calculated automatically
- cache_hit: bool = False # manually set if true
- # set to true if is_error and should be retried with a different model
- retry_with_different_model: bool | None = False
- # set to true if should NOT retry with the same model (unrecoverable error)
- give_up_if_no_other_models: bool | None = False
- # OpenAI Responses API specific - used for computer use continuation
- response_id: str | None = None
- # Raw API response for debugging
- raw_response: dict | None = None
-
- @property
- def completion(self) -> str | None:
- """Backward compatibility: extract text from content Message."""
- if self.content is not None:
- return self.content.completion
- return None
-
- @property
- def input_tokens(self) -> int | None:
- """Get input tokens from usage object."""
- return self.usage.input_tokens if self.usage else None
-
- @property
- def output_tokens(self) -> int | None:
- """Get output tokens from usage object."""
- return self.usage.output_tokens if self.usage else None
-
- @property
- def cache_read_tokens(self) -> int | None:
- """Get cache read tokens from usage object."""
- return self.usage.cache_read_tokens if self.usage else None
-
- @property
- def cache_write_tokens(self) -> int | None:
- """Get cache write tokens from usage object."""
- return self.usage.cache_write_tokens if self.usage else None
-
- def __post_init__(self):
- # calculate cost & get external model name
- self.id = int(self.id)
- api_model = APIModel.from_registry(self.model_internal)
- self.model_external = api_model.name
- self.cost = None
- if (
- self.usage is not None
- and api_model.input_cost is not None
- and api_model.output_cost is not None
- ):
- self.cost = (
- self.usage.input_tokens * api_model.input_cost / 1e6
- + self.usage.output_tokens * api_model.output_cost / 1e6
- )
- elif self.content is not None and self.completion is not None:
- print(
- f"Warning: Completion provided without token counts for model {self.model_internal}."
- )
-
- def to_dict(self):
- return {
- "id": self.id,
- "model_internal": self.model_internal,
- "model_external": self.model_external,
- "region": self.region,
- "prompt": self.prompt.to_log(), # destroys image if present
- "sampling_params": self.sampling_params.__dict__,
- "status_code": self.status_code,
- "is_error": self.is_error,
- "error_message": self.error_message,
- "completion": self.completion, # computed property
- "content": self.content.to_log() if self.content else None,
- "usage": self.usage.to_dict() if self.usage else None,
- "finish_reason": self.finish_reason,
- "cost": self.cost,
- }
-
- @classmethod
- def from_dict(cls, data: dict):
- # Handle backward compatibility for content/completion
- content = None
- if "content" in data and data["content"] is not None:
- # Reconstruct message from log format
- content = Message.from_log(data["content"])
- elif "completion" in data and data["completion"] is not None:
- # Backward compatibility: create a Message with just text
- content = Message.ai(data["completion"])
-
- usage = None
- if "usage" in data and data["usage"] is not None:
- usage = Usage.from_dict(data["usage"])
-
- return cls(
- id=data.get("id", random.randint(0, 1_000_000_000)),
- model_internal=data["model_internal"],
- prompt=Conversation.from_log(data["prompt"]),
- sampling_params=SamplingParams(**data["sampling_params"]),
- status_code=data["status_code"],
- is_error=data["is_error"],
- error_message=data["error_message"],
- usage=usage,
- content=content,
- thinking=data.get("thinking"),
- model_external=data.get("model_external"),
- region=data.get("region"),
- logprobs=data.get("logprobs"),
- finish_reason=data.get("finish_reason"),
- cost=data.get("cost"),
- cache_hit=data.get("cache_hit", False),
- )
-
- def write_to_file(self, filename):
- """
- Writes the APIResponse as a line to a file.
- If file exists, appends to it.
- """
- with open(filename, "a") as f:
- f.write(json.dumps(self.to_dict()) + "\n")
+ from .response import APIResponse


  class APIRequestBase(ABC):
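The `APIResponse` dataclass itself is unchanged; it moves to `lm_deluge/api_requests/response.py` (shown later in this diff) and is re-imported here, so existing imports keep working. A small sketch under that assumption:

```python
# Both import paths refer to the same class after this change.
from lm_deluge.api_requests.base import APIResponse as APIResponseFromBase
from lm_deluge.api_requests.response import APIResponse

assert APIResponseFromBase is APIResponse
```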
lm_deluge/api_requests/openai.py CHANGED
@@ -1,17 +1,19 @@
- import warnings
- from aiohttp import ClientResponse
  import json
  import os
+ import warnings
  from typing import Callable

+ import aiohttp
+ from aiohttp import ClientResponse
+
  from lm_deluge.tool import Tool

- from .base import APIRequestBase, APIResponse
- from ..prompt import Conversation, Message, Text, ToolCall, Thinking, CachePattern
- from ..usage import Usage
- from ..tracker import StatusTracker
  from ..config import SamplingParams
  from ..models import APIModel
+ from ..prompt import CachePattern, Conversation, Message, Text, Thinking, ToolCall
+ from ..tracker import StatusTracker
+ from ..usage import Usage
+ from .base import APIRequestBase, APIResponse


  def _build_oa_chat_request(
@@ -111,6 +113,7 @@ class OpenAIRequest(APIRequestBase):
  status_code = http_response.status
  mimetype = http_response.headers.get("Content-Type", None)
  data = None
+ finish_reason = None
  if status_code >= 200 and status_code < 300:
  try:
  data = await http_response.json()
@@ -125,6 +128,7 @@ class OpenAIRequest(APIRequestBase):
  # Parse response into Message with parts
  parts = []
  message = data["choices"][0]["message"]
+ finish_reason = data["choices"][0]["finish_reason"]

  # Add text content if present
  if message.get("content"):
@@ -190,6 +194,7 @@ class OpenAIRequest(APIRequestBase):
  sampling_params=self.sampling_params,
  usage=usage,
  raw_response=data,
+ finish_reason=finish_reason,
  )

@@ -266,6 +271,13 @@ class OpenAIResponsesRequest(APIRequestBase):
  self.request_json["max_output_tokens"] = sampling_params.max_new_tokens

  if self.model.reasoning_model:
+ if sampling_params.reasoning_effort in [None, "none"]:
+ # gemini models can switch reasoning off
+ if "gemini" in self.model.id:
+ self.sampling_params.reasoning_effort = "none" # expects string
+ # openai models can only go down to "low"
+ else:
+ self.sampling_params.reasoning_effort = "low"
  self.request_json["temperature"] = 1.0
  self.request_json["top_p"] = 1.0
  self.request_json["reasoning"] = {
@@ -413,3 +425,57 @@ class OpenAIResponsesRequest(APIRequestBase):
  usage=usage,
  raw_response=data,
  )
+
+
+ async def stream_chat(
+ model_name: str, # must correspond to registry
+ prompt: Conversation,
+ sampling_params: SamplingParams = SamplingParams(),
+ tools: list | None = None,
+ cache: CachePattern | None = None,
+ ):
+ if cache is not None:
+ warnings.warn(
+ f"Cache parameter '{cache}' is only supported for Anthropic models, ignoring for {model_name}"
+ )
+
+ model = APIModel.from_registry(model_name)
+ if model.api_spec != "openai":
+ raise ValueError("streaming only supported on openai models for now")
+ url = f"{model.api_base}/chat/completions"
+ request_header = {"Authorization": f"Bearer {os.getenv(model.api_key_env_var)}"}
+ request_json = _build_oa_chat_request(model, prompt, tools, sampling_params)
+ request_json["stream"] = True
+
+ async with aiohttp.ClientSession() as s:
+ async with s.post(url, headers=request_header, json=request_json) as r:
+ r.raise_for_status() # bail on 4xx/5xx
+ content = ""
+ buf = ""
+ async for chunk in r.content.iter_any(): # raw bytes
+ buf += chunk.decode()
+ while "\n\n" in buf: # full SSE frame
+ event, buf = buf.split("\n\n", 1)
+ if not event.startswith("data:"):
+ continue # ignore comments
+ data = event[5:].strip() # after "data:"
+ if data == "[DONE]":
+ yield APIResponse(
+ id=0,
+ status_code=None,
+ is_error=False,
+ error_message=None,
+ prompt=prompt,
+ content=Message(
+ role="assistant", parts=[Text(text=content)]
+ ),
+ model_internal=model.id,
+ sampling_params=sampling_params,
+ usage=None,
+ raw_response=None,
+ )
+ msg = json.loads(data) # SSE payload
+ delta = msg["choices"][0]["delta"].get("content")
+ if delta:
+ content += delta
+ yield delta
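The new `stream_chat` generator yields text deltas as strings and, on the `[DONE]` frame, yields a single `APIResponse` carrying the assembled completion. A minimal consumption sketch, assuming a registered OpenAI-spec model and its API key in the environment (the model name is illustrative); breaking on the final item mirrors how `LLMClient.stream` consumes it:

```python
import asyncio

from lm_deluge.api_requests.openai import stream_chat
from lm_deluge.prompt import Conversation


async def main() -> None:
    final = None
    async for item in stream_chat("gpt-4.1-mini", Conversation.user("Tell me a joke.")):
        if isinstance(item, str):
            print(item, end="", flush=True)  # incremental text delta
        else:
            final = item  # terminal APIResponse with the assembled completion
            break
    if final is not None:
        print("\n\ngot completion:", final.completion is not None)


asyncio.run(main())
```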
lm_deluge/api_requests/response.py ADDED
@@ -0,0 +1,153 @@
+ import json
+ import random
+ from dataclasses import dataclass
+
+ from lm_deluge.prompt import Conversation, Message
+ from lm_deluge.usage import Usage
+
+ from ..config import SamplingParams
+ from ..models import APIModel
+
+
+ @dataclass
+ class APIResponse:
+ # request information
+ id: int # should be unique to the request within a given prompt-processing call
+ model_internal: str # our internal model tag
+ prompt: Conversation
+ sampling_params: SamplingParams
+
+ # http response information
+ status_code: int | None
+ is_error: bool | None
+ error_message: str | None
+
+ # completion information - unified usage tracking
+ usage: Usage | None = None
+
+ # response content - structured format
+ content: Message | None = None
+
+ # optional or calculated automatically
+ thinking: str | None = None # if model shows thinking tokens
+ model_external: str | None = None # the model tag used by the API
+ region: str | None = None
+ logprobs: list | None = None
+ finish_reason: str | None = None # make required later
+ cost: float | None = None # calculated automatically
+ cache_hit: bool = False # manually set if true
+ # set to true if is_error and should be retried with a different model
+ retry_with_different_model: bool | None = False
+ # set to true if should NOT retry with the same model (unrecoverable error)
+ give_up_if_no_other_models: bool | None = False
+ # OpenAI Responses API specific - used for computer use continuation
+ response_id: str | None = None
+ # Raw API response for debugging
+ raw_response: dict | None = None
+
+ @property
+ def completion(self) -> str | None:
+ """Backward compatibility: extract text from content Message."""
+ if self.content is not None:
+ return self.content.completion
+ return None
+
+ @property
+ def input_tokens(self) -> int | None:
+ """Get input tokens from usage object."""
+ return self.usage.input_tokens if self.usage else None
+
+ @property
+ def output_tokens(self) -> int | None:
+ """Get output tokens from usage object."""
+ return self.usage.output_tokens if self.usage else None
+
+ @property
+ def cache_read_tokens(self) -> int | None:
+ """Get cache read tokens from usage object."""
+ return self.usage.cache_read_tokens if self.usage else None
+
+ @property
+ def cache_write_tokens(self) -> int | None:
+ """Get cache write tokens from usage object."""
+ return self.usage.cache_write_tokens if self.usage else None
+
+ def __post_init__(self):
+ # calculate cost & get external model name
+ self.id = int(self.id)
+ api_model = APIModel.from_registry(self.model_internal)
+ self.model_external = api_model.name
+ self.cost = None
+ if (
+ self.usage is not None
+ and api_model.input_cost is not None
+ and api_model.output_cost is not None
+ ):
+ self.cost = (
+ self.usage.input_tokens * api_model.input_cost / 1e6
+ + self.usage.output_tokens * api_model.output_cost / 1e6
+ )
+ elif self.content is not None and self.completion is not None:
+ print(
+ f"Warning: Completion provided without token counts for model {self.model_internal}."
+ )
+
+ def to_dict(self):
+ return {
+ "id": self.id,
+ "model_internal": self.model_internal,
+ "model_external": self.model_external,
+ "region": self.region,
+ "prompt": self.prompt.to_log(), # destroys image if present
+ "sampling_params": self.sampling_params.__dict__,
+ "status_code": self.status_code,
+ "is_error": self.is_error,
+ "error_message": self.error_message,
+ "completion": self.completion, # computed property
+ "content": self.content.to_log() if self.content else None,
+ "usage": self.usage.to_dict() if self.usage else None,
+ "finish_reason": self.finish_reason,
+ "cost": self.cost,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict):
+ # Handle backward compatibility for content/completion
+ content = None
+ if "content" in data and data["content"] is not None:
+ # Reconstruct message from log format
+ content = Message.from_log(data["content"])
+ elif "completion" in data and data["completion"] is not None:
+ # Backward compatibility: create a Message with just text
+ content = Message.ai(data["completion"])
+
+ usage = None
+ if "usage" in data and data["usage"] is not None:
+ usage = Usage.from_dict(data["usage"])
+
+ return cls(
+ id=data.get("id", random.randint(0, 1_000_000_000)),
+ model_internal=data["model_internal"],
+ prompt=Conversation.from_log(data["prompt"]),
+ sampling_params=SamplingParams(**data["sampling_params"]),
+ status_code=data["status_code"],
+ is_error=data["is_error"],
+ error_message=data["error_message"],
+ usage=usage,
+ content=content,
+ thinking=data.get("thinking"),
+ model_external=data.get("model_external"),
+ region=data.get("region"),
+ logprobs=data.get("logprobs"),
+ finish_reason=data.get("finish_reason"),
+ cost=data.get("cost"),
+ cache_hit=data.get("cache_hit", False),
+ )
+
+ def write_to_file(self, filename):
+ """
+ Writes the APIResponse as a line to a file.
+ If file exists, appends to it.
+ """
+ with open(filename, "a") as f:
+ f.write(json.dumps(self.to_dict()) + "\n")
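Since `to_dict`/`from_dict` round-trip through plain JSON-compatible dicts, responses can be appended as JSONL with `write_to_file` and reloaded later. A small sketch, assuming a log written earlier via `resp.write_to_file("responses.jsonl")` (the filename is illustrative):

```python
import json

from lm_deluge.api_requests.response import APIResponse

# Each line is a JSON object that from_dict() turns back into an APIResponse.
with open("responses.jsonl") as f:
    responses = [APIResponse.from_dict(json.loads(line)) for line in f]

# Cost is recomputed from usage in __post_init__, so this reflects the registry prices.
print(sum(r.cost or 0.0 for r in responses))
```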
lm_deluge/client.py CHANGED
@@ -6,6 +6,7 @@ import yaml
  from pydantic import BaseModel
  from pydantic.functional_validators import model_validator

+ from lm_deluge.api_requests.openai import stream_chat
  from lm_deluge.batches import (
  submit_batches_anthropic,
  submit_batches_oa,
@@ -34,6 +35,12 @@ class LLMClient(BaseModel):
  """

  model_names: list[str] = ["gpt-4.1-mini"]
+
+ def __init__(self, model_name: str | list[str] | None = None, **kwargs):
+ if model_name is not None:
+ kwargs["model_names"] = model_name
+ super().__init__(**kwargs)
+
  max_requests_per_minute: int = 1_000
  max_tokens_per_minute: int = 100_000
  max_concurrent_requests: int = 225
@@ -81,7 +88,7 @@ class LLMClient(BaseModel):
  @model_validator(mode="before")
  @classmethod
  def fix_lists(cls, data) -> "LLMClient":
- if isinstance(data["model_names"], str):
+ if isinstance(data.get("model_names"), str):
  data["model_names"] = [data["model_names"]]
  if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
  data["sampling_params"] = [
@@ -162,6 +169,11 @@ class LLMClient(BaseModel):
  kwargs["model_names"] = model
  return cls(**kwargs)

+ def _select_model(self):
+ assert isinstance(self.model_weights, list)
+ model_idx = np.random.choice(range(len(self.models)), p=self.model_weights)
+ return self.models[model_idx], self.sampling_params[model_idx]
+
  @overload
  async def process_prompts_async(
  self,
@@ -249,41 +261,6 @@ class LLMClient(BaseModel):
  if len(cache_hit_ids) > 0:
  tracker.update_pbar(len(cache_hit_ids))

- # api_task = asyncio.create_task(
- # process_api_prompts_async(
- # ids,
- # prompts, # type: ignore -- fix later for dry running conversations
- # self.models,
- # self.model_weights, # type: ignore
- # self.sampling_params, # type: ignore
- # max_attempts=self.max_attempts,
- # max_concurrent_requests=self.max_concurrent_requests,
- # request_timeout=self.request_timeout,
- # status_tracker=tracker,
- # tools=tools,
- # cache=cache,
- # computer_use=computer_use,
- # display_width=display_width,
- # display_height=display_height,
- # use_responses_api=use_responses_api,
- # )
- # )
- # async def process_api_prompts_async(
-
- # models: str | list[str],
- # model_weights: list[float],
- # sampling_params: list[SamplingParams],
- # max_attempts: int = 5,
- # max_concurrent_requests: int = 1_000,
- # request_timeout: int = 30,
- # status_tracker: StatusTracker | None = None,
- # tools: list[Tool] | None = None,
- # cache: CachePattern | None = None,
- # computer_use: bool = False,
- # display_width: int = 1024,
- # display_height: int = 768,
- # use_responses_api: bool = False,
- # ):
  if isinstance(ids, np.ndarray):
  ids = ids.tolist() # pyright: ignore

@@ -296,28 +273,28 @@ class LLMClient(BaseModel):
  assert tracker.retry_queue, "retry queue not initialized"
  while True:
  # get next request (if one is not already waiting for capacity)
+ retry_request = False
  if next_request is None:
  if not tracker.retry_queue.empty():
  next_request = tracker.retry_queue.get_nowait()
+ retry_request = True
  print(f"Retrying request {next_request.task_id}.")
  elif prompts_not_finished:
  try:
  # get new request
  id, prompt = next(prompts_iter)
  # select model
- assert isinstance(self.model_weights, list)
- model_idx = np.random.choice(
- range(len(self.models)), p=self.model_weights
- )
+ model, sampling_params = self._select_model()
+
  next_request = create_api_request(
  task_id=id,
- model_name=self.models[model_idx],
+ model_name=model,
  prompt=prompt, # type: ignore
  request_timeout=self.request_timeout,
  attempts_left=self.max_attempts,
  status_tracker=tracker,
  results_arr=requests,
- sampling_params=self.sampling_params[model_idx],
+ sampling_params=sampling_params,
  all_model_names=self.models,
  all_sampling_params=self.sampling_params,
  tools=tools,
@@ -339,10 +316,9 @@ class LLMClient(BaseModel):
  # if enough capacity available, call API
  if next_request:
  next_request_tokens = next_request.num_tokens
- if tracker.check_capacity(next_request_tokens):
+ if tracker.check_capacity(next_request_tokens, retry=retry_request):
  tracker.set_limiting_factor(None)
- next_request.attempts_left -= 1
- # call API
+ # call API (attempts_left will be decremented in handle_error if it fails)
  asyncio.create_task(next_request.call_api())
  next_request = None # reset next_request to empty
  # update pbar status
@@ -360,9 +336,10 @@ class LLMClient(BaseModel):
  await asyncio.sleep(tracker.seconds_to_pause)
  print(f"Pausing {tracker.seconds_to_pause}s to cool down.")

- # after finishing, log final status
- tracker.log_final_status()
- # deduplicate results by id
+ # after finishing, log final status
+ tracker.log_final_status()
+
+ # deduplicate results by id
  api_results = deduplicate_responses(requests)
  for res in api_results:
  results[res.id] = res
@@ -399,6 +376,17 @@ class LLMClient(BaseModel):
  )
  )

+ async def stream(self, prompt: str | Conversation, tools: list[Tool] | None = None):
+ model, sampling_params = self._select_model()
+ if isinstance(prompt, str):
+ prompt = Conversation.user(prompt)
+ async for item in stream_chat(model, prompt, sampling_params, tools, None):
+ if isinstance(item, str):
+ print(item, end="", flush=True)
+ else:
+ # final item
+ return item
+
  async def submit_batch_job(
  self,
  prompts: Sequence[str | list[dict] | Conversation],
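Two user-facing additions stand out in this file: `LLMClient` now accepts the model name(s) positionally, and `client.stream(...)` prints tokens as they arrive before returning the final `APIResponse`. A usage sketch under those assumptions (model name illustrative; requires the matching API key in the environment):

```python
import asyncio

from lm_deluge import LLMClient


async def main() -> None:
    # The new __init__ forwards a positional model name to model_names.
    client = LLMClient("gpt-4.1-mini", max_requests_per_minute=500)

    # stream() picks a model via _select_model(), prints deltas as they
    # arrive, and returns the terminal APIResponse.
    response = await client.stream("Write a haiku about rate limits.")
    if response is not None:
        print("\n---\n", response.completion)


asyncio.run(main())
```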
lm_deluge/config.py CHANGED
@@ -1,13 +1,14 @@
- from pydantic import BaseModel
  from typing import Literal

+ from pydantic import BaseModel
+

  class SamplingParams(BaseModel):
  temperature: float = 0.0
  top_p: float = 1.0
  json_mode: bool = False
  max_new_tokens: int = 512
- reasoning_effort: Literal["low", "medium", "high", None] = None
+ reasoning_effort: Literal["low", "medium", "high", "none", None] = None
  logprobs: bool = False
  top_logprobs: int | None = None
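A minimal sketch of the widened literal; per the openai.py hunk earlier in this diff, the Responses API path keeps "none" for Gemini models and bumps it to "low" for OpenAI reasoning models, which cannot fully disable reasoning:

```python
from lm_deluge.config import SamplingParams

# "none" is now a valid reasoning_effort value.
params = SamplingParams(max_new_tokens=512, reasoning_effort="none")
print(params.reasoning_effort)
```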
 
lm_deluge/file.py ADDED
@@ -0,0 +1,149 @@
+ import os
+ import io
+ import requests
+ import base64
+ import mimetypes
+ import xxhash
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+
+ @dataclass(slots=True)
+ class File:
+ # raw bytes, pathlike, http url, base64 data url, or file_id
+ data: bytes | io.BytesIO | Path | str
+ media_type: str | None = None # inferred if None
+ filename: str | None = None # optional filename for uploads
+ file_id: str | None = None # for OpenAI file uploads or Anthropic file API
+ type: str = field(init=False, default="file")
+
+ # helpers -----------------------------------------------------------------
+ def _bytes(self) -> bytes:
+ if isinstance(self.data, bytes):
+ return self.data
+ elif isinstance(self.data, io.BytesIO):
+ return self.data.getvalue()
+ elif isinstance(self.data, str) and self.data.startswith("http"):
+ res = requests.get(self.data)
+ res.raise_for_status()
+ return res.content
+ elif isinstance(self.data, str) and os.path.exists(self.data):
+ with open(self.data, "rb") as f:
+ return f.read()
+ elif isinstance(self.data, Path) and self.data.exists():
+ return Path(self.data).read_bytes()
+ elif isinstance(self.data, str) and self.data.startswith("data:"):
+ header, encoded = self.data.split(",", 1)
+ return base64.b64decode(encoded)
+ else:
+ raise ValueError("unreadable file format")
+
+ def _mime(self) -> str:
+ if self.media_type:
+ return self.media_type
+ if isinstance(self.data, (Path, str)):
+ # For URL or path, try to guess from the string
+ path_str = str(self.data)
+ guess = mimetypes.guess_type(path_str)[0]
+ if guess:
+ return guess
+ return "application/pdf" # default to PDF
+
+ def _filename(self) -> str:
+ if self.filename:
+ return self.filename
+ if isinstance(self.data, (Path, str)):
+ path_str = str(self.data)
+ if path_str.startswith("http"):
+ # Extract filename from URL
+ return path_str.split("/")[-1].split("?")[0] or "document.pdf"
+ else:
+ # Extract from local path
+ return os.path.basename(path_str) or "document.pdf"
+ return "document.pdf"
+
+ def _base64(self, include_header: bool = True) -> str:
+ encoded = base64.b64encode(self._bytes()).decode("utf-8")
+ if not include_header:
+ return encoded
+ return f"data:{self._mime()};base64,{encoded}"
+
+ @property
+ def fingerprint(self) -> str:
+ # Hash the file contents for fingerprinting
+ file_bytes = self._bytes()
+ return xxhash.xxh64(file_bytes).hexdigest()
+
+ @property
+ def size(self) -> int:
+ """Return file size in bytes."""
+ return len(self._bytes())
+
+ # ── provider-specific emission ────────────────────────────────────────────
+ def oa_chat(self) -> dict:
+ """For OpenAI Chat Completions - file content as base64 or file_id."""
+ if self.file_id:
+ return {
+ "type": "file",
+ "file": {
+ "file_id": self.file_id,
+ },
+ }
+ else:
+ return {
+ "type": "file",
+ "file": {
+ "filename": self._filename(),
+ "file_data": self._base64(),
+ },
+ }
+
+ def oa_resp(self) -> dict:
+ """For OpenAI Responses API - file content as base64 or file_id."""
+ if self.file_id:
+ return {
+ "type": "input_file",
+ "file_id": self.file_id,
+ }
+ else:
+ return {
+ "type": "input_file",
+ "filename": self._filename(),
+ "file_data": self._base64(),
+ }
+
+ def anthropic(self) -> dict:
+ """For Anthropic Messages API - file content as base64 or file_id."""
+ if self.file_id:
+ return {
+ "type": "document",
+ "source": {
+ "type": "file",
+ "file_id": self.file_id,
+ },
+ }
+ else:
+ b64 = base64.b64encode(self._bytes()).decode()
+ return {
+ "type": "document",
+ "source": {
+ "type": "base64",
+ "media_type": self._mime(),
+ "data": b64,
+ },
+ }
+
+ def anthropic_file_upload(self) -> tuple[str, bytes, str]:
+ """For Anthropic Files API - return tuple for file upload."""
+ filename = self._filename()
+ content = self._bytes()
+ media_type = self._mime()
+ return filename, content, media_type
+
+ def gemini(self) -> dict:
+ """For Gemini API - not yet supported."""
+ raise NotImplementedError("File support for Gemini is not yet implemented")
+
+ def mistral(self) -> dict:
+ """For Mistral API - not yet supported."""
+ raise NotImplementedError("File support for Mistral is not yet implemented")
lm_deluge/prompt.py CHANGED
@@ -1,12 +1,15 @@
  import io
  import json
- import tiktoken
- import xxhash
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Literal, Sequence
- from lm_deluge.models import APIModel
+
+ import tiktoken
+ import xxhash
+
+ from lm_deluge.file import File
  from lm_deluge.image import Image
+ from lm_deluge.models import APIModel

  CachePattern = Literal[
  "tools_only",
@@ -203,7 +206,7 @@ class Thinking:
  return {"type": "text", "text": f"[Thinking: {self.content}]"}


- Part = Text | Image | ToolCall | ToolResult | Thinking
+ Part = Text | Image | File | ToolCall | ToolResult | Thinking


  ###############################################################################
@@ -246,6 +249,11 @@ class Message:
  """Get all image parts with proper typing."""
  return [part for part in self.parts if part.type == "image"] # type: ignore

+ @property
+ def files(self) -> list[File]:
+ """Get all file parts with proper typing."""
+ return [part for part in self.parts if part.type == "file"] # type: ignore
+
  @property
  def thinking_parts(self) -> list["Thinking"]:
  """Get all thinking parts with proper typing."""
@@ -262,6 +270,9 @@ class Message:
  elif isinstance(p, Image): # Image – redact the bytes, keep a hint
  w, h = p.size
  content_blocks.append({"type": "image", "tag": f"<Image ({w}×{h})>"})
+ elif isinstance(p, File): # File – redact the bytes, keep a hint
+ size = p.size
+ content_blocks.append({"type": "file", "tag": f"<File ({size} bytes)>"})
  elif isinstance(p, ToolCall):
  content_blocks.append(
  {
@@ -296,6 +307,9 @@ class Message:
  elif p["type"] == "image":
  # We only stored a placeholder tag, so keep that placeholder.
  parts.append(Image(p["tag"], detail="low"))
+ elif p["type"] == "file":
+ # We only stored a placeholder tag, so keep that placeholder.
+ parts.append(File(p["tag"]))
  elif p["type"] == "tool_call":
  parts.append(
  ToolCall(id=p["id"], name=p["name"], arguments=p["arguments"])
@@ -340,6 +354,20 @@ class Message:
  self.parts.append(img)
  return self

+ def add_file(
+ self,
+ data: bytes | str | Path | io.BytesIO,
+ *,
+ media_type: str | None = None,
+ filename: str | None = None,
+ ) -> "Message":
+ """
+ Append a file block and return self for chaining.
+ """
+ file = File(data, media_type=media_type, filename=filename)
+ self.parts.append(file)
+ return self
+
  def add_tool_call(self, id: str, name: str, arguments: dict) -> "Message":
  """Append a tool call block and return self for chaining."""
  self.parts.append(ToolCall(id=id, name=name, arguments=arguments))
@@ -362,12 +390,15 @@ class Message:
  text: str | None = None,
  *,
  image: str | bytes | Path | io.BytesIO | None = None,
+ file: str | bytes | Path | io.BytesIO | None = None,
  ) -> "Message":
  res = cls("user", [])
  if text is not None:
  res.add_text(text)
  if image is not None:
  res.add_image(image)
+ if file is not None:
+ res.add_file(file)
  return res

  @classmethod
@@ -403,6 +434,19 @@ class Message:
  part_list.append(Text(item["text"]))
  elif item["type"] == "image_url":
  part_list.append(Image(data=item["image_url"]["url"]))
+ elif item["type"] == "file":
+ file_data = item["file"]
+ if "file_id" in file_data:
+ # Handle file ID reference (not implemented yet)
+ part_list.append(File(data=file_data["file_id"]))
+ elif "file_data" in file_data:
+ # Handle base64 file data
+ part_list.append(
+ File(
+ data=file_data["file_data"],
+ filename=file_data.get("filename"),
+ )
+ )
  parts = part_list

  # Handle tool calls (assistant messages)
@@ -511,11 +555,17 @@ class Conversation:

  @classmethod
  def user(
- cls, text: str, *, image: bytes | str | Path | None = None
+ cls,
+ text: str,
+ *,
+ image: bytes | str | Path | None = None,
+ file: bytes | str | Path | None = None,
  ) -> "Conversation":
- msg = (
- Message.user(text) if image is None else Message.user(text).add_image(image)
- )
+ msg = Message.user(text)
+ if image is not None:
+ msg.add_image(image)
+ if file is not None:
+ msg.add_file(file)
  return cls([msg])

  @classmethod
@@ -677,6 +727,9 @@ class Conversation:
  if isinstance(part, Image):
  # Force conversion to bytes if not already
  part.data = part._bytes()
+ elif isinstance(part, File):
+ # Force conversion to bytes if not already
+ part.data = part._bytes()
  return self

  def _add_cache_control_to_message(self, message: dict) -> None:
@@ -765,6 +818,11 @@ class Conversation:
  content_blocks.append(
  {"type": "image", "tag": f"<Image ({w}×{h})>"}
  )
+ elif isinstance(p, File): # File – redact the bytes, keep a hint
+ size = p.size
+ content_blocks.append(
+ {"type": "file", "tag": f"<File ({size} bytes)>"}
+ )
  elif isinstance(p, ToolCall):
  content_blocks.append(
  {
@@ -795,7 +853,7 @@ class Conversation:

  for m in payload.get("messages", []):
  role: Role = m["role"] # 'system' | 'user' | 'assistant'
- parts: list[Text | Image | ToolCall | ToolResult | Thinking] = []
+ parts: list[Part] = []

  for p in m["content"]:
  if p["type"] == "text":
@@ -804,6 +862,9 @@ class Conversation:
  # We only stored a placeholder tag, so keep that placeholder.
  # You could raise instead if real image bytes are required.
  parts.append(Image(p["tag"], detail="low"))
+ elif p["type"] == "file":
+ # We only stored a placeholder tag, so keep that placeholder.
+ parts.append(File(p["tag"]))
  elif p["type"] == "tool_call":
  parts.append(
  ToolCall(id=p["id"], name=p["name"], arguments=p["arguments"])
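With `File` wired into the message parts, attaching a PDF mirrors the existing image helpers. A sketch (the file path is illustrative):

```python
from lm_deluge.prompt import Conversation, Message

# Build the conversation directly with a file...
conv = Conversation.user("What are the key findings?", file="paper.pdf")

# ...or chain onto a message, mirroring add_image.
msg = Message.user("What are the key findings?").add_file(
    "paper.pdf", media_type="application/pdf"
)
print(len(msg.files))  # the new .files property collects File parts
```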
lm_deluge/tracker.py CHANGED
@@ -67,7 +67,7 @@ class StatusTracker:
  def set_limiting_factor(self, factor):
  self.limiting_factor = factor

- def check_capacity(self, num_tokens: int):
+ def check_capacity(self, num_tokens: int, retry: bool = False):
  request_available = self.available_request_capacity >= 1
  tokens_available = self.available_token_capacity >= num_tokens
  concurrent_request_available = (
@@ -76,8 +76,10 @@ class StatusTracker:
  if request_available and tokens_available and concurrent_request_available:
  self.available_request_capacity -= 1
  self.available_token_capacity -= num_tokens
- self.num_tasks_started += 1
- self.num_tasks_in_progress += 1
+ if not retry:
+ # Only count new tasks, not retries
+ self.num_tasks_started += 1
+ self.num_tasks_in_progress += 1
  self.set_limiting_factor(None)
  return True
  else:
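The `retry` flag keeps retried requests from inflating the started/in-progress counters a second time. A toy stand-in (not the library's `StatusTracker`, whose constructor is not shown in this diff) illustrating only the counting change:

```python
class ToyTracker:
    def __init__(self) -> None:
        self.num_tasks_started = 0
        self.num_tasks_in_progress = 0

    def check_capacity(self, num_tokens: int, retry: bool = False) -> bool:
        # rate-limit bookkeeping elided; only the counting change is illustrated
        if not retry:
            self.num_tasks_started += 1
            self.num_tasks_in_progress += 1
        return True


t = ToyTracker()
t.check_capacity(100)              # new request: counted once
t.check_capacity(100, retry=True)  # retried request: not counted again
assert t.num_tasks_started == 1
```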
lm_deluge-0.0.14.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lm_deluge
- Version: 0.0.13
+ Version: 0.0.14
  Summary: Python utility for using LLM API models.
  Author-email: Benjamin Anderson <ben@trytaylor.ai>
  Requires-Python: >=3.10
@@ -30,6 +30,7 @@ Dynamic: license-file
  `lm-deluge` is a lightweight helper library for maxing out your rate limits with LLM providers. It provides the following:

  - **Unified client** – Send prompts to all relevant models with a single client.
+ - **Files and Images** - Include images easily for multimodal models, and PDF files for models that support them (OpenAI and Anthropic).
  - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
  - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
  - **Tool Use** – Unified API for defining tools for all providers, and creating tools automatically from python functions.
@@ -41,6 +42,8 @@ Dynamic: license-file

  **STREAMING IS NOT IN SCOPE.** There are plenty of packages that let you stream chat completions across providers. The sole purpose of this package is to do very fast batch inference using APIs. Sorry!

+ **Update 06/02/2025:** I lied, it supports (very basic) streaming now via client.stream(...). It will print tokens as they arrive, then return an APIResponse at the end. More sophisticated streaming may or may not be implemented later, don't count on it.
+
  ## Installation

  ```bash
lm_deluge-0.0.14.dist-info/RECORD CHANGED
@@ -1,26 +1,28 @@
- lm_deluge/__init__.py,sha256=XR_EuBvJM4LggqfWdsrdQij1-UIGAFwyvHW9Rp8tnQA,280
+ lm_deluge/__init__.py,sha256=mAztMuxINmh7dGbYnT8tsmw1eryQAvd0jpY8yHzd0EE,315
  lm_deluge/agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  lm_deluge/batches.py,sha256=dI5G9uvmoDU9hMohrkEhlIDyJPsmsVwZPwxx6qETxxk,17728
  lm_deluge/cache.py,sha256=VB1kv8rM2t5XWPR60uhszFcxLDnVKOe1oA5hYjVDjIo,4375
- lm_deluge/client.py,sha256=nkYO_wsGgUkFfqfb_8JrDzcU39RL9FfplKEK6zrncAo,20564
- lm_deluge/config.py,sha256=E47daVMvqMicoY2CDcgUnN5nVGDLAQejR358B-pRHZk,923
+ lm_deluge/client.py,sha256=kMHA3VlCRk_Ly1CiJ6rRz2GxttxhVuw6WEQtdMVrK-4,19806
+ lm_deluge/config.py,sha256=H1tQyJDNHGFuwxqQNL5Z-CjWAC0luHSBA3iY_pxmACM,932
  lm_deluge/embed.py,sha256=CO-TOlC5kOTAM8lcnicoG4u4K664vCBwHF1vHa-nAGg,13382
  lm_deluge/errors.py,sha256=oHjt7YnxWbh-eXMScIzov4NvpJMo0-2r5J6Wh5DQ1tk,209
+ lm_deluge/file.py,sha256=9l-zWKoHPnPhTL_CZNbxyoKwbLxlXHkRU2bz43qxaV4,5311
  lm_deluge/gemini_limits.py,sha256=V9mpS9JtXYz7AY6OuKyQp5TuIMRH1BVv9YrSNmGmHNA,1569
  lm_deluge/image.py,sha256=hFbRajqEVQbkirAfOxsTPkeq-27Zl-so4AWBFeUbpBI,7161
  lm_deluge/models.py,sha256=gW9ZhKYjwC-ZF-SzWqagFUE_7Mqerdtt_T5NxGo040E,46583
- lm_deluge/prompt.py,sha256=dKaV4gI9yLB0w0Ukdz14kGl34yMm5JNm6Sc-24WQPcg,32202
+ lm_deluge/prompt.py,sha256=KOuJFwpRKuz2F5WLniZzjOTW05I--mzYyMglr-s47F8,34601
  lm_deluge/rerank.py,sha256=-NBAJdHz9OB-SWWJnHzkFmeVO4wR6lFV7Vw-SxG7aVo,11457
  lm_deluge/tool.py,sha256=C2zwU9-7fldfYT0TZDoVVGGSC6dN_It9GSxnfkN6Z_w,9822
- lm_deluge/tracker.py,sha256=Un2uthRNZk3dl2fODvvR6CCyFW3IKWfR0GjvpB_dxoM,9095
+ lm_deluge/tracker.py,sha256=4QQ0-H01KQp8x8KccidBIJWA5zfSQyA0kgTynvSG0gk,9202
  lm_deluge/usage.py,sha256=oS-rmF3ZJ1RMtR7WI6BB2uVOAjJg0scvGF3zZRahWVg,4449
  lm_deluge/api_requests/__init__.py,sha256=_aSpD6CJL9g6OpLPoChXiHjl4MH_OlGcKgfZaW8cgLM,71
  lm_deluge/api_requests/anthropic.py,sha256=itKPu1cqCYcrr4fkLarlvSYr6tqLEAGVLGXEG05QXWM,8345
- lm_deluge/api_requests/base.py,sha256=ixI326EtRadoVCbmvIddzzzIp6E_zPfPOIfDEnucZrc,18060
+ lm_deluge/api_requests/base.py,sha256=THgCceZ_z9YjA_E9WWME5f2tIRSOOI2OAQCAWVlV-Xg,12448
  lm_deluge/api_requests/bedrock.py,sha256=yh4-zMrjlQfmxoBbrc2WYJ8gEqVkTP_-tMR7-XbTAtQ,11753
  lm_deluge/api_requests/common.py,sha256=pcOpODL4heoaNLjbA6_ogkrOAbUSKY3F37D2EyMLW10,359
  lm_deluge/api_requests/mistral.py,sha256=PkuoKbOJAB6DOK_NvzbxpWPAktfvonf69QjC0tVCYuE,5366
- lm_deluge/api_requests/openai.py,sha256=fj-ioXeK6-OGl9VIFpVy6XJRYOvf6TgMv7eu5mkC8RE,16482
+ lm_deluge/api_requests/openai.py,sha256=HUn83Y_Roo3pCUTBnrQhL9skW_PJ4OvS5gr5rIg58dU,19366
+ lm_deluge/api_requests/response.py,sha256=X6AHXv-4dWHLKkPv7J0MSesweunqxIqJED6UY6ypdzE,5770
  lm_deluge/api_requests/deprecated/bedrock.py,sha256=WrcIShCoO8JCUSlFOCHxg6KQCNTZfw3TpYTvSpYk4mA,11320
  lm_deluge/api_requests/deprecated/cohere.py,sha256=KgDScD6_bWhAzOY5BHZQKSA3kurt4KGENqC4wLsGmcU,5142
  lm_deluge/api_requests/deprecated/deepseek.py,sha256=FEApI93VAWDwuaqTooIyKMgONYqRhdUmiAPBRme-IYs,4582
@@ -35,8 +37,8 @@ lm_deluge/util/json.py,sha256=_4Oar2Cmz2L1DK3EtPLPDxD6rsYHxjROmV8ZpmMjQ-4,5822
  lm_deluge/util/logprobs.py,sha256=UkBZakOxWluaLqHrjARu7xnJ0uCHVfLGHJdnYlEcutk,11768
  lm_deluge/util/validation.py,sha256=hz5dDb3ebvZrZhnaWxOxbNSVMI6nmaOODBkk0htAUhs,1575
  lm_deluge/util/xml.py,sha256=Ft4zajoYBJR3HHCt2oHwGfymGLdvp_gegVmJ-Wqk4Ck,10547
- lm_deluge-0.0.13.dist-info/licenses/LICENSE,sha256=uNNXGXPCw2TC7CUs7SEBkA-Mz6QBQFWUUEWDMgEs1dU,1058
- lm_deluge-0.0.13.dist-info/METADATA,sha256=GEkP9_w0VcPOGEKad9Yh24WOhiW4TQvC2pX4wK1x0jk,11549
- lm_deluge-0.0.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- lm_deluge-0.0.13.dist-info/top_level.txt,sha256=hqU-TJX93yBwpgkDtYcXyLr3t7TLSCCZ_reytJjwBaE,10
- lm_deluge-0.0.13.dist-info/RECORD,,
+ lm_deluge-0.0.14.dist-info/licenses/LICENSE,sha256=uNNXGXPCw2TC7CUs7SEBkA-Mz6QBQFWUUEWDMgEs1dU,1058
+ lm_deluge-0.0.14.dist-info/METADATA,sha256=iK9UuTpf235TbQQ6CkrLX725loOMSdwTscZJQgEHeoo,11942
+ lm_deluge-0.0.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lm_deluge-0.0.14.dist-info/top_level.txt,sha256=hqU-TJX93yBwpgkDtYcXyLr3t7TLSCCZ_reytJjwBaE,10
+ lm_deluge-0.0.14.dist-info/RECORD,,