lm-deluge 0.0.53__tar.gz → 0.0.55__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lm-deluge has been flagged as possibly problematic.
- {lm_deluge-0.0.53/src/lm_deluge.egg-info → lm_deluge-0.0.55}/PKG-INFO +1 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/pyproject.toml +1 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/__init__.py +3 -4
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/base.py +6 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/response.py +28 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/client.py +67 -124
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/extract.py +7 -5
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/__init__.py +4 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/anthropic.py +20 -2
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/google.py +20 -12
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/openai.py +18 -8
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/tracker.py +17 -10
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/usage.py +30 -21
- {lm_deluge-0.0.53 → lm_deluge-0.0.55/src/lm_deluge.egg-info}/PKG-INFO +1 -1
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/LICENSE +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/README.md +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/setup.cfg +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/agent.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/anthropic.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/bedrock.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/common.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/gemini.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/mistral.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/api_requests/openai.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/batches.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/base.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/built_in_tools/openai.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/cli.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/config.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/file.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/gemini_limits.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/classify.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/locate.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/ocr.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/bedrock.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/cerebras.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/cohere.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/deepseek.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/fireworks.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/grok.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/groq.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/meta.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/mistral.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/openrouter.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/models/together.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/presets/cerebras.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/presets/meta.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/prompt.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/request_context.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/tool.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/harmony.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/spatial.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/SOURCES.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/requires.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/tests/test_builtin_tools.py +0 -0
- {lm_deluge-0.0.53 → lm_deluge-0.0.55}/tests/test_native_mcp_server.py +0 -0
src/lm_deluge/__init__.py
@@ -1,10 +1,9 @@
-from .client import LLMClient, SamplingParams
+from .client import APIResponse, LLMClient, SamplingParams
+from .file import File
 from .prompt import Conversation, Message
 from .tool import Tool
-from .file import File
-import dotenv

-dotenv.load_dotenv()
+# dotenv.load_dotenv() - don't do this, fucks with other packages

 __all__ = [
     "LLMClient",

src/lm_deluge/api_requests/base.py
@@ -52,6 +52,9 @@ class APIRequestBase(ABC):
         self, base_headers: dict[str, str], exclude_patterns: list[str] | None = None
     ) -> dict[str, str]:
         """Merge extra_headers with base headers, giving priority to extra_headers."""
+        # Filter out None values from base headers (e.g., missing API keys)
+        base_headers = {k: v for k, v in base_headers.items() if v is not None}
+
         if not self.context.extra_headers:
             return base_headers

@@ -69,6 +72,9 @@ class APIRequestBase(ABC):
         # Start with base headers, then overlay filtered extra headers (extra takes precedence)
         merged = dict(base_headers)
         merged.update(filtered_extra)
+
+        # Filter out None values from final merged headers
+        merged = {k: v for k, v in merged.items() if v is not None}
         return merged

     def handle_success(self, data):
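
Note on the change above: the effect is just two dict comprehensions that drop None values before and after the merge. A standalone sketch of that behavior, using plain dicts only (merge_headers is an illustrative name, not an lm-deluge function):

def merge_headers(base_headers: dict, extra_headers: dict | None = None) -> dict:
    # Drop None values from the base headers (e.g., an unset API key)
    base_headers = {k: v for k, v in base_headers.items() if v is not None}
    merged = dict(base_headers)
    merged.update(extra_headers or {})
    # Drop None values again after overlaying the extras
    return {k: v for k, v in merged.items() if v is not None}

print(merge_headers({"x-api-key": None, "anthropic-version": "2023-06-01"}, {"x-custom": "1"}))
# -> {'anthropic-version': '2023-06-01', 'x-custom': '1'}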

src/lm_deluge/api_requests/response.py
@@ -84,10 +84,37 @@ class APIResponse:
             and api_model.input_cost is not None
             and api_model.output_cost is not None
         ):
+            # Calculate input cost, accounting for cached vs non-cached tokens
+            # Different providers report tokens differently:
+            # - Anthropic/Bedrock: input_tokens is ONLY non-cached, cache_read_tokens is separate
+            # - OpenAI/Gemini: input_tokens INCLUDES cached, cache_read_tokens is a subset
+            cache_read_tokens = self.usage.cache_read_tokens or 0
+
+            if api_model.api_spec in ("anthropic", "bedrock"):
+                # For Anthropic: input_tokens already excludes cache, so use directly
+                non_cached_input_tokens = self.usage.input_tokens
+            else:
+                # For OpenAI/Gemini: input_tokens includes cache, so subtract it
+                non_cached_input_tokens = self.usage.input_tokens - cache_read_tokens
+
             self.cost = (
-
+                non_cached_input_tokens * api_model.input_cost / 1e6
                 + self.usage.output_tokens * api_model.output_cost / 1e6
             )
+
+            # Add cost for cache read tokens (at reduced rate)
+            if cache_read_tokens > 0 and api_model.cached_input_cost is not None:
+                self.cost += cache_read_tokens * api_model.cached_input_cost / 1e6
+
+            # Add cost for cache write tokens (only for Anthropic)
+            if (
+                self.usage.cache_write_tokens
+                and self.usage.cache_write_tokens > 0
+                and api_model.cache_write_cost is not None
+            ):
+                self.cost += (
+                    self.usage.cache_write_tokens * api_model.cache_write_cost / 1e6
+                )
         elif self.content is not None and self.completion is not None:
             pass
             # print(
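
A rough worked example of the new cost math, using the Sonnet-class rates added elsewhere in this diff (3.00 input, 0.30 cached input, 3.75 cache write, 15.00 output, all per million tokens); the token counts are made up:

# Anthropic-style usage: input_tokens already excludes cached reads
input_tokens, cache_read, cache_write, output_tokens = 1_000, 9_000, 2_000, 500

cost = (
    input_tokens * 3.00 / 1e6      # non-cached input
    + output_tokens * 15.00 / 1e6  # output
    + cache_read * 0.30 / 1e6      # cache reads at the reduced rate
    + cache_write * 3.75 / 1e6     # cache writes (Anthropic/Bedrock only)
)
print(round(cost, 6))  # ≈ 0.0207

# For OpenAI/Gemini-style usage the non-cached portion would instead be
# input_tokens - cache_read, since their input_tokens figure includes cached reads.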

src/lm_deluge/client.py
@@ -30,6 +30,7 @@ class _LLMClient(BaseModel):
     """

     model_names: str | list[str] = ["gpt-4.1-mini"]
+    name: str | None = None
     max_requests_per_minute: int = 1_000
     max_tokens_per_minute: int = 100_000
     max_concurrent_requests: int = 225
@@ -69,6 +70,7 @@ class _LLMClient(BaseModel):
             max_requests_per_minute=self.max_requests_per_minute,
             max_tokens_per_minute=self.max_tokens_per_minute,
             max_concurrent_requests=self.max_concurrent_requests,
+            client_name=self.name or "LLMClient",
             progress_style=self.progress,
             use_progress_bar=show_progress,
         )
@@ -80,6 +82,22 @@ class _LLMClient(BaseModel):
             self._tracker.log_final_status()
             self._tracker = None

+    def reset_tracker(self):
+        """Reset tracker by closing and reopening with fresh state.
+
+        Useful when reusing a client across multiple batches and you want
+        the progress bar to start from 0 instead of showing cumulative totals.
+        """
+        if self._tracker is None:
+            return
+
+        # Close existing tracker (including progress bar)
+        show_progress = self._tracker.use_progress_bar
+        self.close()
+
+        # Create fresh tracker
+        self.open(total=0, show_progress=show_progress)
+
     # NEW! Builder methods
     def with_model(self, model: str):
         self.model_names = [model]
@@ -153,6 +171,13 @@ class _LLMClient(BaseModel):
         # normalize weights
        self.model_weights = [w / sum(self.model_weights) for w in self.model_weights]

+        # Auto-generate name if not provided
+        if self.name is None:
+            if len(self.model_names) == 1:
+                self.name = self.model_names[0]
+            else:
+                self.name = "LLMClient"
+
         # Validate logprobs settings across all sampling params
         if self.logprobs or any(sp.logprobs for sp in self.sampling_params):
             print("Logprobs enabled.")
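
A hypothetical usage sketch for the new name field and reset_tracker() added above (the constructor argument and method names follow this diff; the batching workflow itself is assumed, not taken from the package docs). The client.py diff continues below.

from lm_deluge import LLMClient

client = LLMClient("gpt-4.1-mini", name="summarizer")  # name is shown in progress output
client.open(total=0, show_progress=True)
# ... run a first batch of requests ...

# Between batches, reset the tracker so the progress bar starts again from 0
client.reset_tracker()
# ... run a second batch ...
client.close()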

@@ -353,147 +378,61 @@ class _LLMClient(BaseModel):
         cache: CachePattern | None = None,
         use_responses_api: bool = False,
     ) -> list[APIResponse | None] | list[str | None] | dict[str, int]:
-
-        prompts = prompts_to_conversations(prompts)
-        ids = list(range(len(prompts)))
-        results: list[APIResponse | None] = [None for _ in range(len(prompts))]
-        contexts: list[RequestContext | None] = [None for _ in range(len(prompts))]
-        inflight_tasks: set[asyncio.Task[None]] = set()
-        # Use existing tracker if client has been opened; otherwise open/close automatically
-        tracker: StatusTracker
-        tracker_preopened = self._tracker is not None
-        if tracker_preopened:
-            tracker = self._tracker  # type: ignore[assignment]
-            tracker.add_to_total(len(prompts))
-        else:
-            self.open(total=len(prompts), show_progress=show_progress)
-            tracker = self._tracker  # type: ignore[assignment]
-        assert tracker is not None
-
-        # Create retry queue for failed requests
-        retry_queue: asyncio.Queue[RequestContext] = asyncio.Queue()
+        """Process multiple prompts asynchronously using the start_nowait/wait_for_all backend.

-
-
-
-        #
-
-        next_is_retry = False  # Track whether next_context is a retry
-        prompts_not_finished = True
-        prompts_iter = iter(zip(ids, prompts))
-
-        while True:
-            # Get next context (retry or new) - only if we don't already have one waiting
-            if next_context is None:
-                if not retry_queue.empty():
-                    next_context = retry_queue.get_nowait()
-                    next_is_retry = True
-                    print(f"Retrying request {next_context.task_id}.")
-                elif prompts_not_finished:
-                    try:
-                        task_id, prompt = next(prompts_iter)
-                        model, sampling_params = self._select_model()
-                        assert isinstance(prompt, Conversation)
-                        next_context = RequestContext(
-                            task_id=task_id,
-                            model_name=model,
-                            prompt=prompt,
-                            sampling_params=sampling_params,
-                            attempts_left=self.max_attempts,
-                            request_timeout=self.request_timeout,
-                            status_tracker=tracker,
-                            tools=tools,
-                            cache=cache,
-                            use_responses_api=use_responses_api,
-                            extra_headers=self.extra_headers,
-                            force_local_mcp=self.force_local_mcp,
-                        )
-
-                        next_is_retry = False
-                    except StopIteration:
-                        prompts_not_finished = False
-
-            # Dispatch using shared capacity gate (consistent with start_nowait)
-            if next_context:
-                # Wait here until we have capacity to launch this context
-                await self._wait_for_capacity(
-                    next_context.num_tokens, tracker, retry=next_is_retry
-                )
-
-                # Launch simplified request processing
-                contexts[next_context.task_id] = next_context
-
-                async def process_and_store(ctx: RequestContext):
-                    try:
-                        response = await self.process_single_request(ctx, retry_queue)
-                        results[ctx.task_id] = response
-                    except BaseException as exc:
-                        # Capture cancellations and other BaseExceptions before fallback response fires.
-                        error_response = APIResponse(
-                            id=ctx.task_id,
-                            model_internal=ctx.model_name,
-                            prompt=ctx.prompt,
-                            sampling_params=ctx.sampling_params,
-                            status_code=None,
-                            is_error=True,
-                            error_message=f"{type(exc).__name__}: {exc}",
-                            raw_response={"exception_repr": repr(exc)},
-                        )
-                        results[ctx.task_id] = error_response
-                        if ctx.status_tracker:
-                            ctx.status_tracker.task_failed(ctx.task_id)
-                        raise
-
-                task = asyncio.create_task(process_and_store(next_context))
-                inflight_tasks.add(task)
-                task.add_done_callback(inflight_tasks.discard)
-                next_context = None  # Reset after successful dispatch
-                next_is_retry = False
-
-            # Update progress - original logic
-            tracker.update_pbar()
-
-            # Check completion: consider final outcomes, not in-progress count
-            # This avoids rare hangs if in-progress is miscounted (e.g., double-increment).
-            if (tracker.num_tasks_succeeded + tracker.num_tasks_failed) >= len(
-                prompts
-            ) and retry_queue.empty():
-                break
+        This implementation creates all tasks upfront and waits for them to complete,
+        avoiding issues with tracker state accumulating across multiple calls.
+        """
+        # Convert prompts to Conversations
+        prompts = prompts_to_conversations(prompts)

-
-
+        # Ensure tracker exists (start_nowait will call add_to_total for each task)
+        if self._tracker is None:
+            self.open(total=0, show_progress=show_progress)
+            tracker_preopened = False
+        else:
+            tracker_preopened = True
+
+        # Start all tasks using start_nowait - tasks will coordinate via shared capacity lock
+        task_ids = []
+        for prompt in prompts:
+            assert isinstance(prompt, Conversation)
+            task_id = self.start_nowait(
+                prompt,
+                tools=tools,
+                cache=cache,
+                use_responses_api=use_responses_api,
+            )
+            task_ids.append(task_id)

-
-
+        # Wait for all tasks to complete
+        results = await self.wait_for_all(task_ids)

+        # Close tracker if we opened it
         if not tracker_preopened:
             self.close()

+        # Defensive check: This should rarely happen, but provides a safety net
         for idx, response in enumerate(results):
             if response is None:
-
-
-
-
-                    if ctx
-                    else self.sampling_params[0]
-                    if self.sampling_params
-                    else SamplingParams()
+                # This should only happen if there's a bug in _run_context
+                print(
+                    f"WARNING: result[{idx}] is None! Creating defensive error response. "
+                    f"Please report this bug."
                 )
-                model_name = ctx.model_name if ctx else self.model_names[0]
-                assert isinstance(
-                    prompt, Conversation
-                ), "expected prompt to be a conversation"
                 results[idx] = APIResponse(
                     id=idx,
-                    model_internal=
-                    prompt=
-                    sampling_params=sampling_params
+                    model_internal=self.model_names[0],
+                    prompt=prompts[idx],  # type: ignore
+                    sampling_params=self.sampling_params[0]
+                    if self.sampling_params
+                    else SamplingParams(),
                     status_code=None,
                     is_error=True,
                     error_message="Internal error: no response produced.",
                 )

+        # Handle return format
         if return_completions_only:
             return [r.completion if r is not None else None for r in results]

@@ -795,6 +734,7 @@ class _LLMClient(BaseModel):
 def LLMClient(
     model_names: str,
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -821,6 +761,7 @@ def LLMClient(
 def LLMClient(
     model_names: list[str],
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -846,6 +787,7 @@ def LLMClient(
 def LLMClient(
     model_names: str | list[str] = "gpt-4.1-mini",
     *,
+    name: str | None = None,
     max_requests_per_minute: int = 1_000,
     max_tokens_per_minute: int = 100_000,
     max_concurrent_requests: int = 225,
@@ -883,6 +825,7 @@ def LLMClient(
     # Simply pass everything to the Pydantic constructor
     return _LLMClient(
         model_names=model_names,
+        name=name,
         max_requests_per_minute=max_requests_per_minute,
         max_tokens_per_minute=max_tokens_per_minute,
         max_concurrent_requests=max_concurrent_requests,
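
The rewritten process_prompts_async above delegates to start_nowait/wait_for_all instead of running its own dispatch loop, retry queue, and in-flight task set. The same "enqueue everything, then await everything" shape in plain asyncio, purely for illustration (no lm-deluge types involved):

import asyncio

async def fake_request(i: int) -> str:
    await asyncio.sleep(0.01)  # stand-in for one rate-limited API call
    return f"response {i}"

async def process_all(n: int) -> list[str]:
    # "start_nowait": create every task up front
    tasks = [asyncio.create_task(fake_request(i)) for i in range(n)]
    # "wait_for_all": collect results in submission order
    return list(await asyncio.gather(*tasks))

print(asyncio.run(process_all(3)))  # ['response 0', 'response 1', 'response 2']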

src/lm_deluge/llm_tools/extract.py
@@ -1,11 +1,12 @@
 import asyncio
 import io
 import json
+import os
 from typing import Any

+from lm_deluge.client import _LLMClient
 from lm_deluge.file import File

-from ..client import LLMClient
 from ..prompt import Conversation
 from ..util.json import load_json

@@ -18,7 +19,7 @@ except ImportError:
 async def extract_async(
     inputs: list[str | Any],
     schema: Any,
-    client:
+    client: _LLMClient,
     document_name: str | None = None,
     object_name: str | None = None,
     show_progress: bool = True,
@@ -32,12 +33,13 @@ async def extract_async(
         raise ValueError("schema must be a pydantic model or a dict.")

     # warn if json_mode is not True
+    has_warned = os.environ.get("LM_DELUGE_WARN_JSON_MODE", False)
     for sp in client.sampling_params:
-        if sp.json_mode is False:
+        if sp.json_mode is False and not has_warned:
             print(
                 "Warning: json_mode is False for one or more sampling params. You may get invalid output."
             )
-
+            os.environ["LM_DELUGE_WARN_JSON_MODE"] = "True"
     # check_schema(schema_dict) -- figure out later
     if document_name is None:
         document_name = "text"
@@ -111,7 +113,7 @@
 def extract(
     inputs: list[str | Any],
     schema: Any,
-    client:
+    client: _LLMClient,
     document_name: str | None = None,
     object_name: str | None = None,
     show_progress: bool = True,

src/lm_deluge/models/__init__.py
@@ -29,7 +29,8 @@ class APIModel:
     api_base: str
     api_key_env_var: str
     api_spec: str
-    cached_input_cost: float | None = 0
+    cached_input_cost: float | None = 0  # $ per million cached/read input tokens
+    cache_write_cost: float | None = 0  # $ per million cache write tokens
     input_cost: float | None = 0  # $ per million input tokens
     output_cost: float | None = 0  # $ per million output tokens
     supports_json: bool = False
@@ -89,6 +90,7 @@ def register_model(
     api_spec: str = "openai",
     input_cost: float | None = 0,  # $ per million input tokens
     cached_input_cost: float | None = 0,
+    cache_write_cost: float | None = 0,  # $ per million cache write tokens
     output_cost: float | None = 0,  # $ per million output tokens
     supports_json: bool = False,
     supports_logprobs: bool = False,
@@ -106,6 +108,7 @@ def register_model(
         api_key_env_var=api_key_env_var,
         api_spec=api_spec,
         cached_input_cost=cached_input_cost,
+        cache_write_cost=cache_write_cost,
         input_cost=input_cost,
         output_cost=output_cost,
         supports_json=supports_json,

src/lm_deluge/models/anthropic.py
@@ -18,6 +18,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -30,6 +32,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 15.0,
+        "cached_input_cost": 1.50,
+        "cache_write_cost": 18.75,
         "output_cost": 75.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -43,6 +47,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 15.0,
+        "cached_input_cost": 1.50,
+        "cache_write_cost": 18.75,
         "output_cost": 75.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -56,6 +62,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -68,6 +76,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -81,6 +91,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -93,6 +105,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 3.0,
+        "cached_input_cost": 0.30,
+        "cache_write_cost": 3.75,
         "output_cost": 15.0,
         "requests_per_minute": 4_000,
         "tokens_per_minute": 400_000,
@@ -116,8 +130,10 @@ ANTHROPIC_MODELS = {
         "api_key_env_var": "ANTHROPIC_API_KEY",
         "supports_json": False,
         "api_spec": "anthropic",
-        "input_cost":
-        "
+        "input_cost": 0.8,
+        "cached_input_cost": 0.08,
+        "cache_write_cost": 1.00,
+        "output_cost": 4.00,
         "requests_per_minute": 20_000,
         "tokens_per_minute": 4_000_000,  # supposed to be this but they fucked up
     },
@@ -129,6 +145,8 @@ ANTHROPIC_MODELS = {
         "supports_json": False,
         "api_spec": "anthropic",
         "input_cost": 0.25,
+        "cache_write_cost": 0.30,
+        "cached_input_cost": 0.03,
         "output_cost": 1.25,
         "requests_per_minute": 10_000,
         "tokens_per_minute": 4_000_000,  # supposed to be this but they fucked up

src/lm_deluge/models/google.py
@@ -18,6 +18,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -31,8 +32,8 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.
-        "output_cost": 0.
+        "input_cost": 0.075,
+        "output_cost": 0.3,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -45,8 +46,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 1.25,
+        "cached_input_cost": 0.31,
+        "output_cost": 10.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -59,8 +61,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "openai",
-        "input_cost": 0.
-        "
+        "input_cost": 0.3,
+        "cached_input_cost": 0.075,
+        "output_cost": 2.5,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -74,6 +77,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -89,6 +93,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "gemini",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -102,8 +107,8 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.
-        "output_cost": 0.
+        "input_cost": 0.075,
+        "output_cost": 0.3,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -116,8 +121,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost":
-        "
+        "input_cost": 1.25,
+        "cached_input_cost": 0.31,
+        "output_cost": 10.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -130,8 +136,9 @@ GOOGLE_MODELS = {
         "supports_json": True,
         "supports_logprobs": False,
         "api_spec": "gemini",
-        "input_cost": 0.
-        "
+        "input_cost": 0.3,
+        "cached_input_cost": 0.075,
+        "output_cost": 2.5,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -145,6 +152,7 @@ GOOGLE_MODELS = {
         "supports_logprobs": False,
         "api_spec": "gemini",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,

src/lm_deluge/models/openai.py
@@ -75,8 +75,8 @@ OPENAI_MODELS = {
         "supports_logprobs": False,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "output_cost":
+        "input_cost": 3.0,
+        "output_cost": 12.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": False,
@@ -90,8 +90,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 2.0,
+        "cached_input_cost": 0.50,
+        "output_cost": 8.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -106,6 +107,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 1.1,
+        "cached_input_cost": 0.275,
         "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -121,6 +123,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 2.0,
+        "cached_input_cost": 0.50,
         "output_cost": 8.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -136,6 +139,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.4,
+        "cached_input_cost": 0.10,
         "output_cost": 1.6,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -151,6 +155,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.1,
+        "cached_input_cost": 0.025,
         "output_cost": 0.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -181,6 +186,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 1.1,
+        "cached_input_cost": 0.55,
         "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -196,6 +202,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 15.0,
+        "cached_input_cost": 7.50,
         "output_cost": 60.0,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
@@ -225,8 +232,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 1.1,
+        "cached_input_cost": 0.55,
+        "output_cost": 4.4,
         "requests_per_minute": 20,
         "tokens_per_minute": 100_000,
         "reasoning_model": True,
@@ -240,8 +248,9 @@ OPENAI_MODELS = {
         "supports_logprobs": True,
         "supports_responses": True,
         "api_spec": "openai",
-        "input_cost":
-        "
+        "input_cost": 2.50,
+        "cached_input_cost": 1.25,
+        "output_cost": 10.0,
         "requests_per_minute": 10_000,
         "tokens_per_minute": 30_000_000,
     },
@@ -255,6 +264,7 @@ OPENAI_MODELS = {
         "supports_responses": True,
         "api_spec": "openai",
         "input_cost": 0.15,
+        "cached_input_cost": 0.075,
         "output_cost": 0.6,
         "requests_per_minute": 60_000,
         "tokens_per_minute": 250_000_000,

src/lm_deluge/tracker.py
@@ -13,7 +13,6 @@ from rich.progress import (
     TaskID,
     TextColumn,
 )
-from rich.text import Text
 from tqdm.auto import tqdm

 SECONDS_TO_PAUSE_AFTER_RATE_LIMIT_ERROR = 5
@@ -24,6 +23,7 @@ class StatusTracker:
     max_requests_per_minute: int
     max_tokens_per_minute: int
     max_concurrent_requests: int
+    client_name: str = "LLMClient"
     num_tasks_started: int = 0
     num_tasks_in_progress: int = 0
     num_tasks_succeeded: int = 0
@@ -187,14 +187,16 @@ class StatusTracker:

     def _init_rich_display(self, total: int):
         """Initialize Rich display components."""
-        self._rich_console = Console()
+        self._rich_console = Console(highlight=False)
+        # Escape square brackets so Rich doesn't interpret them as markup
+        description = f"[bold blue]\\[{self.client_name}][/bold blue] Processing..."
         self._rich_progress = Progress(
             SpinnerColumn(),
-            TextColumn("
+            TextColumn("[progress.description]{task.description}"),
             BarColumn(),
             MofNCompleteColumn(),
         )
-        self._rich_task_id = self._rich_progress.add_task(
+        self._rich_task_id = self._rich_progress.add_task(description, total=total)
         self._rich_stop_event = asyncio.Event()
         self._rich_display_task = asyncio.create_task(self._rich_display_updater())

@@ -217,12 +219,17 @@ class StatusTracker:
                     total=self.progress_bar_total,
                 )

-                tokens_info = f"
-                reqs_info = f"
-                in_progress =
-
+                tokens_info = f"{self.available_token_capacity / 1000:.1f}k/{self.max_tokens_per_minute / 1000:.1f}k TPM"
+                reqs_info = f"{int(self.available_request_capacity)}/{self.max_requests_per_minute} RPM"
+                in_progress = (
+                    f" [gold3]In Progress:[/gold3] {int(self.num_tasks_in_progress)} "
+                    + ("requests" if self.num_tasks_in_progress != 1 else "request")
+                )
+                capacity_text = (
+                    f" [gold3]Capacity:[/gold3] {tokens_info} • {reqs_info}"
+                )

-                display = Group(self._rich_progress, capacity_text)
+                display = Group(self._rich_progress, in_progress, capacity_text)
                 live.update(display)

                 await asyncio.sleep(0.1)
@@ -252,7 +259,7 @@ class StatusTracker:
             return
         while not self._manual_stop_event.is_set():
             print(
-                f"Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
+                f"[{self.client_name}] Completed {self.num_tasks_succeeded}/{self.progress_bar_total} requests"
             )
             await asyncio.sleep(self.progress_print_interval)

src/lm_deluge/usage.py
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import Optional


 @dataclass
@@ -13,8 +12,8 @@ class Usage:

     input_tokens: int = 0
     output_tokens: int = 0
-    cache_read_tokens:
-    cache_write_tokens:
+    cache_read_tokens: int = 0
+    cache_write_tokens: int = 0

     @property
     def total_input_tokens(self) -> int:
@@ -47,18 +46,29 @@ class Usage:
         return cls(
             input_tokens=usage_data.get("input_tokens", 0),
             output_tokens=usage_data.get("output_tokens", 0),
-            cache_read_tokens=usage_data.get("cache_read_input_tokens"),
-            cache_write_tokens=usage_data.get("cache_creation_input_tokens"),
+            cache_read_tokens=usage_data.get("cache_read_input_tokens", 0),
+            cache_write_tokens=usage_data.get("cache_creation_input_tokens", 0),
         )

     @classmethod
     def from_openai_usage(cls, usage_data: dict) -> "Usage":
-        """Create Usage from OpenAI API response usage data.
+        """Create Usage from OpenAI API response usage data.
+
+        OpenAI supports prompt caching - cached tokens appear in prompt_tokens_details.cached_tokens.
+        Caching is automatic for prompts over 1024 tokens.
+        """
+        prompt_tokens_details = usage_data.get("prompt_tokens_details", {})
+        cached_tokens = (
+            prompt_tokens_details.get("cached_tokens", 0)
+            if prompt_tokens_details
+            else 0
+        )
+
         return cls(
             input_tokens=usage_data.get("prompt_tokens", 0),
             output_tokens=usage_data.get("completion_tokens", 0),
-            cache_read_tokens=
-            cache_write_tokens=
+            cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+            cache_write_tokens=0,  # OpenAI doesn't charge separately for cache writes
         )

     @classmethod
@@ -67,18 +77,23 @@ class Usage:
         return cls(
             input_tokens=usage_data.get("prompt_tokens", 0),
             output_tokens=usage_data.get("completion_tokens", 0),
-            cache_read_tokens=
-            cache_write_tokens=
+            cache_read_tokens=0,  # Mistral doesn't support caching
+            cache_write_tokens=0,
         )

     @classmethod
     def from_gemini_usage(cls, usage_data: dict) -> "Usage":
-        """Create Usage from Gemini API response usage data.
+        """Create Usage from Gemini API response usage data.
+
+        Gemini supports context caching - cached tokens appear in cachedContentTokenCount.
+        """
+        cached_tokens = usage_data.get("cachedContentTokenCount", 0)
+
         return cls(
             input_tokens=usage_data.get("promptTokenCount", 0),
             output_tokens=usage_data.get("candidatesTokenCount", 0),
-            cache_read_tokens=
-            cache_write_tokens=
+            cache_read_tokens=cached_tokens if cached_tokens > 0 else 0,
+            cache_write_tokens=0,  # Gemini doesn't charge separately for cache writes
         )

     def to_dict(self) -> dict:
@@ -100,8 +115,8 @@ class Usage:
         return cls(
             input_tokens=data.get("input_tokens", 0),
             output_tokens=data.get("output_tokens", 0),
-            cache_read_tokens=data.get("cache_read_tokens"),
-            cache_write_tokens=data.get("cache_write_tokens"),
+            cache_read_tokens=data.get("cache_read_tokens", 0),
+            cache_write_tokens=data.get("cache_write_tokens", 0),
         )

     def __add__(self, other: "Usage") -> "Usage":
@@ -111,14 +126,8 @@ class Usage:
             output_tokens=self.output_tokens + other.output_tokens,
             cache_read_tokens=(
                 (self.cache_read_tokens or 0) + (other.cache_read_tokens or 0)
-                if self.cache_read_tokens is not None
-                or other.cache_read_tokens is not None
-                else None
             ),
             cache_write_tokens=(
                 (self.cache_write_tokens or 0) + (other.cache_write_tokens or 0)
-                if self.cache_write_tokens is not None
-                or other.cache_write_tokens is not None
-                else None
             ),
         )
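
A quick check of the new cache-aware parsing, feeding from_openai_usage a hand-written payload shaped like an OpenAI usage block (the numbers are made up; the import path follows this package layout, and whether Usage is also re-exported at the top level isn't shown here):

from lm_deluge.usage import Usage

u = Usage.from_openai_usage(
    {
        "prompt_tokens": 1200,
        "completion_tokens": 80,
        "prompt_tokens_details": {"cached_tokens": 1024},
    }
)
print(u.input_tokens, u.output_tokens, u.cache_read_tokens, u.cache_write_tokens)
# -> 1200 80 1024 0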