speedy-utils 1.1.0__tar.gz → 1.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/PKG-INFO +1 -1
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/pyproject.toml +1 -1
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/async_lm.py +63 -86
- speedy_utils-1.1.3/src/llm_utils/scripts/vllm_load_balancer.py +882 -0
- speedy_utils-1.1.0/src/llm_utils/scripts/vllm_load_balancer.py +0 -509
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/README.md +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/display.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/transform.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/utils.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/group_messages.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/chat_html.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/lm_json.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/sync_lm.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/utils.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/scripts/README.md +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/scripts/vllm_serve.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/all.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/clock.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/function_decorator.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/logger.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/notebook_utils.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/report_manager.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_cache.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_io.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_misc.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_print.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/multi_worker/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/multi_worker/process.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/multi_worker/thread.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/scripts/__init__.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/scripts/mpython.py +0 -0
- {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
{speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/async_lm.py +63 -86

```diff
@@ -1,84 +1,10 @@
-"""
-# ============================================================================= #
-# ASYNCHRONOUS LANGUAGE MODEL WRAPPER WITH CONCURRENT EXECUTION SUPPORT
-# ============================================================================= #
-#
-# Title & Intent:
-# High-performance asynchronous language model interface for concurrent LLM operations
-#
-# High-level Summary:
-# This module provides an async drop-in replacement for the synchronous LM class, designed
-# for high-throughput applications requiring concurrent language model operations. It maintains
-# full API compatibility while adding async/await semantics, connection pooling, and efficient
-# resource management. The AsyncLM class supports batch processing, concurrent request handling,
-# and maintains the same caching and type safety guarantees as the synchronous version.
-#
-# Public API / Data Contracts:
-# • AsyncLM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Async wrapper class
-# • async AsyncLM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
-# • async AsyncLM.list_models(port=None) -> List[str] - Enumerate available models
-# • async AsyncLM.count_tokens(messages, model=None) -> int - Token counting utility
-# • async AsyncLM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
-# • AsyncLM.set_model(model_name) -> None - Runtime model switching (sync method)
-# • async AsyncLM.batch_call(requests) -> List[Union[str, BaseModel]] - Concurrent batch processing
-# • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
-# • Messages = List[ChatCompletionMessageParam] - Typed message format
-#
-# Invariants / Constraints:
-# • MUST be used within async context (asyncio event loop required)
-# • MUST provide either 'prompt' or 'messages' parameter, but not both
-# • MUST properly await all async method calls
-# • Connection pooling MUST handle concurrent requests efficiently
-# • MUST maintain thread safety across concurrent operations
-# • Rate limit handling MUST use async backoff without blocking event loop
-# • MUST preserve all synchronous LM class behaviors and constraints
-# • Resource cleanup MUST occur on context manager exit or explicit close
-#
-# Usage Example:
-# ```python
-# import asyncio
-# from llm_utils.lm.async_lm import AsyncLM
-# from pydantic import BaseModel
-#
-# class SummaryResponse(BaseModel):
-#     summary: str
-#     key_points: List[str]
-#     confidence: float
-#
-# async def main():
-#     # Single async call
-#     lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
-#     response = await lm(prompt="Summarize quantum computing")
-#     print(response)
-#
-#     # Concurrent batch processing
-#     texts = ["Text 1 to summarize", "Text 2 to summarize", "Text 3 to summarize"]
-#     tasks = [lm(prompt=f"Summarize: {text}", response_format=SummaryResponse) for text in texts]
-#     summaries = await asyncio.gather(*tasks)
-#
-#     for summary in summaries:
-#         print(f"Summary: {summary.summary}")
-#         print(f"Key points: {summary.key_points}")
-#
-# asyncio.run(main())
-# ```
-#
-# TODO & Future Work:
-# • Add async context manager support for automatic resource cleanup
-# • Implement connection pool size optimization based on usage patterns
-# • Add async streaming response support with async generators
-# • Optimize memory usage for large-scale concurrent operations
-# • Add async rate limiting with priority queuing
-#
-# ============================================================================= #
-"""
 
 import base64
 import hashlib
 import json
 import os
 from abc import ABC
-from functools import lru_cache
+from functools import cache, lru_cache
 from typing import (
     Any,
     Dict,
@@ -110,7 +36,7 @@ from openai.types.chat import (
 )
 from openai.types.model import Model
 from pydantic import BaseModel
-
+from pydantic import ValidationError
 from llm_utils.chat_format.display import get_conversation_one_turn
 
 # --------------------------------------------------------------------------- #
```
```diff
@@ -146,10 +72,13 @@ def _yellow(t):
     return _color(33, t)
 
 
-
+TParsed = TypeVar("TParsed", bound=BaseModel)
+
+
+class ParsedOutput(TypedDict, Generic[TParsed]):
     messages: List
     completion: Any
-    parsed:
+    parsed: TParsed
 
 
 class AsyncLM:
```
```diff
@@ -460,7 +389,7 @@ class AsyncLM:
     # ------------------------------------------------------------------ #
     async def parse(
         self,
-        response_model: Type[
+        response_model: Type[TParsed],
         instruction: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[RawMsgs] = None,
@@ -470,7 +399,7 @@ class AsyncLM:
         max_tokens: Optional[int] = None,
         cache: Optional[bool] = True,
         **kwargs,
-    ) -> ParsedOutput
+    ) -> ParsedOutput[TParsed]:
         """Parse response using guided JSON generation."""
         if messages is None:
             assert instruction is not None, "Instruction must be provided."
```
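For readers skimming the diff: the two hunks above make `ParsedOutput` generic and thread `TParsed` through `parse()`, so the `parsed` field is typed as the concrete response model instead of a bare `BaseModel`. Below is a minimal, illustrative sketch of the caller-side effect; the `AsyncLM` constructor and the `parse()` keywords are taken from this diff and the removed module header, not verified against the released package.

```python
# Illustrative sketch only; signatures assumed from this diff, not the published API.
import asyncio
from typing import List

from pydantic import BaseModel

from llm_utils.lm.async_lm import AsyncLM


class SummaryResponse(BaseModel):
    summary: str
    key_points: List[str]


async def main() -> None:
    lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
    # parse() now returns ParsedOutput[SummaryResponse], so type checkers see
    # result["parsed"] as a SummaryResponse rather than an untyped value.
    result = await lm.parse(
        response_model=SummaryResponse,
        instruction="Summarize the given text.",
        prompt="Quantum computing uses qubits to ...",
    )
    summary: SummaryResponse = result["parsed"]
    print(summary.summary, summary.key_points)


asyncio.run(main())
```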
```diff
@@ -513,6 +442,7 @@ class AsyncLM:
 
         use_cache = self.do_cache if cache is None else cache
         cache_key = None
+        completion = None
         if use_cache:
             cache_data = {
                 "messages": messages,
@@ -522,7 +452,7 @@ class AsyncLM:
             }
             cache_key = self._cache_key(cache_data, {}, response_model)
             completion = self._load_cache(cache_key)  # dict
-
+        if not completion:
             completion = await self.client.chat.completions.create(
                 model=self.model,  # type: ignore
                 messages=messages,  # type: ignore
@@ -532,10 +462,12 @@ class AsyncLM:
             completion = completion.model_dump()
             if cache_key:
                 self._dump_cache(cache_key, completion)
-
+        assert isinstance(completion, dict), (
+            "Completion must be a dictionary with OpenAI response format."
+        )
         self.last_log = [prompt, messages, completion]
 
-        output = self._parse_complete_output(completion, response_model)
+        output = cast(TParsed, self._parse_complete_output(completion, response_model))
         full_messages = messages + [completion]
         return ParsedOutput(
             messages=full_messages,
```
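Taken together, the three hunks above implement a cache-aside flow inside `parse()`: start with `completion = None`, try the cache, call the endpoint only on a miss, write the result back, and assert the dict shape before parsing. A generic sketch of that pattern is below; `load_cache`, `dump_cache`, and `call_api` are hypothetical stand-ins for the package's private `_load_cache`/`_dump_cache` helpers and the OpenAI client call.

```python
# Generic cache-aside sketch of the flow introduced above; the helper callables
# are hypothetical stand-ins, not part of speedy-utils.
import hashlib
import json
from typing import Any, Callable, Dict, Optional


def cached_completion(
    cache_data: Dict[str, Any],
    call_api: Callable[[], Dict[str, Any]],
    load_cache: Callable[[str], Optional[Dict[str, Any]]],
    dump_cache: Callable[[str, Dict[str, Any]], None],
    use_cache: bool = True,
) -> Dict[str, Any]:
    cache_key: Optional[str] = None
    completion: Optional[Dict[str, Any]] = None
    if use_cache:
        # Stable key over the request parameters, mirroring the _cache_key() idea.
        cache_key = hashlib.sha256(
            json.dumps(cache_data, sort_keys=True).encode()
        ).hexdigest()
        completion = load_cache(cache_key)
    if not completion:
        completion = call_api()  # only hit the endpoint on a cache miss
        if cache_key:
            dump_cache(cache_key, completion)
    assert isinstance(completion, dict), "completion must be a dict"
    return completion
```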
```diff
@@ -555,7 +487,49 @@ class AsyncLM:
 
         content = completion["choices"][0]["message"]["content"]
         if not content:
-
+            # Enhanced error for debugging: show input tokens and their count
+
+            # Try to extract tokens from the completion for debugging
+            input_tokens = None
+            try:
+                input_tokens = completion.get('usage', {}).get('prompt_tokens')
+            except Exception:
+                input_tokens = None
+
+            # Try to get the prompt/messages for tokenization
+            prompt = None
+            try:
+                prompt = completion.get('messages') or completion.get('prompt')
+            except Exception:
+                prompt = None
+
+            tokens_preview = ''
+            if prompt is not None:
+                try:
+                    tokenizer = get_tokenizer(self.model)
+                    if isinstance(prompt, list):
+                        prompt_text = '\n'.join(
+                            m.get('content', '') for m in prompt if isinstance(m, dict)
+                        )
+                    else:
+                        prompt_text = str(prompt)
+                    tokens = tokenizer.encode(prompt_text)
+                    n_tokens = len(tokens)
+                    first_100 = tokens[:100]
+                    last_100 = tokens[-100:] if n_tokens > 100 else []
+                    tokens_preview = (
+                        f'\nInput tokens: {n_tokens}'
+                        f'\nFirst 100 tokens: {first_100}'
+                        f'\nLast 100 tokens: {last_100}'
+                    )
+                except Exception as exc:
+                    tokens_preview = f'\n[Tokenization failed: {exc}]'
+
+            raise ValueError(
+                f'Empty content in response.'
+                f'\nInput tokens (if available): {input_tokens}'
+                f'{tokens_preview}'
+            )
 
         try:
             data = json.loads(content)
```
```diff
@@ -737,6 +711,7 @@ async def inspect_word_probs_async(lm, tokenizer, messages):
     """Async version of inspect_word_probs."""
 
     import numpy as np
+
 
     async def compute_word_log_probs(
         tokenizer: Any,
@@ -894,12 +869,14 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
     temperature: float = 0.6
     think: bool = False
     add_json_schema: bool = False
+    cache: bool = False
 
     async def __call__(
         self,
         data: BaseModel | dict,
         temperature: float = 0.1,
         cache: bool = False,
+        think: Optional[bool] = None,  # if not None, overrides self.think
     ) -> tuple[OutputModelType, List[Dict[str, Any]]]:
         # Get the input and output model types from the generic parameters
         type_args = getattr(self.__class__, "__orig_bases__", None)
```
```diff
@@ -940,9 +917,9 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
             instruction=self.__doc__ or "",
             response_model=output_model,
             temperature=temperature or self.temperature,
-            think=self.think,
+            think=think if think is not None else self.think,
             add_json_schema_to_instruction=self.add_json_schema,
-            cache=cache,
+            cache=self.cache or cache,
         )
 
         return (
```
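The `AsyncLLMTask` hunks add a class-level `cache` default (OR-ed with the call-time flag) and a per-call `think` override. Below is a hedged sketch of how a subclass might use the new knobs, assuming the generic `[InputModelType, OutputModelType]` parameters and the docstring-as-instruction behavior visible in this diff; how the task is wired to a concrete endpoint is omitted.

```python
# Illustrative subclass only; everything beyond cache/think/temperature and the
# __call__ signature shown in this diff is an assumption.
import asyncio

from pydantic import BaseModel

from llm_utils.lm.async_lm import AsyncLLMTask


class ReviewInput(BaseModel):
    text: str


class ReviewVerdict(BaseModel):
    sentiment: str
    score: float


class ClassifyReview(AsyncLLMTask[ReviewInput, ReviewVerdict]):
    """Classify the sentiment of a product review."""  # docstring doubles as the instruction

    temperature = 0.2
    cache = True  # new in 1.1.3: class-level default, OR-ed with the call-time flag


async def run() -> None:
    task = ClassifyReview()  # model/endpoint configuration omitted in this sketch
    # New in 1.1.3: think=... at call time overrides the class attribute.
    verdict, messages = await task(ReviewInput(text="Great battery life!"), think=False)
    print(verdict.sentiment, verdict.score)


if __name__ == "__main__":
    asyncio.run(run())
```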