speedy-utils 1.1.2__tar.gz → 1.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/PKG-INFO +1 -1
  2. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/pyproject.toml +1 -1
  3. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/async_lm.py +48 -78
  4. speedy_utils-1.1.4/src/llm_utils/scripts/vllm_load_balancer.py +882 -0
  5. speedy_utils-1.1.2/src/llm_utils/scripts/vllm_load_balancer.py +0 -509
  6. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/README.md +0 -0
  7. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/__init__.py +0 -0
  8. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/chat_format/__init__.py +0 -0
  9. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/chat_format/display.py +0 -0
  10. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/chat_format/transform.py +0 -0
  11. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/chat_format/utils.py +0 -0
  12. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/group_messages.py +0 -0
  13. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/__init__.py +0 -0
  14. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/chat_html.py +0 -0
  15. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/lm_json.py +0 -0
  16. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/sync_lm.py +0 -0
  17. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/utils.py +0 -0
  18. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/scripts/README.md +0 -0
  19. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/scripts/vllm_serve.py +0 -0
  20. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/__init__.py +0 -0
  21. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/all.py +0 -0
  22. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/__init__.py +0 -0
  23. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/clock.py +0 -0
  24. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/function_decorator.py +0 -0
  25. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/logger.py +0 -0
  26. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/notebook_utils.py +0 -0
  27. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/report_manager.py +0 -0
  28. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/utils_cache.py +0 -0
  29. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/utils_io.py +0 -0
  30. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/utils_misc.py +0 -0
  31. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/common/utils_print.py +0 -0
  32. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/multi_worker/__init__.py +0 -0
  33. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/multi_worker/process.py +0 -0
  34. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/multi_worker/thread.py +0 -0
  35. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/scripts/__init__.py +0 -0
  36. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/scripts/mpython.py +0 -0
  37. {speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
{speedy_utils-1.1.2 → speedy_utils-1.1.4}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: speedy-utils
-Version: 1.1.2
+Version: 1.1.4
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com
{speedy_utils-1.1.2 → speedy_utils-1.1.4}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "speedy-utils"
-version = "1.1.2"
+version = "1.1.4"
 description = "Fast and easy-to-use package for data science"
 authors = ["AnhVTH <anhvth.226@gmail.com>"]
 readme = "README.md"
{speedy_utils-1.1.2 → speedy_utils-1.1.4}/src/llm_utils/lm/async_lm.py
@@ -1,77 +1,3 @@
-"""
-# ============================================================================= #
-# ASYNCHRONOUS LANGUAGE MODEL WRAPPER WITH CONCURRENT EXECUTION SUPPORT
-# ============================================================================= #
-#
-# Title & Intent:
-# High-performance asynchronous language model interface for concurrent LLM operations
-#
-# High-level Summary:
-# This module provides an async drop-in replacement for the synchronous LM class, designed
-# for high-throughput applications requiring concurrent language model operations. It maintains
-# full API compatibility while adding async/await semantics, connection pooling, and efficient
-# resource management. The AsyncLM class supports batch processing, concurrent request handling,
-# and maintains the same caching and type safety guarantees as the synchronous version.
-#
-# Public API / Data Contracts:
-# • AsyncLM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Async wrapper class
-# • async AsyncLM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
-# • async AsyncLM.list_models(port=None) -> List[str] - Enumerate available models
-# • async AsyncLM.count_tokens(messages, model=None) -> int - Token counting utility
-# • async AsyncLM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
-# • AsyncLM.set_model(model_name) -> None - Runtime model switching (sync method)
-# • async AsyncLM.batch_call(requests) -> List[Union[str, BaseModel]] - Concurrent batch processing
-# • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
-# • Messages = List[ChatCompletionMessageParam] - Typed message format
-#
-# Invariants / Constraints:
-# • MUST be used within async context (asyncio event loop required)
-# • MUST provide either 'prompt' or 'messages' parameter, but not both
-# • MUST properly await all async method calls
-# • Connection pooling MUST handle concurrent requests efficiently
-# • MUST maintain thread safety across concurrent operations
-# • Rate limit handling MUST use async backoff without blocking event loop
-# • MUST preserve all synchronous LM class behaviors and constraints
-# • Resource cleanup MUST occur on context manager exit or explicit close
-#
-# Usage Example:
-# ```python
-# import asyncio
-# from llm_utils.lm.async_lm import AsyncLM
-# from pydantic import BaseModel
-#
-# class SummaryResponse(BaseModel):
-#     summary: str
-#     key_points: List[str]
-#     confidence: float
-#
-# async def main():
-#     # Single async call
-#     lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
-#     response = await lm(prompt="Summarize quantum computing")
-#     print(response)
-#
-#     # Concurrent batch processing
-#     texts = ["Text 1 to summarize", "Text 2 to summarize", "Text 3 to summarize"]
-#     tasks = [lm(prompt=f"Summarize: {text}", response_format=SummaryResponse) for text in texts]
-#     summaries = await asyncio.gather(*tasks)
-#
-#     for summary in summaries:
-#         print(f"Summary: {summary.summary}")
-#         print(f"Key points: {summary.key_points}")
-#
-# asyncio.run(main())
-# ```
-#
-# TODO & Future Work:
-# • Add async context manager support for automatic resource cleanup
-# • Implement connection pool size optimization based on usage patterns
-# • Add async streaming response support with async generators
-# • Optimize memory usage for large-scale concurrent operations
-# • Add async rate limiting with priority queuing
-#
-# ============================================================================= #
-"""
 
 import base64
 import hashlib
@@ -110,7 +36,7 @@ from openai.types.chat import (
 )
 from openai.types.model import Model
 from pydantic import BaseModel
-
+from pydantic import ValidationError
 from llm_utils.chat_format.display import get_conversation_one_turn
 
 # --------------------------------------------------------------------------- #
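The hunks shown here add the ValidationError import but do not include the call sites that use it. As a reminder of the pattern this import normally supports, here is a minimal sketch, assuming pydantic v2 and a hypothetical Answer schema (neither the schema nor the helper is taken from the package):

import json

from pydantic import BaseModel, ValidationError


class Answer(BaseModel):  # hypothetical response schema, not from speedy-utils
    text: str
    score: float


def parse_structured(content: str) -> Answer | None:
    # Validate the raw JSON a model returned; ValidationError carries
    # field-level details about what did not match the schema.
    try:
        return Answer.model_validate(json.loads(content))
    except (json.JSONDecodeError, ValidationError) as exc:
        print(f"structured output rejected: {exc}")
        return None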
@@ -146,8 +72,8 @@ def _yellow(t):
     return _color(33, t)
 
 
+TParsed = TypeVar("TParsed", bound=BaseModel)
 
-TParsed = TypeVar('TParsed', bound=BaseModel)
 
 class ParsedOutput(TypedDict, Generic[TParsed]):
     messages: List
@@ -561,7 +487,49 @@ class AsyncLM:
 
         content = completion["choices"][0]["message"]["content"]
         if not content:
-            raise ValueError("Empty content in response")
+            # Enhanced error for debugging: show input tokens and their count
+
+            # Try to extract tokens from the completion for debugging
+            input_tokens = None
+            try:
+                input_tokens = completion.get('usage', {}).get('prompt_tokens')
+            except Exception:
+                input_tokens = None
+
+            # Try to get the prompt/messages for tokenization
+            prompt = None
+            try:
+                prompt = completion.get('messages') or completion.get('prompt')
+            except Exception:
+                prompt = None
+
+            tokens_preview = ''
+            if prompt is not None:
+                try:
+                    tokenizer = get_tokenizer(self.model)
+                    if isinstance(prompt, list):
+                        prompt_text = '\n'.join(
+                            m.get('content', '') for m in prompt if isinstance(m, dict)
+                        )
+                    else:
+                        prompt_text = str(prompt)
+                    tokens = tokenizer.encode(prompt_text)
+                    n_tokens = len(tokens)
+                    first_100 = tokens[:100]
+                    last_100 = tokens[-100:] if n_tokens > 100 else []
+                    tokens_preview = (
+                        f'\nInput tokens: {n_tokens}'
+                        f'\nFirst 100 tokens: {first_100}'
+                        f'\nLast 100 tokens: {last_100}'
+                    )
+                except Exception as exc:
+                    tokens_preview = f'\n[Tokenization failed: {exc}]'
+
+            raise ValueError(
+                f'Empty content in response.'
+                f'\nInput tokens (if available): {input_tokens}'
+                f'{tokens_preview}'
+            )
 
         try:
             data = json.loads(content)
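The empty-content branch above no longer raises a bare "Empty content in response": it now reports the prompt token count from the completion's usage block when available, and re-tokenizes the prompt to show the first and last 100 token ids. A caller-side sketch of consuming this richer error, assuming the AsyncLM(model=..., temperature=...) constructor and await lm(prompt=...) call described in the removed module docstring (the handler itself is illustrative):

import asyncio

from llm_utils.lm.async_lm import AsyncLM


async def ask_with_diagnostics(prompt: str) -> str | None:
    lm = AsyncLM(model="gpt-4o-mini", temperature=0.0)
    try:
        return await lm(prompt=prompt)
    except ValueError as exc:
        # In 1.1.4 the message includes the input token count and a
        # first/last-100 token preview, which helps spot truncated or
        # over-long prompts that come back with empty completions.
        print(f"empty completion: {exc}")
        return None


# asyncio.run(ask_with_diagnostics("Summarize quantum computing"))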
@@ -743,6 +711,7 @@ async def inspect_word_probs_async(lm, tokenizer, messages):
     """Async version of inspect_word_probs."""
 
     import numpy as np
+
 
     async def compute_word_log_probs(
         tokenizer: Any,
@@ -907,6 +876,7 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
         data: BaseModel | dict,
         temperature: float = 0.1,
         cache: bool = False,
+        think: Optional[bool] = None,  # if not None, overrides self.think
    ) -> tuple[OutputModelType, List[Dict[str, Any]]]:
         # Get the input and output model types from the generic parameters
         type_args = getattr(self.__class__, "__orig_bases__", None)
@@ -947,7 +917,7 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
             instruction=self.__doc__ or "",
             response_model=output_model,
             temperature=temperature or self.temperature,
-            think=self.think,
+            think=think if think is not None else self.think,
             add_json_schema_to_instruction=self.add_json_schema,
             cache=self.cache or cache,
         )
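The last two hunks thread a per-call think flag through AsyncLLMTask: a non-None argument now overrides the task-level self.think default, whereas previously the task default was always used. A standalone sketch of the resolution rule (the function name is illustrative; the expression mirrors the changed call site):

from typing import Optional


def resolve_think(per_call: Optional[bool], task_default: bool) -> bool:
    # Mirrors the new call-site expression:
    #   think=think if think is not None else self.think
    # An explicit False now disables thinking even when the task default
    # is True, which a plain `think or self.think` would not allow.
    return per_call if per_call is not None else task_default


assert resolve_think(None, True) is True    # fall back to the task default
assert resolve_think(False, True) is False  # explicit False wins
assert resolve_think(True, False) is True   # explicit True wins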