speedy-utils 1.1.0__tar.gz → 1.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/PKG-INFO +1 -1
  2. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/pyproject.toml +1 -1
  3. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/async_lm.py +63 -86
  4. speedy_utils-1.1.3/src/llm_utils/scripts/vllm_load_balancer.py +882 -0
  5. speedy_utils-1.1.0/src/llm_utils/scripts/vllm_load_balancer.py +0 -509
  6. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/README.md +0 -0
  7. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/__init__.py +0 -0
  8. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/__init__.py +0 -0
  9. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/display.py +0 -0
  10. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/transform.py +0 -0
  11. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/chat_format/utils.py +0 -0
  12. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/group_messages.py +0 -0
  13. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/__init__.py +0 -0
  14. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/chat_html.py +0 -0
  15. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/lm_json.py +0 -0
  16. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/sync_lm.py +0 -0
  17. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/lm/utils.py +0 -0
  18. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/scripts/README.md +0 -0
  19. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/llm_utils/scripts/vllm_serve.py +0 -0
  20. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/__init__.py +0 -0
  21. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/all.py +0 -0
  22. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/__init__.py +0 -0
  23. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/clock.py +0 -0
  24. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/function_decorator.py +0 -0
  25. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/logger.py +0 -0
  26. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/notebook_utils.py +0 -0
  27. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/report_manager.py +0 -0
  28. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_cache.py +0 -0
  29. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_io.py +0 -0
  30. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_misc.py +0 -0
  31. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/common/utils_print.py +0 -0
  32. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/multi_worker/__init__.py +0 -0
  33. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/multi_worker/process.py +0 -0
  34. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/multi_worker/thread.py +0 -0
  35. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/scripts/__init__.py +0 -0
  36. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/scripts/mpython.py +0 -0
  37. {speedy_utils-1.1.0 → speedy_utils-1.1.3}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: speedy-utils
-Version: 1.1.0
+Version: 1.1.3
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "speedy-utils"
-version = "1.1.0"
+version = "1.1.3"
 description = "Fast and easy-to-use package for data science"
 authors = ["AnhVTH <anhvth.226@gmail.com>"]
 readme = "README.md"
@@ -1,84 +1,10 @@
-"""
-# ============================================================================= #
-# ASYNCHRONOUS LANGUAGE MODEL WRAPPER WITH CONCURRENT EXECUTION SUPPORT
-# ============================================================================= #
-#
-# Title & Intent:
-# High-performance asynchronous language model interface for concurrent LLM operations
-#
-# High-level Summary:
-# This module provides an async drop-in replacement for the synchronous LM class, designed
-# for high-throughput applications requiring concurrent language model operations. It maintains
-# full API compatibility while adding async/await semantics, connection pooling, and efficient
-# resource management. The AsyncLM class supports batch processing, concurrent request handling,
-# and maintains the same caching and type safety guarantees as the synchronous version.
-#
-# Public API / Data Contracts:
-# • AsyncLM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Async wrapper class
-# • async AsyncLM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
-# • async AsyncLM.list_models(port=None) -> List[str] - Enumerate available models
-# • async AsyncLM.count_tokens(messages, model=None) -> int - Token counting utility
-# • async AsyncLM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
-# • AsyncLM.set_model(model_name) -> None - Runtime model switching (sync method)
-# • async AsyncLM.batch_call(requests) -> List[Union[str, BaseModel]] - Concurrent batch processing
-# • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
-# • Messages = List[ChatCompletionMessageParam] - Typed message format
-#
-# Invariants / Constraints:
-# • MUST be used within async context (asyncio event loop required)
-# • MUST provide either 'prompt' or 'messages' parameter, but not both
-# • MUST properly await all async method calls
-# • Connection pooling MUST handle concurrent requests efficiently
-# • MUST maintain thread safety across concurrent operations
-# • Rate limit handling MUST use async backoff without blocking event loop
-# • MUST preserve all synchronous LM class behaviors and constraints
-# • Resource cleanup MUST occur on context manager exit or explicit close
-#
-# Usage Example:
-# ```python
-# import asyncio
-# from llm_utils.lm.async_lm import AsyncLM
-# from pydantic import BaseModel
-#
-# class SummaryResponse(BaseModel):
-#     summary: str
-#     key_points: List[str]
-#     confidence: float
-#
-# async def main():
-#     # Single async call
-#     lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
-#     response = await lm(prompt="Summarize quantum computing")
-#     print(response)
-#
-#     # Concurrent batch processing
-#     texts = ["Text 1 to summarize", "Text 2 to summarize", "Text 3 to summarize"]
-#     tasks = [lm(prompt=f"Summarize: {text}", response_format=SummaryResponse) for text in texts]
-#     summaries = await asyncio.gather(*tasks)
-#
-#     for summary in summaries:
-#         print(f"Summary: {summary.summary}")
-#         print(f"Key points: {summary.key_points}")
-#
-# asyncio.run(main())
-# ```
-#
-# TODO & Future Work:
-# • Add async context manager support for automatic resource cleanup
-# • Implement connection pool size optimization based on usage patterns
-# • Add async streaming response support with async generators
-# • Optimize memory usage for large-scale concurrent operations
-# • Add async rate limiting with priority queuing
-#
-# ============================================================================= #
-"""
 
 import base64
 import hashlib
 import json
 import os
 from abc import ABC
-from functools import lru_cache
+from functools import cache, lru_cache
 from typing import (
     Any,
     Dict,
@@ -110,7 +36,7 @@ from openai.types.chat import (
 )
 from openai.types.model import Model
 from pydantic import BaseModel
-
+from pydantic import ValidationError
 from llm_utils.chat_format.display import get_conversation_one_turn
 
 # --------------------------------------------------------------------------- #
@@ -146,10 +72,13 @@ def _yellow(t):
     return _color(33, t)
 
 
-class ParsedOutput(TypedDict):
+TParsed = TypeVar("TParsed", bound=BaseModel)
+
+
+class ParsedOutput(TypedDict, Generic[TParsed]):
     messages: List
     completion: Any
-    parsed: BaseModel
+    parsed: TParsed
 
 
 class AsyncLM:
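
Context for the hunk above: making `ParsedOutput` generic lets `parse()` advertise the concrete Pydantic model in its return type instead of a bare `BaseModel`. Below is a minimal, illustrative sketch of that pattern, not code from the package; the `Address` model and `demo()` helper are hypothetical, and generic `TypedDict` requires Python 3.11+ (or `typing_extensions`).

```python
# Illustrative sketch of a generic TypedDict carrying the parsed model type.
from typing import Any, Generic, List, TypeVar, TypedDict

from pydantic import BaseModel

TParsed = TypeVar("TParsed", bound=BaseModel)


class ParsedOutput(TypedDict, Generic[TParsed]):
    messages: List[Any]
    completion: Any
    parsed: TParsed  # type checkers see the concrete model here, not BaseModel


class Address(BaseModel):
    city: str


def demo(parsed: Address) -> ParsedOutput[Address]:
    # A checker now knows result["parsed"].city is a str.
    return ParsedOutput(messages=[], completion=None, parsed=parsed)


print(demo(Address(city="Hanoi"))["parsed"].city)
```
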
@@ -460,7 +389,7 @@ class AsyncLM:
     # ------------------------------------------------------------------ #
     async def parse(
         self,
-        response_model: Type[BaseModel],
+        response_model: Type[TParsed],
         instruction: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[RawMsgs] = None,
@@ -470,7 +399,7 @@ class AsyncLM:
         max_tokens: Optional[int] = None,
         cache: Optional[bool] = True,
         **kwargs,
-    ) -> ParsedOutput:  # -> dict[str, Any]:
+    ) -> ParsedOutput[TParsed]:
         """Parse response using guided JSON generation."""
         if messages is None:
             assert instruction is not None, "Instruction must be provided."
@@ -513,6 +442,7 @@ class AsyncLM:
 
         use_cache = self.do_cache if cache is None else cache
         cache_key = None
+        completion = None
         if use_cache:
             cache_data = {
                 "messages": messages,
@@ -522,7 +452,7 @@ class AsyncLM:
             }
             cache_key = self._cache_key(cache_data, {}, response_model)
             completion = self._load_cache(cache_key)  # dict
-        else:
+        if not completion:
             completion = await self.client.chat.completions.create(
                 model=self.model,  # type: ignore
                 messages=messages,  # type: ignore
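
The `completion = None` initialization added in the previous hunk and the `else:` → `if not completion:` change above turn the cache lookup into a look-aside cache: a cache miss (or disabled caching) now falls through to a live request instead of being skipped. The following is a standalone sketch of that control flow under hypothetical helpers (`load_cache`, `save_cache`, `call_backend`), not the package's private `_load_cache`/`_dump_cache` machinery.

```python
# Look-aside cache sketch: try the cache first, hit the backend only on a miss.
import asyncio
from typing import Dict, Optional

_CACHE: Dict[str, dict] = {}


def load_cache(key: str) -> Optional[dict]:
    return _CACHE.get(key)  # None signals a miss


def save_cache(key: str, value: dict) -> None:
    _CACHE[key] = value


async def call_backend(key: str) -> dict:
    await asyncio.sleep(0)  # stand-in for the real network call
    return {"id": key, "content": "fresh result"}


async def cached_completion(key: str, use_cache: bool = True) -> dict:
    completion: Optional[dict] = None
    if use_cache:
        completion = load_cache(key)
    if not completion:  # cache miss, or caching disabled: do the real call
        completion = await call_backend(key)
        if use_cache:
            save_cache(key, completion)
    return completion


if __name__ == "__main__":
    print(asyncio.run(cached_completion("demo")))  # hits the backend
    print(asyncio.run(cached_completion("demo")))  # served from the cache
```
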
@@ -532,10 +462,12 @@ class AsyncLM:
             completion = completion.model_dump()
             if cache_key:
                 self._dump_cache(cache_key, completion)
-
+        assert isinstance(completion, dict), (
+            "Completion must be a dictionary with OpenAI response format."
+        )
         self.last_log = [prompt, messages, completion]
 
-        output = self._parse_complete_output(completion, response_model)
+        output = cast(TParsed, self._parse_complete_output(completion, response_model))
         full_messages = messages + [completion]
         return ParsedOutput(
             messages=full_messages,
@@ -555,7 +487,49 @@ class AsyncLM:
 
         content = completion["choices"][0]["message"]["content"]
         if not content:
-            raise ValueError("Empty content in response")
+            # Enhanced error for debugging: show input tokens and their count
+
+            # Try to extract tokens from the completion for debugging
+            input_tokens = None
+            try:
+                input_tokens = completion.get('usage', {}).get('prompt_tokens')
+            except Exception:
+                input_tokens = None
+
+            # Try to get the prompt/messages for tokenization
+            prompt = None
+            try:
+                prompt = completion.get('messages') or completion.get('prompt')
+            except Exception:
+                prompt = None
+
+            tokens_preview = ''
+            if prompt is not None:
+                try:
+                    tokenizer = get_tokenizer(self.model)
+                    if isinstance(prompt, list):
+                        prompt_text = '\n'.join(
+                            m.get('content', '') for m in prompt if isinstance(m, dict)
+                        )
+                    else:
+                        prompt_text = str(prompt)
+                    tokens = tokenizer.encode(prompt_text)
+                    n_tokens = len(tokens)
+                    first_100 = tokens[:100]
+                    last_100 = tokens[-100:] if n_tokens > 100 else []
+                    tokens_preview = (
+                        f'\nInput tokens: {n_tokens}'
+                        f'\nFirst 100 tokens: {first_100}'
+                        f'\nLast 100 tokens: {last_100}'
+                    )
+                except Exception as exc:
+                    tokens_preview = f'\n[Tokenization failed: {exc}]'
+
+            raise ValueError(
+                f'Empty content in response.'
+                f'\nInput tokens (if available): {input_tokens}'
+                f'{tokens_preview}'
+            )
 
         try:
             data = json.loads(content)
@@ -737,6 +711,7 @@ async def inspect_word_probs_async(lm, tokenizer, messages):
     """Async version of inspect_word_probs."""
 
     import numpy as np
+
 
     async def compute_word_log_probs(
         tokenizer: Any,
@@ -894,12 +869,14 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
     temperature: float = 0.6
     think: bool = False
     add_json_schema: bool = False
+    cache: bool = False
 
     async def __call__(
         self,
         data: BaseModel | dict,
         temperature: float = 0.1,
         cache: bool = False,
+        think: Optional[bool] = None,  # if not None, overrides self.think
     ) -> tuple[OutputModelType, List[Dict[str, Any]]]:
         # Get the input and output model types from the generic parameters
         type_args = getattr(self.__class__, "__orig_bases__", None)
@@ -940,9 +917,9 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
             instruction=self.__doc__ or "",
             response_model=output_model,
             temperature=temperature or self.temperature,
-            think=self.think,
+            think=think if think is not None else self.think,
             add_json_schema_to_instruction=self.add_json_schema,
-            cache=cache,
+            cache=self.cache or cache,
         )
 
         return (
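
The last two hunks change how per-call arguments interact with task-level defaults: an explicit `think` argument overrides the class attribute only when it is not `None`, while caching is enabled if either the class-level `cache` attribute or the call-site flag requests it. A small illustrative sketch of that precedence, using hypothetical helper functions rather than `AsyncLLMTask` itself:

```python
# Precedence rules for the think/cache overrides, as standalone helpers.
from typing import Optional


def resolve_think(call_think: Optional[bool], default_think: bool) -> bool:
    # An explicit per-call value wins; None falls back to the task default.
    return call_think if call_think is not None else default_think


def resolve_cache(task_cache: bool, call_cache: bool) -> bool:
    # Caching is on if either the task default or the per-call flag enables it.
    return task_cache or call_cache


assert resolve_think(None, default_think=True) is True
assert resolve_think(False, default_think=True) is False
assert resolve_cache(task_cache=False, call_cache=True) is True
```
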