speedy-utils 1.0.21__tar.gz → 1.0.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/PKG-INFO +1 -1
  2. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/pyproject.toml +1 -1
  3. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/lm/async_lm.py +74 -13
  4. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/lm/sync_lm.py +83 -6
  5. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/scripts/vllm_load_balancer.py +81 -7
  6. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/scripts/vllm_serve.py +69 -19
  7. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/__init__.py +100 -47
  8. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/all.py +43 -0
  9. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/multi_worker/thread.py +78 -1
  10. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/README.md +0 -0
  11. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/__init__.py +0 -0
  12. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/chat_format/__init__.py +0 -0
  13. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/chat_format/display.py +0 -0
  14. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/chat_format/transform.py +0 -0
  15. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/chat_format/utils.py +0 -0
  16. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/group_messages.py +0 -0
  17. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/lm/__init__.py +0 -0
  18. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/lm/chat_html.py +0 -0
  19. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/lm/lm_json.py +0 -0
  20. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/lm/utils.py +0 -0
  21. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/llm_utils/scripts/README.md +0 -0
  22. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/__init__.py +0 -0
  23. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/clock.py +0 -0
  24. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/function_decorator.py +0 -0
  25. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/logger.py +0 -0
  26. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/notebook_utils.py +0 -0
  27. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/report_manager.py +0 -0
  28. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/utils_cache.py +0 -0
  29. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/utils_io.py +0 -0
  30. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/utils_misc.py +0 -0
  31. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/common/utils_print.py +0 -0
  32. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/multi_worker/__init__.py +0 -0
  33. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/multi_worker/process.py +0 -0
  34. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/scripts/__init__.py +0 -0
  35. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/scripts/mpython.py +0 -0
  36. {speedy_utils-1.0.21 → speedy_utils-1.0.23}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: speedy-utils
- Version: 1.0.21
+ Version: 1.0.23
  Summary: Fast and easy-to-use package for data science
  Author: AnhVTH
  Author-email: anhvth.226@gmail.com
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "speedy-utils"
- version = "1.0.21"
+ version = "1.0.23"
  description = "Fast and easy-to-use package for data science"
  authors = ["AnhVTH <anhvth.226@gmail.com>"]
  readme = "README.md"
@@ -1,16 +1,76 @@
- """An **asynchronous** drop‑in replacement for the original `LM` class.
-
- Usage example (Python ≥3.8):
-
-     from async_lm import AsyncLM
-     import asyncio
-
-     async def main():
-         lm = AsyncLM(model="gpt-4o-mini")
-         reply: str = await lm(prompt="Hello, world!")
-         print(reply)
-
-     asyncio.run(main())
+ """
+ # ============================================================================= #
+ # ASYNCHRONOUS LANGUAGE MODEL WRAPPER WITH CONCURRENT EXECUTION SUPPORT
+ # ============================================================================= #
+ #
+ # Title & Intent:
+ # High-performance asynchronous language model interface for concurrent LLM operations
+ #
+ # High-level Summary:
+ # This module provides an async drop-in replacement for the synchronous LM class, designed
+ # for high-throughput applications requiring concurrent language model operations. It maintains
+ # full API compatibility while adding async/await semantics, connection pooling, and efficient
+ # resource management. The AsyncLM class supports batch processing, concurrent request handling,
+ # and maintains the same caching and type safety guarantees as the synchronous version.
+ #
+ # Public API / Data Contracts:
+ # • AsyncLM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Async wrapper class
+ # • async AsyncLM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
+ # • async AsyncLM.list_models(port=None) -> List[str] - Enumerate available models
+ # • async AsyncLM.count_tokens(messages, model=None) -> int - Token counting utility
+ # • async AsyncLM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
+ # • AsyncLM.set_model(model_name) -> None - Runtime model switching (sync method)
+ # • async AsyncLM.batch_call(requests) -> List[Union[str, BaseModel]] - Concurrent batch processing
+ # • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
+ # • Messages = List[ChatCompletionMessageParam] - Typed message format
+ #
+ # Invariants / Constraints:
+ # • MUST be used within async context (asyncio event loop required)
+ # • MUST provide either 'prompt' or 'messages' parameter, but not both
+ # • MUST properly await all async method calls
+ # • Connection pooling MUST handle concurrent requests efficiently
+ # • MUST maintain thread safety across concurrent operations
+ # • Rate limit handling MUST use async backoff without blocking event loop
+ # • MUST preserve all synchronous LM class behaviors and constraints
+ # • Resource cleanup MUST occur on context manager exit or explicit close
+ #
+ # Usage Example:
+ # ```python
+ # import asyncio
+ # from llm_utils.lm.async_lm import AsyncLM
+ # from pydantic import BaseModel
+ #
+ # class SummaryResponse(BaseModel):
+ #     summary: str
+ #     key_points: List[str]
+ #     confidence: float
+ #
+ # async def main():
+ #     # Single async call
+ #     lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
+ #     response = await lm(prompt="Summarize quantum computing")
+ #     print(response)
+ #
+ #     # Concurrent batch processing
+ #     texts = ["Text 1 to summarize", "Text 2 to summarize", "Text 3 to summarize"]
+ #     tasks = [lm(prompt=f"Summarize: {text}", response_format=SummaryResponse) for text in texts]
+ #     summaries = await asyncio.gather(*tasks)
+ #
+ #     for summary in summaries:
+ #         print(f"Summary: {summary.summary}")
+ #         print(f"Key points: {summary.key_points}")
+ #
+ # asyncio.run(main())
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add async context manager support for automatic resource cleanup
+ # • Implement connection pool size optimization based on usage patterns
+ # • Add async streaming response support with async generators
+ # • Optimize memory usage for large-scale concurrent operations
+ # • Add async rate limiting with priority queuing
+ #
+ # ============================================================================= #
  """

  import base64
@@ -857,3 +917,4 @@ class AsyncLLMTask(ABC):
              system_msg=system_prompt, user_msg=user_msg, assistant_msg=assistant_msg
          )
          return {"messages": messages}
+     arun = __call__  # alias for compatibility with other LLMTask implementations
@@ -1,3 +1,79 @@
+ """
+ # ============================================================================= #
+ # SYNCHRONOUS LANGUAGE MODEL WRAPPER WITH OPENAI COMPATIBILITY
+ # ============================================================================= #
+ #
+ # Title & Intent:
+ # Unified synchronous language model interface with caching, type safety, and OpenAI API compatibility
+ #
+ # High-level Summary:
+ # This module provides a comprehensive synchronous wrapper for language models that supports both
+ # string prompts and structured Pydantic model responses. It includes intelligent caching with
+ # content-based hashing, automatic retry logic for rate limits, and seamless integration with
+ # OpenAI-compatible APIs. The LM class handles message formatting, response parsing, token counting,
+ # and provides detailed logging and debugging capabilities for production use.
+ #
+ # Public API / Data Contracts:
+ # • LM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Main wrapper class
+ # • LM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
+ # • LM.list_models(port=None) -> List[str] - Enumerate available models
+ # • LM.count_tokens(messages, model=None) -> int - Token counting utility
+ # • LM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
+ # • LM.set_model(model_name) -> None - Runtime model switching
+ # • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
+ # • Messages = List[ChatCompletionMessageParam] - Typed message format
+ # • RawMsgs = Union[Messages, LegacyMsgs] - Flexible input format
+ #
+ # Invariants / Constraints:
+ # • MUST provide either 'prompt' or 'messages' parameter, but not both
+ # • MUST set model name before making API calls (auto-detection available)
+ # • response_format=str MUST return string; response_format=PydanticModel MUST return model instance
+ # • Caching MUST use content-based hashing for reproducible results
+ # • MUST handle OpenAI rate limits with exponential backoff (up to 3 retries)
+ # • MUST preserve message order and format during transformations
+ # • Token counting SHOULD use tiktoken when available, fall back to character estimation
+ # • MUST validate Pydantic responses and retry on parsing failures
+ #
+ # Usage Example:
+ # ```python
+ # from llm_utils.lm.sync_lm import LM
+ # from pydantic import BaseModel
+ #
+ # class CodeResponse(BaseModel):
+ #     language: str
+ #     code: str
+ #     explanation: str
+ #
+ # # String response
+ # lm = LM(model="gpt-4o-mini", temperature=0.1)
+ # response = lm(prompt="Write a Python hello world")
+ # print(response)  # Returns string
+ #
+ # # Structured response
+ # code_response = lm(
+ #     prompt="Write a Python function to calculate fibonacci",
+ #     response_format=CodeResponse
+ # )
+ # print(f"Language: {code_response.language}")  # Returns CodeResponse instance
+ #
+ # # Message-based conversation
+ # messages = [
+ #     {"role": "system", "content": "You are a helpful coding assistant"},
+ #     {"role": "user", "content": "Explain async/await in Python"}
+ # ]
+ # response = lm(messages=messages, max_tokens=1000)
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add streaming response support for long-form generation
+ # • Implement fine-grained token usage tracking per conversation
+ # • Add support for function calling and tool use
+ # • Optimize caching strategy for conversation contexts
+ # • Add async context manager support for resource cleanup
+ #
+ # ============================================================================= #
+ """
+
  from __future__ import annotations

  import base64
@@ -551,9 +627,9 @@ class LM:

          assert isinstance(messages, list), "Messages must be a list."
          assert len(messages) > 0, "Messages cannot be empty."
-         assert (
-             messages[0]["role"] == "system"
-         ), "First message must be a system message with instruction."
+         assert messages[0]["role"] == "system", (
+             "First message must be a system message with instruction."
+         )
          messages[0]["content"] += post_fix  # type: ignore

          model_kwargs = {}
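
For reference, a minimal `messages` payload that satisfies the invariant this assert enforces (the content strings are illustrative only, not from the package):

```python
# Hypothetical payload; only the structure matters: a non-empty list
# whose first entry is the system instruction.
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize the plot of Hamlet."},
]
assert isinstance(messages, list) and len(messages) > 0
assert messages[0]["role"] == "system"
```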
@@ -674,14 +750,13 @@ class LM:

  @lru_cache(maxsize=10)
  def get_tokenizer(model_name: str) -> Any:
-     from transformers import AutoTokenizer # type: ignore
+     from transformers import AutoTokenizer  # type: ignore

      tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
      return tokenizer


  def inspect_word_probs(lm, tokenizer, messages):
-
      import numpy as np

      def compute_word_log_probs(
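
The `@lru_cache(maxsize=10)` decorator on `get_tokenizer` means each tokenizer is constructed at most once per model name; repeat calls return the cached object. A self-contained sketch of the same pattern, with a stub loader standing in for `AutoTokenizer.from_pretrained`:

```python
from functools import lru_cache

@lru_cache(maxsize=10)
def get_resource(name: str) -> tuple:
    print(f"loading {name}")  # runs only on the first call per name
    return ("resource", name)

first = get_resource("my-model")
second = get_resource("my-model")  # cache hit: nothing printed, same object back
assert first is second
```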
@@ -819,7 +894,7 @@ class LLMTask(ABC):

      temperature = 0.6
      think=False
-
+
      demo_task = DemoTask()
      demo_task({'text_to_translate': 'Translate from english to vietnamese: Hello how are you'})
      ```
@@ -864,3 +939,5 @@ class LLMTask(ABC):
          return get_conversation_one_turn(
              system_msg=system_prompt, user_msg=user_msg, assistant_msg=assistant_msg
          )
+
+     run = __call__  # alias for compatibility with other LLMTask implementations
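
Both new aliases (`arun` in async_lm.py above, `run` here) simply bind the class's `__call__` under a second name, so `task.run(x)` and `task(x)` invoke the same method. A standalone sketch of the pattern, using a hypothetical `EchoTask` in place of a real `LLMTask` subclass:

```python
class EchoTask:
    """Hypothetical stand-in for an LLMTask subclass."""

    def __call__(self, inputs: dict) -> str:
        # A real task would render messages and call the language model here.
        return f"echo: {inputs['text']}"

    run = __call__  # same alias style as the diff: task.run(...) == task(...)


task = EchoTask()
assert task.run({"text": "hi"}) == task({"text": "hi"})
```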
@@ -1,11 +1,85 @@
+ """
+ # ============================================================================= #
+ # VLLM LOAD BALANCER WITH HEALTH MONITORING AND DYNAMIC ROUTING
+ # ============================================================================= #
+ #
+ # Title & Intent:
+ # Production-ready TCP load balancer for vLLM model servers with health checks and connection management
+ #
+ # High-level Summary:
+ # This module implements a high-performance load balancer specifically designed for vLLM model
+ # serving infrastructure. It provides intelligent routing across multiple vLLM server instances,
+ # continuous health monitoring, automatic failover, and connection pooling. The load balancer
+ # uses async TCP proxying to handle concurrent requests efficiently while maintaining session
+ # affinity and providing detailed metrics for monitoring and debugging.
+ #
+ # Public API / Data Contracts:
+ # • LOAD_BALANCER_HOST = "0.0.0.0" - Load balancer bind address
+ # • LOAD_BALANCER_PORT = 8008 - Load balancer listening port
+ # • SCAN_TARGET_HOST = "localhost" - Target server host for health checks
+ # • SCAN_PORT_START = 8140, SCAN_PORT_END = 8170 - Port range for server discovery
+ # • start_load_balancer() -> None - Main entry point to start the service
+ # • scan_for_healthy_servers() -> None - Background health monitoring task
+ # • handle_client(reader, writer) -> None - Client connection handler
+ # • relay_data(reader, writer, direction) -> None - Bidirectional data relay
+ # • get_next_server() -> Optional[Tuple[str, int]] - Round-robin server selection
+ #
+ # Invariants / Constraints:
+ # • MUST continuously monitor server health every SCAN_INTERVAL seconds
+ # • MUST handle connection failures gracefully with automatic failover
+ # • Health checks MUST complete within HEALTH_CHECK_TIMEOUT seconds
+ # • MUST maintain connection counts for load balancing decisions
+ # • Server availability MUST be updated atomically using async locks
+ # • TCP connections MUST be properly closed on errors or completion
+ # • MUST log all connection events and health status changes
+ # • Round-robin selection MUST distribute load evenly across healthy servers
+ #
+ # Usage Example:
+ # ```python
+ # # Start the load balancer (blocking operation)
+ # import asyncio
+ # from llm_utils.scripts.vllm_load_balancer import start_load_balancer
+ #
+ # # Configure environment or modify constants as needed
+ # LOAD_BALANCER_HOST = "0.0.0.0"
+ # LOAD_BALANCER_PORT = 8008
+ # SCAN_TARGET_HOST = "localhost"
+ # SCAN_PORT_START = 8140
+ # SCAN_PORT_END = 8150
+ #
+ # # Start the load balancer service
+ # asyncio.run(start_load_balancer())
+ #
+ # # The service will:
+ # # 1. Scan for healthy vLLM servers on ports 8140-8150
+ # # 2. Accept client connections on port 8008
+ # # 3. Route requests to healthy backend servers
+ # # 4. Monitor server health continuously
+ # # 5. Provide connection statistics
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add weighted round-robin based on server capacity metrics
+ # • Implement session affinity for stateful model interactions
+ # • Add HTTP health check endpoints for better monitoring integration
+ # • Support dynamic server registration and deregistration
+ # • Add metrics export for Prometheus/Grafana monitoring
+ # • Implement graceful shutdown with connection draining
+ #
+ # ============================================================================= #
+ """
+
  import asyncio
+ import contextlib
  import random
  from collections import defaultdict
- from tabulate import tabulate
- import contextlib
+
  import aiohttp  # <-- Import aiohttp
- from speedy_utils import setup_logger
  from loguru import logger
+ from tabulate import tabulate
+
+ from speedy_utils import setup_logger
+
  setup_logger(min_interval=5)
  # --- Configuration ---
  LOAD_BALANCER_HOST = "0.0.0.0"
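
The docstring names `relay_data` as the bidirectional data relay, but its body is not shown in this diff. A minimal sketch of one direction of such a relay, under the assumption that it pipes bytes between asyncio streams until EOF (the real implementation and its `direction` argument may differ):

```python
import asyncio

async def relay(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
    """Copy bytes from reader to writer until EOF, then close the writer."""
    try:
        while True:
            chunk = await reader.read(4096)
            if not chunk:  # empty read means the peer closed its end
                break
            writer.write(chunk)
            await writer.drain()  # respect backpressure from the slower side
    finally:
        writer.close()
        await writer.wait_closed()
```

A full TCP proxy would run two of these concurrently (client to backend and backend to client), typically with `asyncio.gather`.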
@@ -180,7 +254,9 @@ async def scan_and_update_servers():
              if server not in connection_counts:
                  connection_counts[server] = 0

-         logger.debug(f"[{LOAD_BALANCER_PORT=}]Scan complete. Active servers: {available_servers}")
+         logger.debug(
+             f"[{LOAD_BALANCER_PORT=}]Scan complete. Active servers: {available_servers}"
+         )

      except asyncio.CancelledError:
          logger.info("Server scan task cancelled.")
@@ -219,9 +295,7 @@ async def handle_client(client_reader, client_writer):

      min_connections = float("inf")
      least_used_available_servers = []
-     for (
-         server
-     ) in (
+     for server in (
          available_servers
      ):  # Iterate only over servers that passed health check
          count = connection_counts.get(server, 0)
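
The loop being reformatted here implements least-connections selection: it collects every available server tied for the lowest connection count. A standalone sketch of the complete selection step, with a random tie-break (the tie-break used by the real `handle_client` is not shown in this hunk):

```python
import random

def pick_least_used(available_servers, connection_counts):
    """Return one of the servers with the fewest active connections."""
    min_connections = float("inf")
    least_used = []
    for server in available_servers:
        count = connection_counts.get(server, 0)
        if count < min_connections:
            min_connections = count
            least_used = [server]
        elif count == min_connections:
            least_used.append(server)
    # Random tie-break spreads traffic across equally loaded servers.
    return random.choice(least_used) if least_used else None

# Two idle servers tie at 0 connections; the busy one is never picked.
servers = [("localhost", 8140), ("localhost", 8141), ("localhost", 8142)]
counts = {("localhost", 8142): 5}
assert pick_least_used(servers, counts) in servers[:2]
```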
@@ -1,29 +1,79 @@
- """ "
- USAGE:
- Serve models and LoRAs with vLLM:
-
- Serve a LoRA model:
-     svllm serve --lora LORA_NAME LORA_PATH --gpus GPU_GROUPS
-
- Serve a base model:
-     svllm serve --model MODEL_NAME --gpus GPU_GROUPS
-
- Add a LoRA to a served model:
-     svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port
-     (if add then the port must be specify)
+ """
+ # ============================================================================= #
+ # VLLM MODEL SERVING AND LORA MANAGEMENT UTILITIES
+ # ============================================================================= #
+ #
+ # Title & Intent:
+ # Command-line interface for serving language models and managing LoRA adapters with vLLM
+ #
+ # High-level Summary:
+ # This module provides a comprehensive CLI tool for deploying and managing vLLM model servers
+ # with support for base models, LoRA adapters, and dynamic adapter loading. It handles GPU
+ # allocation, process management, model discovery, and provides utilities for adding/removing
+ # LoRA adapters to running servers. The tool simplifies the deployment of production-ready
+ # language model serving infrastructure with fine-tuned model support.
+ #
+ # Public API / Data Contracts:
+ # • serve_model(model_name, gpus, **kwargs) -> subprocess.Popen - Start vLLM server for base model
+ # • serve_lora(lora_name_or_path, gpus, **kwargs) -> subprocess.Popen - Start vLLM server with LoRA
+ # • add_lora(lora_name_or_path, host_port, **kwargs) -> dict - Add LoRA to running server
+ # • list_loras(host_port, api_key="abc") -> None - List available LoRA adapters
+ # • model_list(host_port, api_key="abc") -> None - List available models
+ # • remove_lora(lora_name, host_port, api_key="abc") -> dict - Remove LoRA adapter
+ # • get_lora_path(lora_name_or_path) -> str - Resolve LoRA adapter path
+ # • LORA_DIR: str - Environment-configurable LoRA storage directory
+ # • HF_HOME: str - Hugging Face cache directory
+ #
+ # Invariants / Constraints:
+ # • GPU groups MUST be specified as comma-separated integers (e.g., "0,1,2,3")
+ # • LoRA paths MUST exist and contain valid adapter files
+ # • Server endpoints MUST be reachable for dynamic LoRA operations
+ # • MUST validate model and LoRA compatibility before serving
+ # • Process management MUST handle graceful shutdown on interruption
+ # • MUST respect CUDA device visibility and memory constraints
+ # • LoRA operations MUST verify server API compatibility
+ # • MUST log all serving operations and adapter changes
+ #
+ # Usage Example:
+ # ```bash
+ # # Serve a base model on GPUs 0,1
+ # svllm serve --model meta-llama/Llama-2-7b-hf --gpus 0,1
+ #
+ # # Serve a model with LoRA adapter
+ # svllm serve --lora my-adapter /path/to/adapter --gpus 0,1,2,3
+ #
+ # # Add LoRA to running server
+ # svllm add-lora --lora new-adapter /path/to/new-adapter --host_port localhost:8000
+ #
+ # # List available models
+ # svllm list-models --host_port localhost:8000
+ #
+ # # Remove LoRA adapter
+ # svllm remove-lora --lora adapter-name --host_port localhost:8000
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add support for multi-node distributed serving
+ # • Implement automatic model quantization options
+ # • Add configuration validation before server startup
+ # • Support for custom tokenizer and chat templates
+ # • Add health check endpoints for load balancer integration
+ # • Implement rolling updates for zero-downtime deployments
+ #
+ # ============================================================================= #
  """

+ import argparse
  import os
  import subprocess
  from typing import List, Optional
- import argparse
- import requests
+
  import openai
+ import requests
  from loguru import logger

  from speedy_utils.common.utils_io import load_by_ext

-
  LORA_DIR: str = os.environ.get("LORA_DIR", "/loras")
  LORA_DIR = os.path.abspath(LORA_DIR)
  HF_HOME: str = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
@@ -181,9 +231,9 @@ def get_vllm() -> str:
      vllm_binary = subprocess.check_output("which vllm", shell=True, text=True).strip()
      vllm_binary = os.getenv("VLLM_BINARY", vllm_binary)
      logger.info(f"vLLM binary: {vllm_binary}")
-     assert os.path.exists(
-         vllm_binary
-     ), f"vLLM binary not found at {vllm_binary}, please set VLLM_BINARY env variable"
+     assert os.path.exists(vllm_binary), (
+         f"vLLM binary not found at {vllm_binary}, please set VLLM_BINARY env variable"
+     )
      return vllm_binary


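The resolution order in `get_vllm()` is: `which vllm` on PATH, then an optional `VLLM_BINARY` environment override, then an existence check. A generic sketch of the same order, deliberately substituting `shutil.which` for the shell call (the module itself shells out, as shown above):

```python
import os
import shutil

def resolve_binary(name: str, env_var: str) -> str:
    # Environment override wins; otherwise fall back to the PATH lookup.
    path = os.getenv(env_var, shutil.which(name) or "")
    assert os.path.exists(path), (
        f"{name} binary not found at {path!r}, please set {env_var}"
    )
    return path

# e.g. resolve_binary("vllm", "VLLM_BINARY")
```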
@@ -1,5 +1,97 @@
+ # ----------------------------------------------------------------------------
+ # speedy_utils/__init__.py
+ #
+ # Main entry point and public API for the Speedy Utils library
+ #
+ # This module exports the primary utilities for enhanced Python development
+ # productivity including caching mechanisms, parallel processing, file I/O,
+ # timing utilities, and data manipulation functions. It provides a convenient
+ # single-import interface for the most commonly used functionality.
+ #
+ # Public API / Data Contracts:
+ # • setup_logger(min_interval: int = 5) -> None - Configure logging system
+ # • log(*args, **kwargs) -> None - Rate-limited logging function
+ # • Clock() - Timing and performance measurement utility
+ # • speedy_timer: Clock - Pre-configured global timer instance
+ # • timef(func) -> Callable - Function execution time decorator
+ # • retry_runtime(sleep_seconds: int, max_retry: int, exceptions) -> Callable
+ # • memoize(func) -> Callable - Function result caching decorator
+ # • identify(obj: Any) -> str - Generate unique object identifier
+ # • identify_uuid(obj: Any) -> str - Generate UUID-based object identifier
+ # • load_by_ext(fname: str | list[str]) -> Any - Auto-detect file format loader
+ # • dump_json_or_pickle(obj: Any, fname: str) -> None - Smart file serializer
+ # • load_json_or_pickle(fname: str) -> Any - Smart file deserializer
+ # • multi_thread(func, items, **kwargs) -> list - Parallel thread execution
+ # • multi_process(func, items, **kwargs) -> list - Parallel process execution
+ #
+ # Invariants / Constraints:
+ # • MUST import only stable, tested utilities into public namespace
+ # • SHOULD maintain backward compatibility across minor versions
+ # • MUST provide consistent error handling across all public functions
+ # • SHOULD use lazy imports for heavy dependencies when possible
+ #
+ # Usage Example:
+ # ```python
+ # from speedy_utils import Clock, memoize, multi_thread, load_by_ext
+ #
+ # @memoize
+ # def expensive_computation(x):
+ #     return x ** 2
+ #
+ # timer = Clock()
+ # timer.start()
+ # results = multi_thread(expensive_computation, range(100))
+ # timer.end()
+ # data = load_by_ext("config.json")
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add async variants for I/O operations
+ # • Implement distributed caching backend
+ # • Add GPU acceleration utilities
+ # ----------------------------------------------------------------------------
+
  # Import specific functions and classes from modules
  # Logger
+ # Standard library imports
+ import copy
+ import functools
+ import gc
+ import inspect
+ import json
+ import multiprocessing
+ import os
+ import os.path as osp
+ import pickle
+ import pprint
+ import random
+ import re
+ import sys
+ import textwrap
+ import threading
+ import time
+ import traceback
+ import uuid
+ from collections import Counter, defaultdict
+ from collections.abc import Callable
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from glob import glob
+ from multiprocessing import Pool
+ from pathlib import Path
+ from threading import Lock
+ from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
+
+ # Third-party imports
+ import numpy as np
+ import pandas as pd
+ import xxhash
+ from IPython.core.getipython import get_ipython
+ from IPython.display import HTML, display
+ from loguru import logger
+ from pydantic import BaseModel
+ from tabulate import tabulate
+ from tqdm import tqdm
+
  from speedy_utils.common.logger import log, setup_logger

  # Clock module
@@ -8,6 +100,13 @@ from .common.clock import Clock, speedy_timer, timef
  # Function decorators
  from .common.function_decorator import retry_runtime

+ # notebook
+ from .common.notebook_utils import (
+     change_dir,
+     display_pretty_table_html,
+     print_table,
+ )
+
  # Cache utilities
  from .common.utils_cache import identify, identify_uuid, memoize

@@ -36,57 +135,11 @@ from .common.utils_print import (
      flatten_dict,
      fprint,
  )
- from .common.notebook_utils import (
-     display_pretty_table_html,
-     print_table,
- )

  # Multi-worker processing
  from .multi_worker.process import multi_process
  from .multi_worker.thread import multi_thread

- # notebook
- from .common.notebook_utils import change_dir
-
- # Standard library imports
- import copy
- import functools
- import gc
- import inspect
- import json
- import multiprocessing
- import os
- import os.path as osp
- import pickle
- import pprint
- import random
- import re
- import sys
- import textwrap
- import threading
- import time
- import traceback
- import uuid
- from collections import Counter, defaultdict
- from collections.abc import Callable
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from glob import glob
- from multiprocessing import Pool
- from pathlib import Path
- from threading import Lock
- from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
-
- # Third-party imports
- import numpy as np
- import pandas as pd
- import xxhash
- from IPython.core.getipython import get_ipython
- from IPython.display import HTML, display
- from loguru import logger
- from pydantic import BaseModel
- from tabulate import tabulate
- from tqdm import tqdm
-
  # Define __all__ explicitly
  __all__ = [
      # Standard library
@@ -173,4 +226,4 @@ __all__ = [
      "multi_thread",
      # Notebook utilities
      "change_dir",
- ]
+ ]
@@ -1,3 +1,46 @@
+ # ----------------------------------------------------------------------------
+ # speedy_utils/all.py
+ #
+ # Consolidated import collection for comprehensive library access
+ #
+ # This module provides a unified collection of standard library, third-party,
+ # and internal imports commonly used across data science and development
+ # workflows. It serves as a convenience module for interactive environments
+ # and rapid prototyping by reducing boilerplate import statements.
+ #
+ # Public API / Data Contracts:
+ # • All standard library modules: collections, concurrent.futures, pathlib, etc.
+ # • Third-party dependencies: loguru.logger, pydantic.BaseModel, tqdm, tabulate
+ # • Core utilities: Counter, defaultdict, ThreadPoolExecutor, as_completed
+ # • Development tools: IPython.display.HTML, get_ipython for notebook detection
+ # • Type system: Any, Dict, List, Optional, Union, TypeVar, Generic, Literal
+ #
+ # Invariants / Constraints:
+ # • MUST only import stable, widely-used packages
+ # • SHOULD handle import failures gracefully for optional dependencies
+ # • MUST maintain consistent import aliases across the library
+ # • SHOULD group imports by category (stdlib, third-party, internal)
+ #
+ # Usage Example:
+ # ```python
+ # from speedy_utils.all import *
+ #
+ # # Now have access to common utilities without individual imports
+ # data = defaultdict(list)
+ # results = []
+ # for item in tqdm(items):
+ #     results.append(process(item))
+ #
+ # df = tabulate(results, headers=['Item', 'Result'])
+ # display(HTML(df))
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add conditional imports for ML libraries (torch, numpy, pandas)
+ # • Implement import health checking
+ # • Add version compatibility warnings
+ # ----------------------------------------------------------------------------
+
  # speedy_utils/all.py

  # Provide a consolidated set of imports for convenience
@@ -1,4 +1,81 @@
- """Provides thread-based parallel execution utilities."""
+ """
+ # ============================================================================= #
+ # THREAD-BASED PARALLEL EXECUTION WITH PROGRESS TRACKING AND ERROR HANDLING
+ # ============================================================================= #
+ #
+ # Title & Intent:
+ # High-performance thread pool utilities for parallel processing with comprehensive error handling
+ #
+ # High-level Summary:
+ # This module provides robust thread-based parallel execution utilities designed for CPU-bound
+ # and I/O-bound tasks requiring concurrent processing. It features intelligent worker management,
+ # comprehensive error handling with detailed tracebacks, progress tracking with tqdm integration,
+ # and flexible batching strategies. The module optimizes for both throughput and reliability,
+ # making it suitable for data processing pipelines, batch operations, and concurrent API calls.
+ #
+ # Public API / Data Contracts:
+ # • multi_thread(func, inputs, num_workers=None, progress=True, **kwargs) -> List[Any] - Main parallel execution
+ # • multi_thread_batch(func, inputs, batch_size=10, num_workers=None, **kwargs) -> List[Any] - Batched processing
+ # • DEFAULT_WORKERS = (cpu_count * 2) - Default worker thread count
+ # • T = TypeVar("T"), R = TypeVar("R") - Generic type variables for input/output typing
+ # • _group_iter(src, size) -> Iterable[List[T]] - Utility for chunking iterables
+ # • _worker(item, func, fixed_kwargs) -> R - Individual worker function wrapper
+ # • _short_tb() -> str - Shortened traceback formatter for cleaner error logs
+ #
+ # Invariants / Constraints:
+ # • Worker count MUST be positive integer, defaults to (CPU cores * 2)
+ # • Input iterables MUST be finite and non-empty for meaningful processing
+ # • Functions MUST be thread-safe when used with multiple workers
+ # • Error handling MUST capture and log detailed tracebacks for debugging
+ # • Progress tracking MUST be optional and gracefully handle tqdm unavailability
+ # • Batch processing MUST maintain input order in results
+ # • MUST handle keyboard interruption gracefully with resource cleanup
+ # • Thread pool MUST be properly closed and joined after completion
+ #
+ # Usage Example:
+ # ```python
+ # from speedy_utils.multi_worker.thread import multi_thread, multi_thread_batch
+ # import requests
+ #
+ # # Simple parallel processing
+ # def square(x):
+ #     return x ** 2
+ #
+ # numbers = list(range(100))
+ # results = multi_thread(square, numbers, num_workers=8)
+ # print(f"Processed {len(results)} items")
+ #
+ # # Parallel API calls with error handling
+ # def fetch_url(url):
+ #     response = requests.get(url, timeout=10)
+ #     return response.status_code, len(response.content)
+ #
+ # urls = ["http://example.com", "http://google.com", "http://github.com"]
+ # results = multi_thread(fetch_url, urls, num_workers=3, progress=True)
+ #
+ # # Batched processing for memory efficiency
+ # def process_batch(items):
+ #     return [item.upper() for item in items]
+ #
+ # large_dataset = ["item" + str(i) for i in range(10000)]
+ # batched_results = multi_thread_batch(
+ #     process_batch,
+ #     large_dataset,
+ #     batch_size=100,
+ #     num_workers=4
+ # )
+ # ```
+ #
+ # TODO & Future Work:
+ # • Add adaptive worker count based on task characteristics
+ # • Implement priority queuing for time-sensitive tasks
+ # • Add memory usage monitoring and automatic batch size adjustment
+ # • Support for async function execution within thread pool
+ # • Add detailed performance metrics and timing analysis
+ # • Implement graceful degradation for resource-constrained environments
+ #
+ # ============================================================================= #
+ """

  import os
  import time
File without changes