speedy-utils 1.1.39__py3-none-any.whl → 1.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_utils/__init__.py CHANGED
@@ -12,6 +12,7 @@ from llm_utils.lm import (
  from llm_utils.lm.base_prompt_builder import BasePromptBuilder
  from llm_utils.lm.lm_base import get_model_name
  from llm_utils.lm.openai_memoize import MOpenAI
+ from llm_utils.llm_ray import LLMRay
  from llm_utils.vector_cache import VectorCache


@@ -57,6 +58,7 @@ __all__ = [
      "AsyncLM",
      "AsyncLLMTask",
      "LLM",
+     "LLMRay",
      "MOpenAI",
      "get_model_name",
      "VectorCache",
llm_utils/llm_ray.py ADDED
@@ -0,0 +1,370 @@
+ """
+ LLMRay: Simplified Ray-based vLLM wrapper for offline batch inference.
+
+ Automatically handles data parallelism across available GPUs in Ray cluster.
+ Pipeline parallel is always 1 (no layer splitting).
+
+ Example:
+     # dp=4, tp=2 means 8 GPUs total, 4 model replicas each using 2 GPUs
+     llm = LLMRay(model_name='Qwen/Qwen3-0.6B', dp=4, tp=2)
+
+     # dp=8, tp=2 means 16 GPUs across nodes, 8 model replicas
+     llm = LLMRay(model_name='meta-llama/Llama-3-70B', dp=8, tp=2)
+ """
+ import os
+ import datetime
+ import ray
+ import numpy as np
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+ from tqdm.auto import tqdm
+
+ # Type alias for OpenAI-style messages
+ Message = Dict[str, str]  # {'role': str, 'content': str}
+ Messages = List[Message]
+
+
+ @ray.remote
+ class _ProgressTracker:
+     """Ray actor for tracking global progress across workers."""
+
+     def __init__(self, total_items: int):
+         self.total_items = total_items
+         self.processed_count = 0
+         import time
+         self.start_time = time.time()
+
+     def increment(self) -> None:
+         self.processed_count += 1
+
+     def get_stats(self) -> tuple:
+         import time
+         elapsed = time.time() - self.start_time
+         speed = self.processed_count / elapsed if elapsed > 0 else 0
+         return self.processed_count, self.total_items, speed, elapsed
+
+
+ class _VLLMWorkerBase(ABC):
+     """Base worker class for vLLM inference."""
+
+     def __init__(
+         self,
+         worker_id: int,
+         log_dir: Optional[str],
+         tracker: Any,
+         **kwargs: Any,
+     ):
+         self.worker_id = worker_id
+         self.log_dir = log_dir
+         self.tracker = tracker
+         self.kwargs = kwargs
+         self._log_file_handle = None
+         self._last_print_time = 0
+
+     @abstractmethod
+     def setup(self) -> None:
+         pass
+
+     @abstractmethod
+     def process_one_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
+         pass
+
+     def _redirect_output(self) -> None:
+         """Workers > 0 write to disk. Worker 0 writes to stdout."""
+         import sys
+         if self.worker_id == 0 or self.log_dir is None:
+             return
+         log_path = os.path.join(self.log_dir, f'worker_{self.worker_id}.log')
+         self._log_file_handle = open(log_path, 'w', buffering=1)
+         sys.stdout = self._log_file_handle
+         sys.stderr = self._log_file_handle
+
+     def _print_global_stats(self) -> None:
+         """Only used by Worker 0 to print global stats."""
+         import time
+         import datetime as dt
+         if self.tracker is None:
+             return
+         if time.time() - self._last_print_time < 5:
+             return
+         count, total, speed, elapsed = ray.get(self.tracker.get_stats.remote())
+         if speed > 0:
+             eta = (total - count) / speed
+             eta_str = str(dt.timedelta(seconds=int(eta)))
+         else:
+             eta_str = '?'
+         msg = (
+             f'[Global] {count}/{total} | {count/total:.1%} | '
+             f'Speed: {speed:.2f} it/s | ETA: {eta_str}'
+         )
+         print(msg)
+         self._last_print_time = time.time()
+
+     def _run_shard(self, shard: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         self._redirect_output()
+         try:
+             self.setup()
+             results = []
+             iterator = shard
+             if self.worker_id > 0:
+                 iterator = tqdm(shard, desc=f'Worker {self.worker_id}')
+             for item in iterator:
+                 try:
+                     res = self.process_one_item(item)
+                     results.append(res)
+                 except Exception as e:
+                     print(f'Error {item}: {e}')
+                     results.append(None)
+                 if self.tracker:
+                     self.tracker.increment.remote()
+                 if self.worker_id == 0:
+                     self._print_global_stats()
+             return results
+         finally:
+             if self._log_file_handle:
+                 self._log_file_handle.close()
+
+
+ class _VLLMWorker(_VLLMWorkerBase):
+     """Worker that runs vLLM inference on assigned GPUs."""
+
+     def setup(self) -> None:
+         """Initialize vLLM engine with configured parameters."""
+         from vllm import LLM
+
+         model_name = self.kwargs['model_name']
+         tp = self.kwargs.get('tp', 1)
+         gpu_memory_utilization = self.kwargs.get(
+             'gpu_memory_utilization', 0.9
+         )
+         trust_remote_code = self.kwargs.get('trust_remote_code', True)
+         vllm_kwargs = self.kwargs.get('vllm_kwargs', {})
+
+         print(
+             f'Worker {self.worker_id}: Loading vLLM model {model_name} '
+             f'with TP={tp}...'
+         )
+
+         self.model = LLM(
+             model=model_name,
+             tensor_parallel_size=tp,
+             pipeline_parallel_size=1,  # Always 1 as per requirement
+             gpu_memory_utilization=gpu_memory_utilization,
+             trust_remote_code=trust_remote_code,
+             enforce_eager=True,
+             **vllm_kwargs,
+         )
+
+         # Store default sampling params
+         self.default_sampling_params = self.kwargs.get(
+             'sampling_params', {}
+         )
+
+     def process_one_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
+         """Process a single input item with OpenAI-style messages."""
+         from vllm import SamplingParams
+
+         messages = item.get('messages')
+         if not messages:
+             raise ValueError('Item must contain "messages" key')
+
+         # Validate messages format
+         for msg in messages:
+             if not isinstance(msg, dict):
+                 raise ValueError(
+                     f'Each message must be dict, got {type(msg)}'
+                 )
+             if 'role' not in msg or 'content' not in msg:
+                 raise ValueError(
+                     'Each message must have "role" and "content"'
+                 )
+
+         # Build sampling params (item-specific overrides default)
+         sampling_config = {
+             **self.default_sampling_params,
+             **item.get('sampling_params', {}),
+         }
+         sampling_params = SamplingParams(**sampling_config)
+
+         # Use vLLM chat interface
+         outputs = self.model.chat(
+             messages=[messages],
+             sampling_params=sampling_params,
+         )
+         generated_text = outputs[0].outputs[0].text
+
+         # Build result
+         result = {
+             'messages': messages,
+             'generated_text': generated_text,
+             'worker_id': self.worker_id,
+             'finish_reason': outputs[0].outputs[0].finish_reason,
+         }
+
+         # Include any extra metadata from input
+         for key in item:
+             if key not in ['messages', 'sampling_params']:
+                 result[f'meta_{key}'] = item[key]
+
+         return result
+
+
+ class LLMRay:
+     """
+     Ray-based LLM wrapper for offline batch inference with OpenAI messages.
+
+     Spawns multiple model replicas (data parallel) across GPUs/nodes.
+     Each replica can use multiple GPUs (tensor parallel).
+
+     Args:
+         model_name: HuggingFace model name or path
+         dp: Data parallel - number of model replicas
+         tp: Tensor parallel - GPUs per replica
+             Total GPUs used = dp * tp
+
+     Example:
+         # 8 GPUs: 4 replicas, each using 2 GPUs
+         >>> llm = LLMRay(model_name='Qwen/Qwen3-0.6B', dp=4, tp=2)
+
+         # 16 GPUs across 2 nodes: 8 replicas, each using 2 GPUs
+         >>> llm = LLMRay(model_name='meta-llama/Llama-3-70B', dp=8, tp=2)
+
+         >>> inputs = [
+         ...     [{'role': 'user', 'content': 'What is AI?'}],
+         ...     [{'role': 'user', 'content': 'Explain quantum computing.'}],
+         ... ]
+         >>> results = llm.generate(inputs)
+     """
+
+     def __init__(
+         self,
+         model_name: str,
+         dp: int = 1,
+         tp: int = 1,
+         gpu_memory_utilization: float = 0.9,
+         trust_remote_code: bool = True,
+         sampling_params: Optional[Dict[str, Any]] = None,
+         vllm_kwargs: Optional[Dict[str, Any]] = None,
+         ray_address: Optional[str] = None,
+     ):
+         """
+         Initialize LLMRay.
+
+         Args:
+             model_name: HuggingFace model name or path
+             dp: Data parallel - number of model replicas (workers)
+             tp: Tensor parallel - number of GPUs per replica
+             gpu_memory_utilization: Fraction of GPU memory to use
+             trust_remote_code: Whether to trust remote code from HF
+             sampling_params: Default sampling parameters
+             vllm_kwargs: Additional kwargs to pass to vLLM constructor
+             ray_address: Ray cluster address ('auto' for existing cluster,
+                 None for local, or specific address like 'ray://...')
+         """
+         self.model_name = model_name
+         self.dp = dp
+         self.tp = tp
+         self.gpu_memory_utilization = gpu_memory_utilization
+         self.trust_remote_code = trust_remote_code
+         self.sampling_params = sampling_params or {
+             'temperature': 0.7,
+             'max_tokens': 512,
+         }
+         self.vllm_kwargs = vllm_kwargs or {}
+         self.ray_address = ray_address
+
+         # Setup logging
+         timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+         self.log_base = f'/tmp/raylog/llmray_{timestamp}'
+
+         # Initialize Ray
+         self._init_ray()
+
+     def _init_ray(self) -> None:
+         """Initialize Ray cluster connection."""
+         if not ray.is_initialized():
+             if self.ray_address:
+                 ray.init(address=self.ray_address, ignore_reinit_error=True)
+             else:
+                 ray.init(ignore_reinit_error=True)
+
+         resources = ray.cluster_resources()
+         total_gpus = int(resources.get('GPU', 0))
+         required_gpus = self.dp * self.tp
+
+         if total_gpus == 0:
+             raise RuntimeError('No GPUs found in Ray cluster!')
+
+         if total_gpus < required_gpus:
+             raise RuntimeError(
+                 f'Not enough GPUs: need {required_gpus} (dp={self.dp} x '
+                 f'tp={self.tp}), but cluster has {total_gpus}'
+             )
+
+         print(f'>>> Ray cluster connected. Total GPUs: {total_gpus}')
+         print(f'>>> Config: dp={self.dp}, tp={self.tp} → {required_gpus} GPUs')
+         print(f'>>> Logs: {self.log_base}')
+         os.makedirs(self.log_base, exist_ok=True)
+
+     def generate(self, inputs: List[Messages]) -> List[Dict[str, Any]]:
+         """
+         Generate responses for a batch of message lists.
+
+         Args:
+             inputs: List of message lists, where each message list is
+                 OpenAI-style: [{'role': 'user', 'content': '...'}]
+
+         Returns:
+             List of result dictionaries with generated text and metadata
+         """
+         # Normalize inputs to dict format with 'messages' key
+         normalized_inputs = []
+         for messages in inputs:
+             if not isinstance(messages, list):
+                 raise ValueError(
+                     f'Each input must be list of messages, got {type(messages)}'
+                 )
+             normalized_inputs.append({'messages': messages})
+
+         num_workers = self.dp
+         print(f'>>> Spawning {num_workers} workers for {len(inputs)} items.')
+
+         # 1. Start the Global Tracker
+         tracker = _ProgressTracker.remote(len(normalized_inputs))
+
+         # 2. Prepare Shards
+         shards = np.array_split(normalized_inputs, num_workers)
+
+         # 3. Create Remote Worker Class with tp GPUs per worker
+         RemoteWorker = ray.remote(num_gpus=self.tp)(_VLLMWorker)
+
+         actors = []
+         futures = []
+
+         for i, shard in enumerate(shards):
+             if len(shard) == 0:
+                 continue
+
+             # Initialize Actor
+             actor = RemoteWorker.remote(
+                 worker_id=i,
+                 log_dir=self.log_base,
+                 tracker=tracker,
+                 model_name=self.model_name,
+                 tp=self.tp,
+                 gpu_memory_utilization=self.gpu_memory_utilization,
+                 trust_remote_code=self.trust_remote_code,
+                 sampling_params=self.sampling_params,
+                 vllm_kwargs=self.vllm_kwargs,
+             )
+             actors.append(actor)
+
+             # Launch Task
+             futures.append(actor._run_shard.remote(shard.tolist()))
+
+         results = ray.get(futures)
+         return [item for sublist in results for item in sublist]
+
+     def __call__(self, inputs: List[Messages]) -> List[Dict[str, Any]]:
+         """Alias for generate()."""
+         return self.generate(inputs)
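
Taken together, the new module shards a list of OpenAI-style conversations across dp Ray actors, each of which loads one vLLM replica over tp GPUs and reports progress through the shared tracker. The sketch below is assembled from the constructor signature and docstring shown in this diff; the model name, dp/tp values, and sampling parameters are illustrative assumptions, not recommendations from the package.

    from llm_utils import LLMRay

    # Illustrative sketch: 2 replicas x 1 GPU each = 2 GPUs total (assumed cluster size).
    llm = LLMRay(
        model_name='Qwen/Qwen3-0.6B',   # example model from the module docstring
        dp=2,                           # number of model replicas (Ray actors)
        tp=1,                           # GPUs per replica
        sampling_params={'temperature': 0.2, 'max_tokens': 128},  # overrides the built-in defaults
    )

    inputs = [
        [{'role': 'user', 'content': 'What is AI?'}],
        [{'role': 'user', 'content': 'Explain quantum computing.'}],
    ]

    results = llm.generate(inputs)      # or simply llm(inputs)
    for r in results:
        if r is None:
            continue                    # a worker returns None for items that raised
        # Each result carries the prompt, the completion, and which replica served it.
        print(r['worker_id'], r['finish_reason'], r['generated_text'][:80])

Note that construction requires dp * tp GPUs to be visible to Ray; with no GPUs, _init_ray() raises a RuntimeError before any work is scheduled.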
@@ -15,7 +15,13 @@ from pydantic import BaseModel

  # from llm_utils.lm.async_lm.async_llm_task import OutputModelType
  from llm_utils.lm.async_lm.async_lm_base import AsyncLMBase
- from json_repair import jloads
+ import json
+
+ try:
+     from json_repair import jloads
+ except ImportError:
+     def jloads(x):
+         return json.loads(x)

  from ._utils import (
      LegacyMsgs,
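
The hunk above makes json_repair an optional dependency: when it is installed, jloads can recover slightly malformed JSON, and when it is missing, the fallback silently degrades to strict json.loads. A small standalone sketch of the behavioral difference (not part of the package itself):

    import json

    try:
        from json_repair import jloads   # lenient: attempts to repair broken JSON
    except ImportError:
        def jloads(x):
            return json.loads(x)         # strict: raises on malformed input

    print(jloads('{"a": 1}'))            # {'a': 1} in both cases
    # With json_repair installed, inputs like "{'a': 1,}" are typically repaired;
    # the json.loads fallback would raise json.JSONDecodeError instead.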
llm_utils/lm/llm.py CHANGED
@@ -9,6 +9,7 @@ import subprocess
  from typing import Any, Dict, List, Optional, Type, Union, cast

  import requests
+ from httpx import Timeout
  from loguru import logger
  from openai import AuthenticationError, BadRequestError, OpenAI, RateLimitError
  from openai.types.chat import ChatCompletionMessageParam
@@ -66,6 +67,7 @@ class LLM(
          vllm_timeout: int = 1200,
          vllm_reuse: bool = True,
          verbose=False,
+         timeout: float | Timeout | None = None,
          **model_kwargs,
      ):
          """Initialize LLMTask."""
@@ -83,8 +85,10 @@
          self.vllm_timeout = vllm_timeout
          self.vllm_reuse = vllm_reuse
          self.vllm_process: subprocess.Popen | None = None
+         self.timeout = timeout
          self.last_ai_response = None  # Store raw response from client
          self.cache = cache
+         self.api_key = client.api_key if isinstance(client, OpenAI) else 'abc'

          # Handle VLLM server startup if vllm_cmd is provided
          if self.vllm_cmd:
@@ -96,7 +100,11 @@
              client = port

          self.client = get_base_client(
-             client, cache=cache, vllm_cmd=self.vllm_cmd, vllm_process=self.vllm_process
+             client,
+             cache=cache,
+             api_key=self.api_key,
+             vllm_cmd=self.vllm_cmd,
+             vllm_process=self.vllm_process,
          )
          # check connection of client
          try:
@@ -165,6 +173,9 @@
          # Extract model name from kwargs for API call
          api_kwargs = {k: v for k, v in effective_kwargs.items() if k != 'model'}

+         if 'timeout' not in api_kwargs and self.timeout is not None:
+             api_kwargs['timeout'] = self.timeout
+
          try:
              completion = self.client.chat.completions.create(
                  model=model_name, messages=messages, **api_kwargs
@@ -220,6 +231,9 @@
          # Extract model name from kwargs for API call
          api_kwargs = {k: v for k, v in effective_kwargs.items() if k != 'model'}

+         if 'timeout' not in api_kwargs and self.timeout is not None:
+             api_kwargs['timeout'] = self.timeout
+
          pydantic_model_to_use_opt = response_model or self.output_model
          if pydantic_model_to_use_opt is None:
              raise ValueError(
@@ -398,6 +412,7 @@
          vllm_cmd: str | None = None,
          vllm_timeout: int = 120,
          vllm_reuse: bool = True,
+         timeout: float | Timeout | None = None,
          **model_kwargs,
      ) -> 'LLM':
          """
@@ -415,6 +430,7 @@
              vllm_cmd: Optional VLLM command to start server automatically
              vllm_timeout: Timeout in seconds to wait for VLLM server (default 120)
              vllm_reuse: If True (default), reuse existing server on target port
+             timeout: Optional OpenAI client timeout in seconds
              **model_kwargs: Additional model parameters
          """
          instruction = cls.get_instruction()
@@ -433,11 +449,9 @@
              vllm_cmd=vllm_cmd,
              vllm_timeout=vllm_timeout,
              vllm_reuse=vllm_reuse,
+             timeout=timeout,
              **model_kwargs,
          )
- from typing import Any, Dict, List, Optional, Type, Union
- from pydantic import BaseModel
- from .llm import LLM, Messages

  class LLM_NEMOTRON3(LLM):
      """
@@ -447,15 +461,15 @@ class LLM_NEMOTRON3(LLM):

      def __init__(
          self,
-         model: str = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+         model: str = 'nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16',
          thinking_budget: int = 1024,
          enable_thinking: bool = True,
-         **kwargs
+         **kwargs,
      ):
          # Force reasoning_model to True to enable reasoning_content extraction
          kwargs['is_reasoning_model'] = True
          super().__init__(**kwargs)
-
+
          self.model_kwargs['model'] = model
          self.thinking_budget = thinking_budget
          self.enable_thinking = enable_thinking
@@ -469,55 +483,48 @@ class LLM_NEMOTRON3(LLM):
          self,
          input_data: str | BaseModel | list[dict],
          thinking_budget: Optional[int] = None,
-         **kwargs
+         **kwargs,
      ) -> List[Dict[str, Any]]:
          budget = thinking_budget or self.thinking_budget
-
+
          if not self.enable_thinking:
              # Simple pass with thinking disabled in template
              return super().__call__(
-                 input_data,
-                 chat_template_kwargs={"enable_thinking": False},
-                 **kwargs
+                 input_data, chat_template_kwargs={'enable_thinking': False}, **kwargs
              )

          # --- STEP 1: Generate Thinking Trace ---
          # We manually append <think> to force the reasoning MoE layers
          messages = self._prepare_input(input_data)
-
+
          # We use the raw text completion for the budget phase
          # Stop at the closing tag or budget limit
          thinking_response = self.text_completion(
-             input_data,
-             max_tokens=budget,
-             stop=["</think>"],
-             **kwargs
+             input_data, max_tokens=budget, stop=['</think>'], **kwargs
          )[0]

          reasoning_content = thinking_response['parsed']
-
+
          # Ensure proper tag closing for the second pass
-         if "</think>" not in reasoning_content:
-             reasoning_content = f"{reasoning_content}\n</think>"
-         elif not reasoning_content.endswith("</think>"):
+         if '</think>' not in reasoning_content:
+             reasoning_content = f'{reasoning_content}\n</think>'
+         elif not reasoning_content.endswith('</think>'):
              # Ensure it ends exactly with the tag for continuity
-             reasoning_content = reasoning_content.split("</think>")[0] + "</think>"
+             reasoning_content = reasoning_content.split('</think>')[0] + '</think>'

          # --- STEP 2: Generate Final Answer ---
          # Append the thought to the assistant role and continue
          final_messages = messages + [
-             {"role": "assistant", "content": f"<think>\n{reasoning_content}\n"}
+             {'role': 'assistant', 'content': f'<think>\n{reasoning_content}\n'}
          ]
-
+
          # Use continue_final_message to prevent the model from repeating the header
          results = super().__call__(
-             final_messages,
-             extra_body={"continue_final_message": True},
-             **kwargs
+             final_messages, extra_body={'continue_final_message': True}, **kwargs
          )

          # Inject the reasoning back into the result for the UI/API
          for res in results:
              res['reasoning_content'] = reasoning_content
-
-         return results
+
+         return results
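
The llm.py changes thread a per-client timeout through to the chat-completion calls: the constructor and from_prompt now accept timeout as a float or an httpx.Timeout, and it is injected into api_kwargs whenever the caller has not already supplied one. A minimal sketch of how a caller might use it; the model keyword and other constructor arguments are illustrative assumptions, since only the timeout-related parameters appear in this diff:

    from httpx import Timeout
    from llm_utils import LLM

    # Hypothetical construction: a plain float applies one limit to the whole request...
    llm = LLM(model='gpt-4o-mini', timeout=30.0)

    # ...while an httpx.Timeout can split connect/read limits (useful for slow first tokens).
    llm_slow = LLM(model='gpt-4o-mini', timeout=Timeout(10.0, read=600.0))

A timeout passed explicitly for a single call still wins, because the instance default is only applied when 'timeout' is absent from api_kwargs.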
speedy_utils/__init__.py CHANGED
@@ -54,6 +54,7 @@ from .common.utils_print import (
  # Multi-worker processing
  from .multi_worker.process import multi_process
  from .multi_worker.thread import kill_all_thread, multi_thread
+ from .multi_worker.dataset_ray import multi_process_dataset_ray, WorkerResources


  __all__ = [
@@ -152,6 +153,8 @@ __all__ = [
      'multi_process',
      'multi_thread',
      'kill_all_thread',
+     'multi_process_dataset_ray',
+     'WorkerResources',
      # Notebook utilities
      'change_dir',
  ]
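
The new Ray-backed dataset helper is likewise re-exported at the package root. Its signature is not shown in this diff, so only the import implied by the __all__ entries above is sketched here:

    from speedy_utils import multi_process_dataset_ray, WorkerResources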
@@ -87,10 +87,12 @@ def load_json_or_pickle(fname: str, counter=0) -> Any:
      # EOFError: Ran out of input
      except EOFError:
          time.sleep(1)
+
          if counter > 5:
              # Keep message concise and actionable
              print(
-                 f'Corrupted cache file {fname} removed; it will be regenerated on next access'
+                 f"[load_json_or_pickle] EOFError reading cache file='{fname}' (attempt={counter}). "
+                 f"Assuming partial write/corruption; deleted file and will regenerate on next access."
              )
              os.remove(fname)
              raise
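
The expanded message documents the recovery strategy: transient EOFError reads (for example, another process still writing the cache) are retried after a short sleep, and only after several attempts is the file treated as corrupt, deleted, and the error re-raised so the value is recomputed. The surrounding function is not shown in this diff; the sketch below is a standalone illustration of that retry-then-invalidate pattern under assumed names, not the package's actual implementation:

    import os
    import pickle
    import time

    def load_cached(fname: str, counter: int = 0):
        """Illustrative only: retry transient EOFError reads, then drop the corrupt file."""
        try:
            with open(fname, 'rb') as f:
                return pickle.load(f)
        except EOFError:
            time.sleep(1)   # writer may still be flushing the file
            if counter > 5:
                print(f"[load_cached] EOFError reading cache file='{fname}' (attempt={counter}). "
                      f"Assuming partial write/corruption; deleting file so it is regenerated.")
                os.remove(fname)
                raise
            return load_cached(fname, counter + 1)   # bounded retry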
@@ -0,0 +1,12 @@
+ from .process import multi_process, cleanup_phantom_workers, create_progress_tracker
+ from .thread import multi_thread
+ from .dataset_ray import multi_process_dataset_ray, WorkerResources
+
+ __all__ = [
+     'multi_process',
+     'multi_thread',
+     'cleanup_phantom_workers',
+     'create_progress_tracker',
+     'multi_process_dataset_ray',
+     'WorkerResources',
+ ]
+ ]