speedy-utils 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,416 @@
"""
USAGE:
Serve models and LoRAs with vLLM:

Serve a LoRA model:
    svllm serve --lora LORA_NAME LORA_PATH --gpus GPU_GROUPS

Serve a base model:
    svllm serve --model MODEL_NAME --gpus GPU_GROUPS

Add a LoRA to a served model:
    svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port
    (when adding, the port must be specified)
"""

from glob import glob
import os
import subprocess
import time
from typing import List, Literal, Optional

from fastcore.script import call_parse
from loguru import logger
import argparse
import requests
import openai

from speedy_utils.common.utils_io import load_by_ext


# Directory holding LoRA adapters; override with the LORA_DIR env var.
LORA_DIR: str = os.environ.get("LORA_DIR", "/loras")
LORA_DIR = os.path.abspath(LORA_DIR)
# Hugging Face cache location, forwarded to spawned vLLM processes.
HF_HOME: str = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
logger.info(f"LORA_DIR: {LORA_DIR}")
35
def model_list(host_port, api_key="abc"):
    """Print the ID of every model served at ``host_port``."""
    client = openai.OpenAI(base_url=f"http://{host_port}/v1", api_key=api_key)
    for model in client.models.list():
        print(f"Model ID: {model.id}")
42
def kill_existing_vllm(vllm_binary: Optional[str] = None) -> None:
    """Interactively kill running vLLM processes selected via fzf.

    Lists processes whose command line matches the vLLM binary, lets the
    user multi-select them with ``fzf --multi``, then SIGKILLs each PID.

    Args:
        vllm_binary: Path used to match processes; defaults to ``get_vllm()``.
    """
    if not vllm_binary:
        vllm_binary = get_vllm()

    # List running vLLM processes (grep -v grep drops the grep itself).
    result = subprocess.run(
        f"ps aux | grep {vllm_binary} | grep -v grep",
        shell=True,
        capture_output=True,
        text=True,
    )
    processes = result.stdout.strip().split("\n")

    if not processes or processes == [""]:
        print("No running vLLM processes found.")
        return

    # Use fzf to select processes to kill.
    fzf = subprocess.Popen(
        ["fzf", "--multi"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        text=True,
    )
    selected, _ = fzf.communicate("\n".join(processes))

    if not selected:
        print("No processes selected.")
        return

    # Extract PIDs (second column of `ps aux`) and SIGKILL each one.
    pids = [line.split()[1] for line in selected.strip().split("\n")]
    for pid in pids:
        # List-form invocation: no shell, no string interpolation of the PID.
        subprocess.run(
            ["kill", "-9", pid],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    print(f"Killed processes: {', '.join(pids)}")
85
def add_lora(
    lora_name_or_path: str,
    host_port: str,
    url: str = "http://HOST:PORT/v1/load_lora_adapter",
    served_model_name: Optional[str] = None,
    lora_module: Optional[str] = None,
) -> dict:
    """Register a LoRA adapter with a running vLLM server.

    Args:
        lora_name_or_path: Filesystem path of the LoRA adapter.
        host_port: ``host:port`` of the target vLLM server.
        url: Endpoint template; ``HOST:PORT`` is replaced with ``host_port``.
        served_model_name: Name to serve the adapter under; falls back to
            the basename of the adapter path when omitted.
        lora_module: Optional module spec forwarded to the server.

    Returns:
        The parsed JSON response, a synthesized success dict for non-JSON
        responses, or an ``{"error": ...}`` dict on request failure.
    """
    url = url.replace("HOST:PORT", host_port)
    headers = {"Content-Type": "application/json"}

    lora_path = os.path.abspath(lora_name_or_path)
    data = {
        # Fix: previously "lora_name" was None whenever served_model_name was
        # omitted; default to the adapter directory name so the server always
        # receives a usable name.
        "lora_name": served_model_name or os.path.basename(lora_path),
        "lora_path": lora_path,
    }
    if lora_module:  # Include lora_module only if provided
        data["lora_module"] = lora_module
    logger.info(f"{data=}, {headers}, {url=}")
    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()

        # Handle potential non-JSON responses
        try:
            return response.json()
        except ValueError:
            return {
                "status": "success",
                "message": (
                    response.text
                    if response.text.strip()
                    else "Request completed with empty response"
                ),
            }

    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {str(e)}")
        return {"error": f"Request failed: {str(e)}"}
125
def unload_lora(lora_name, host_port):
    """Unload a LoRA adapter from a running vLLM server.

    Args:
        lora_name: Name the adapter was registered under.
        host_port: ``host:port`` of the target vLLM server.

    Returns:
        ``{"status": "success"}`` on success or ``{"error": ...}`` on
        failure. (Previously success returned None while failure returned
        a dict, so callers could not distinguish outcomes consistently.)
    """
    try:
        url = f"http://{host_port}/v1/unload_lora_adapter"
        logger.info(f"{url=}")
        headers = {"Content-Type": "application/json"}
        data = {"lora_name": lora_name}
        logger.info(f"Unloading LoRA adapter: {data=}")
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        logger.success(f"Unloaded LoRA adapter: {lora_name}")
        return {"status": "success"}
    except requests.exceptions.RequestException as e:
        return {"error": f"Request failed: {str(e)}"}
139
def serve(
    model: str,
    gpu_groups: str,
    served_model_name: Optional[str] = None,
    port_start: int = 8155,
    gpu_memory_utilization: float = 0.93,
    dtype: str = "bfloat16",
    max_model_len: int = 8192,
    enable_lora: bool = False,
    is_bnb: bool = False,
    eager: bool = False,
    lora_modules: Optional[List[str]] = None,
) -> None:
    """Start one vLLM server per GPU group inside detached tmux sessions.

    Args:
        model: Model name or path passed to ``vllm serve``.
        gpu_groups: Comma-separated GPU groups, each group a run of
            single-digit GPU ids (e.g. ``"01,23"`` -> two 2-GPU servers).
            NOTE(review): ids >= 10 cannot be expressed in this scheme.
        served_model_name: Optional ``--served-model-name`` value.
        port_start: First port; group ``i`` listens on ``port_start + i``.
        gpu_memory_utilization: Fraction of GPU memory vLLM may use.
        dtype: Model dtype forwarded to vLLM.
        max_model_len: Maximum context length.
        enable_lora: Enable runtime LoRA loading on the servers.
        is_bnb: Use bitsandbytes quantization (auto-detected when the
            model name contains "bnb" or "4bit").
        eager: Pass ``--enforce-eager``.
        lora_modules: Flat ``[name, path, name, path, ...]`` list.
    """
    print("Starting vLLM containers...,")
    gpu_groups_arr: List[str] = gpu_groups.split(",")
    VLLM_BINARY: str = get_vllm()
    if enable_lora:
        # Prefix the env assignment so each spawned server allows runtime
        # LoRA updates regardless of the parent environment.
        VLLM_BINARY = "VLLM_ALLOW_RUNTIME_LORA_UPDATING=True " + VLLM_BINARY

    # Auto-detect quantization based on model name if not explicitly set
    if not is_bnb and model and ("bnb" in model.lower() or "4bit" in model.lower()):
        is_bnb = True
        print(f"Auto-detected quantization for model: {model}")

    # Also export in this process so children inherit it via os.system.
    if enable_lora:
        os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True"
        print("Enabled runtime LoRA updating")

    # Fix: inner LoRA loop previously reused the name `i`, shadowing this
    # loop index; use a distinct name for clarity.
    for group_idx, gpu_group in enumerate(gpu_groups_arr):
        port = port_start + group_idx
        # Expand a digit-run group into a CUDA_VISIBLE_DEVICES list: "01" -> "0,1".
        gpu_group = ",".join(gpu_group)
        tensor_parallel = len(gpu_group.split(","))

        cmd = [
            f"CUDA_VISIBLE_DEVICES={gpu_group}",
            VLLM_BINARY,
            "serve",
            model,
            "--port",
            str(port),
            # Fix: spell out the canonical flag; the abbreviated
            # "--tensor-parallel" relied on argparse prefix matching, which
            # breaks if the prefix ever becomes ambiguous.
            "--tensor-parallel-size",
            str(tensor_parallel),
            "--gpu-memory-utilization",
            str(gpu_memory_utilization),
            "--dtype",
            dtype,
            "--max-model-len",
            str(max_model_len),
            "--enable-prefix-caching",
            "--disable-log-requests",
            "--uvicorn-log-level critical",
        ]
        if HF_HOME:
            # Prepend so the env assignment precedes the binary invocation.
            cmd.insert(0, f"HF_HOME={HF_HOME}")
        if eager:
            cmd.append("--enforce-eager")

        if served_model_name:
            cmd.extend(["--served-model-name", served_model_name])

        if is_bnb:
            cmd.extend(
                ["--quantization", "bitsandbytes", "--load-format", "bitsandbytes"]
            )

        if enable_lora:
            cmd.extend(["--fully-sharded-loras", "--enable-lora"])

        if lora_modules:
            # Flat [name, path, name, path, ...] list -> "name=path" specs.
            assert len(lora_modules) % 2 == 0, "lora_modules must be even"
            specs = " ".join(
                f"{name}={module}"
                for name, module in zip(lora_modules[::2], lora_modules[1::2])
            )
            cmd.extend(["--lora-modules", specs])

        final_cmd = " ".join(cmd)
        log_file = f"/tmp/vllm_{port}.txt"
        final_cmd_with_log = f'"{final_cmd} 2>&1 | tee {log_file}"'
        run_in_tmux = (
            f"tmux new-session -d -s vllm_{port} 'bash -c {final_cmd_with_log}'"
        )

        print(final_cmd)
        print("Logging to", log_file)
        os.system(run_in_tmux)
238
def get_vllm():
    """Locate the vLLM executable.

    Resolution order: the ``VLLM_BINARY`` env var first, then ``which vllm``.
    (Previously ``which vllm`` ran unconditionally, so a binary missing from
    PATH crashed with CalledProcessError even when VLLM_BINARY was set.)

    Returns:
        Path to an existing vLLM executable.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    VLLM_BINARY = os.getenv("VLLM_BINARY")
    if not VLLM_BINARY:
        VLLM_BINARY = subprocess.check_output(
            "which vllm", shell=True, text=True
        ).strip()
    logger.info(f"vLLM binary: {VLLM_BINARY}")
    # Raise instead of assert: asserts are stripped under `python -O`.
    if not os.path.exists(VLLM_BINARY):
        raise FileNotFoundError(
            f"vLLM binary not found at {VLLM_BINARY}, please set VLLM_BINARY env variable"
        )
    return VLLM_BINARY
248
def get_args():
    """Build the svllm argument parser and parse the command line."""
    example_args = [
        "svllm serve --model MODEL_NAME --gpus 0,1,2,3",
        "svllm serve --lora LORA_NAME LORA_PATH --gpus 0,1,2,3",
        "svllm add_lora --lora LORA_NAME LORA_PATH --host_port localhost:8150",
        "svllm kill",
    ]

    p = argparse.ArgumentParser(
        description="vLLM Serve Script",
        epilog="Example: " + " || ".join(example_args),
    )
    # Positional: which sub-command to run.
    p.add_argument(
        "mode",
        choices=["serve", "kill", "add_lora", "unload_lora", "list_models"],
        help="Mode to run the script in",
    )
    # Model / GPU selection.
    p.add_argument("--model", "-m", type=str, help="Model to serve")
    p.add_argument(
        "--gpus", "-g", type=str, dest="gpu_groups",
        help="Comma-separated list of GPU groups",
    )
    p.add_argument(
        "--lora", "-l", nargs=2, metavar=("LORA_NAME", "LORA_PATH"),
        help="Name and path of the LoRA adapter",
    )
    p.add_argument("--served_model_name", type=str, help="Name of the served model")
    # Engine tuning knobs.
    p.add_argument(
        "--gpu_memory_utilization", "-gmu", type=float, default=0.9,
        help="GPU memory utilization",
    )
    p.add_argument("--dtype", type=str, default="auto", help="Data type")
    p.add_argument(
        "--max_model_len", "-mml", type=int, default=8192,
        help="Maximum model length",
    )
    # LoRA is on by default; this flag turns it off.
    p.add_argument(
        "--disable_lora", dest="enable_lora", action="store_false",
        default=True, help="Disable LoRA support",
    )
    p.add_argument("--bnb", action="store_true", help="Enable quantization")
    p.add_argument(
        "--not_verbose", action="store_true", help="Disable verbose logging"
    )
    p.add_argument("--vllm_binary", type=str, help="Path to the vLLM binary")
    p.add_argument(
        "--pipeline_parallel", "-pp", default=1, type=int,
        help="Number of pipeline parallel stages",
    )
    p.add_argument(
        "--extra_args", nargs=argparse.REMAINDER,
        help="Additional arguments for the serve command",
    )
    p.add_argument(
        "--host_port", "-hp", type=str, default="localhost:8150",
        help="Host and port for the server format: host:port",
    )
    p.add_argument("--eager", action="store_true", help="Enable eager execution")
    p.add_argument(
        "--lora_modules", "-lm", nargs="+", type=str,
        help="List of LoRA modules in the format lora_name lora_module",
    )
    return p.parse_args()
336
def _base_model_from_lora_config(lora_dir: str, use_bnb: bool) -> Optional[str]:
    """Resolve the base model name from a LoRA adapter's config.

    Reads ``adapter_config.json`` in ``lora_dir`` and returns its
    ``base_model_name_or_path``, stripping unsloth/bnb 4-bit suffixes when
    quantization is not requested. Returns None when the config file or the
    entry is missing (previously a missing entry crashed on ``None.endswith``).
    """
    lora_config = os.path.join(lora_dir, "adapter_config.json")
    if not os.path.exists(lora_config):
        return None
    config = load_by_ext(lora_config)
    model_name = config.get("base_model_name_or_path")
    if not model_name:
        return None
    # Handle different quantization suffixes.
    if not use_bnb:
        if model_name.endswith("-unsloth-bnb-4bit"):
            model_name = model_name.replace("-unsloth-bnb-4bit", "")
        elif model_name.endswith("-bnb-4bit"):
            model_name = model_name.replace("-bnb-4bit", "")
    logger.info(f"Model name from LoRA config: {model_name}")
    return model_name


def main():
    """Main entry point: dispatch on the parsed ``mode`` argument."""
    args = get_args()

    if args.mode == "serve":
        # Handle LoRA model serving via the --lora argument.
        if args.lora:
            lora_name, lora_path = args.lora
            if not args.lora_modules:
                args.lora_modules = [lora_name, lora_path]
            # Try to get the base model from the LoRA config if not specified.
            if args.model is None:
                resolved = _base_model_from_lora_config(lora_path, args.bnb)
                if resolved is not None:
                    args.model = resolved

        # Fallback: only --lora_modules was given (no --lora).
        if args.model is None and args.lora_modules is not None and not args.lora:
            resolved = _base_model_from_lora_config(args.lora_modules[1], args.bnb)
            if resolved is not None:
                args.model = resolved

        # Derive the starting port from host_port ("host:port").
        port_start = int(args.host_port.split(":")[-1])
        serve(
            args.model,
            args.gpu_groups,
            args.served_model_name,
            port_start,
            args.gpu_memory_utilization,
            args.dtype,
            args.max_model_len,
            args.enable_lora,
            args.bnb,
            args.eager,
            args.lora_modules,
        )

    elif args.mode == "kill":
        kill_existing_vllm(args.vllm_binary)
    elif args.mode == "add_lora":
        if args.lora:
            lora_name, lora_path = args.lora
            add_lora(lora_path, host_port=args.host_port, served_model_name=lora_name)
        else:
            # Fallback to old behavior: --model holds the LoRA path/name.
            lora_name = args.model
            add_lora(
                lora_name,
                host_port=args.host_port,
                served_model_name=args.served_model_name,
            )
    elif args.mode == "unload_lora":
        lora_name = args.lora[0] if args.lora else args.model
        unload_lora(lora_name, host_port=args.host_port)
    elif args.mode == "list_models":
        model_list(args.host_port)
    else:
        raise ValueError(f"Unknown mode: {args.mode}, ")


if __name__ == "__main__":
    main()
@@ -0,0 +1,85 @@
1
+ # Import specific functions and classes from modules
2
+ # Logger
3
+ from speedy_utils.common.logger import log, setup_logger
4
+
5
+ # Clock module
6
+ from .common.clock import Clock, speedy_timer, timef
7
+
8
+ # Function decorators
9
+ from .common.function_decorator import retry_runtime
10
+
11
+ # Cache utilities
12
+ from .common.utils_cache import identify, identify_uuid, memoize
13
+
14
+ # IO utilities
15
+ from .common.utils_io import (
16
+ dump_json_or_pickle,
17
+ dump_jsonl,
18
+ jdumps,
19
+ jloads,
20
+ load_by_ext,
21
+ load_json_or_pickle,
22
+ load_jsonl,
23
+ )
24
+
25
+ # Misc utilities
26
+ from .common.utils_misc import (
27
+ convert_to_builtin_python,
28
+ flatten_list,
29
+ get_arg_names,
30
+ is_notebook,
31
+ mkdir_or_exist,
32
+ )
33
+
34
+ # Print utilities
35
+ from .common.utils_print import (
36
+ display_pretty_table_html,
37
+ flatten_dict,
38
+ fprint,
39
+ print_table,
40
+ )
41
+
42
+ # Multi-worker processing
43
+ from .multi_worker.process import multi_process
44
+ from .multi_worker.thread import multi_thread
45
+
46
+ # Define __all__ explicitly
47
+ __all__ = [
48
+ # Clock module
49
+ "Clock",
50
+ "speedy_timer",
51
+ "timef",
52
+ # Function decorators
53
+ "retry_runtime",
54
+ # Cache utilities
55
+ "memoize",
56
+ "identify",
57
+ "identify_uuid",
58
+ # IO utilities
59
+ "dump_json_or_pickle",
60
+ "dump_jsonl",
61
+ "load_by_ext",
62
+ "load_json_or_pickle",
63
+ "load_jsonl",
64
+ "jdumps",
65
+ "jloads",
66
+ # Misc utilities
67
+ "mkdir_or_exist",
68
+ "flatten_list",
69
+ "get_arg_names",
70
+ "is_notebook",
71
+ "convert_to_builtin_python",
72
+ # Print utilities
73
+ "display_pretty_table_html",
74
+ "flatten_dict",
75
+ "fprint",
76
+ "print_table",
77
+ "setup_logger",
78
+ "log",
79
+ # Multi-worker processing
80
+ "multi_process",
81
+ "multi_thread",
82
+ ]
83
+
84
+ # Setup default logger
85
+ # setup_logger('D')
speedy_utils/all.py ADDED
@@ -0,0 +1,159 @@
1
+ # speedy_utils/all.py
2
+
3
+ # Provide a consolidated set of imports for convenience
4
+
5
+ # Standard library imports
6
+ import copy
7
+ import functools
8
+ import gc
9
+ import inspect
10
+ import json
11
+ import multiprocessing
12
+ import os
13
+ import os.path as osp
14
+ import pickle
15
+ import pprint
16
+ import random
17
+ import re
18
+ import sys
19
+ import textwrap
20
+ import threading
21
+ import time
22
+ import traceback
23
+ import uuid
24
+ from collections import Counter, defaultdict
25
+ from collections.abc import Callable
26
+ from concurrent.futures import ThreadPoolExecutor, as_completed
27
+ from glob import glob
28
+ from multiprocessing import Pool
29
+ from pathlib import Path
30
+ from threading import Lock
31
+ from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
32
+
33
+ # Third-party imports
34
+ import numpy as np
35
+ import pandas as pd
36
+ import xxhash
37
+ from IPython.core.getipython import get_ipython
38
+ from IPython.display import HTML, display
39
+ from loguru import logger
40
+ from pydantic import BaseModel
41
+ from tabulate import tabulate
42
+ from tqdm import tqdm
43
+
44
+ # Import specific functions from speedy_utils
45
+ from speedy_utils import ( # Clock module; Function decorators; Cache utilities; IO utilities; Misc utilities; Print utilities; Multi-worker processing
46
+ Clock,
47
+ convert_to_builtin_python,
48
+ display_pretty_table_html,
49
+ dump_json_or_pickle,
50
+ dump_jsonl,
51
+ flatten_dict,
52
+ flatten_list,
53
+ fprint,
54
+ get_arg_names,
55
+ identify,
56
+ identify_uuid,
57
+ is_notebook,
58
+ jdumps,
59
+ jloads,
60
+ load_by_ext,
61
+ load_json_or_pickle,
62
+ load_jsonl,
63
+ log,
64
+ memoize,
65
+ mkdir_or_exist,
66
+ multi_process,
67
+ multi_thread,
68
+ print_table,
69
+ retry_runtime,
70
+ setup_logger,
71
+ speedy_timer,
72
+ timef,
73
+ )
74
+
75
+ # Define __all__ explicitly with all exports
76
+ __all__ = [
77
+ # Standard library
78
+ "random",
79
+ "copy",
80
+ "functools",
81
+ "gc",
82
+ "inspect",
83
+ "json",
84
+ "multiprocessing",
85
+ "os",
86
+ "osp",
87
+ "pickle",
88
+ "pprint",
89
+ "re",
90
+ "sys",
91
+ "textwrap",
92
+ "threading",
93
+ "time",
94
+ "traceback",
95
+ "uuid",
96
+ "Counter",
97
+ "ThreadPoolExecutor",
98
+ "as_completed",
99
+ "glob",
100
+ "Pool",
101
+ "Path",
102
+ "Lock",
103
+ "defaultdict",
104
+ # Typing
105
+ "Any",
106
+ "Callable",
107
+ "Dict",
108
+ "Generic",
109
+ "List",
110
+ "Literal",
111
+ "Optional",
112
+ "TypeVar",
113
+ "Union",
114
+ # Third-party
115
+ "pd",
116
+ "xxhash",
117
+ "get_ipython",
118
+ "HTML",
119
+ "display",
120
+ "logger",
121
+ "BaseModel",
122
+ "tabulate",
123
+ "tqdm",
124
+ "np",
125
+ # Clock module
126
+ "Clock",
127
+ "speedy_timer",
128
+ "timef",
129
+ # Function decorators
130
+ "retry_runtime",
131
+ # Cache utilities
132
+ "memoize",
133
+ "identify",
134
+ "identify_uuid",
135
+ # IO utilities
136
+ "dump_json_or_pickle",
137
+ "dump_jsonl",
138
+ "load_by_ext",
139
+ "load_json_or_pickle",
140
+ "load_jsonl",
141
+ "jdumps",
142
+ "jloads",
143
+ # Misc utilities
144
+ "mkdir_or_exist",
145
+ "flatten_list",
146
+ "get_arg_names",
147
+ "is_notebook",
148
+ "convert_to_builtin_python",
149
+ # Print utilities
150
+ "display_pretty_table_html",
151
+ "flatten_dict",
152
+ "fprint",
153
+ "print_table",
154
+ "setup_logger",
155
+ "log",
156
+ # Multi-worker processing
157
+ "multi_process",
158
+ "multi_thread",
159
+ ]
File without changes