speedy-utils 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +29 -0
- llm_utils/chat_format.py +427 -0
- llm_utils/group_messages.py +120 -0
- llm_utils/lm/__init__.py +8 -0
- llm_utils/lm/base_lm.py +304 -0
- llm_utils/lm/utils.py +130 -0
- llm_utils/scripts/vllm_load_balancer.py +353 -0
- llm_utils/scripts/vllm_serve.py +416 -0
- speedy_utils/__init__.py +85 -0
- speedy_utils/all.py +159 -0
- {speedy → speedy_utils}/common/__init__.py +0 -0
- speedy_utils/common/clock.py +215 -0
- speedy_utils/common/function_decorator.py +66 -0
- speedy_utils/common/logger.py +207 -0
- speedy_utils/common/report_manager.py +112 -0
- speedy_utils/common/utils_cache.py +264 -0
- {speedy → speedy_utils}/common/utils_io.py +66 -19
- {speedy → speedy_utils}/common/utils_misc.py +25 -11
- speedy_utils/common/utils_print.py +216 -0
- speedy_utils/multi_worker/__init__.py +0 -0
- speedy_utils/multi_worker/process.py +198 -0
- speedy_utils/multi_worker/thread.py +327 -0
- speedy_utils/scripts/mpython.py +108 -0
- speedy_utils-1.0.5.dist-info/METADATA +279 -0
- speedy_utils-1.0.5.dist-info/RECORD +27 -0
- {speedy_utils-1.0.3.dist-info → speedy_utils-1.0.5.dist-info}/WHEEL +1 -2
- speedy_utils-1.0.5.dist-info/entry_points.txt +3 -0
- speedy/__init__.py +0 -53
- speedy/common/clock.py +0 -68
- speedy/common/utils_cache.py +0 -170
- speedy/common/utils_print.py +0 -138
- speedy/multi_worker.py +0 -121
- speedy_utils-1.0.3.dist-info/METADATA +0 -22
- speedy_utils-1.0.3.dist-info/RECORD +0 -12
- speedy_utils-1.0.3.dist-info/top_level.txt +0 -1
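The 1.0.5 wheel replaces the old single `speedy` package with two top-level packages, `speedy_utils` and `llm_utils`. As a quick orientation before the hunks below, here is a minimal, hypothetical smoke test of the new layout; the import paths mirror the file list above and the names come from the `__all__` exports shown in the diffs, but nothing here is an official example from the package.

```python
# Hypothetical smoke test of the 1.0.5 layout (paths taken from the file
# list above; the package may also re-export these names at top level).
from speedy_utils.common.utils_cache import identify, memoize
from speedy_utils.common.utils_io import dump_json_or_pickle, load_by_ext
from speedy_utils.common.utils_print import fprint

print(identify({"a": 1, "b": [2, 3]}))  # deterministic content hash
```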
speedy_utils/common/utils_cache.py (new file, +264 lines)

@@ -0,0 +1,264 @@
+import functools
+import inspect
+import json
+import os
+import os.path as osp
+import pickle
+import uuid
+from typing import Any, List, Literal
+
+import cachetools
+import pandas as pd
+import xxhash
+from loguru import logger
+from pydantic import BaseModel
+
+from .utils_io import dump_json_or_pickle, load_json_or_pickle
+from .utils_misc import mkdir_or_exist
+
+SPEED_CACHE_DIR = osp.join(osp.expanduser("~"), ".cache/speedy_cache")
+LRU_MEM_CACHE = cachetools.LRUCache(maxsize=128_000)
+from threading import Lock
+
+thread_locker = Lock()
+
+# Add two locks for thread-safe cache access
+disk_lock = Lock()
+mem_lock = Lock()
+
+
+def compute_func_id(func, args, kwargs, ignore_self, keys):
+    func_source = get_source(func)
+    if keys:
+        arg_spec = inspect.getfullargspec(func).args
+        used_args = {arg_spec[i]: arg for i, arg in enumerate(args)}
+        used_args.update(kwargs)
+        values = [used_args[k] for k in keys if k in used_args]
+        if not values:
+            raise ValueError(f"Keys {keys} not found in function arguments")
+        param_hash = identify(values)
+        dir_path = f"{func.__name__}_{identify(func_source)}"
+        key_id = f"{'_'.join(keys)}_{param_hash}.pkl"
+        return func_source, dir_path, key_id
+
+    if (
+        inspect.getfullargspec(func).args
+        and inspect.getfullargspec(func).args[0] == "self"
+        and ignore_self
+    ):
+        fid = (func_source, args[1:], kwargs)
+    else:
+        fid = (func_source, args, kwargs)
+    return func_source, "funcs", f"{identify(fid)}.pkl"
+
+
+def fast_serialize(x: Any) -> bytes:
+    try:
+        return json.dumps(x, sort_keys=True).encode("utf-8")
+    except (TypeError, ValueError):
+        return pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+def identify(obj: Any, depth=0, max_depth=2) -> str:
+    if isinstance(obj, (list, tuple)):
+        x = [identify(x, depth + 1, max_depth) for x in obj]
+        x = "\n".join(x)
+        return identify(x, depth + 1, max_depth)
+    # is pandas row or dict
+    elif isinstance(obj, (pd.DataFrame, pd.Series)):
+        x = str(obj.to_dict())
+        return identify(x, depth + 1, max_depth)
+    elif hasattr(obj, "__code__"):
+        return identify(get_source(obj), depth + 1, max_depth)
+    elif isinstance(obj, BaseModel):
+        obj = obj.model_dump()
+        return identify(obj, depth + 1, max_depth)
+    elif isinstance(obj, dict):
+        ks = sorted(obj.keys())
+        vs = [identify(obj[k], depth + 1, max_depth) for k in ks]
+        return identify([ks, vs], depth + 1, max_depth)
+    elif obj is None:
+        return identify("None", depth + 1, max_depth)
+    else:
+        primitive_types = [int, float, str, bool]
+        if not type(obj) in primitive_types:
+            logger.warning(f"Unknown type: {type(obj)}")
+        return xxhash.xxh64_hexdigest(fast_serialize(obj), seed=0)
+
+
+def identify_uuid(x: Any) -> str:
+    data = fast_serialize(x)
+    hash_obj = xxhash.xxh128(data, seed=0)
+    return str(uuid.UUID(bytes=hash_obj.digest()))
+
+
+def get_source(func):
+    code = inspect.getsource(func)
+    for r in [" ", "\n", "\t", "\r"]:
+        code = code.replace(r, "")
+    return code
+
+
+def _disk_memoize(func, keys, cache_dir, ignore_self, verbose):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            # Compute cache path as before
+            func_source, sub_dir, key_id = compute_func_id(
+                func, args, kwargs, ignore_self, keys
+            )
+            if func_source is None:
+                return func(*args, **kwargs)
+            if sub_dir == "funcs":
+                cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
+            else:
+                cache_path = osp.join(cache_dir, sub_dir, key_id)
+            mkdir_or_exist(osp.dirname(cache_path))
+
+            # First check with disk lock
+            with disk_lock:
+                if osp.exists(cache_path):
+                    # logger.debug(f"Cache HIT for {func.__name__}, key={cache_path}")
+                    try:
+                        return load_json_or_pickle(cache_path)
+                    except Exception as e:
+                        if osp.exists(cache_path):
+                            os.remove(cache_path)
+                        logger.opt(depth=1).warning(
+                            f"Error loading cache: {str(e)[:100]}, continue to recompute"
+                        )
+
+            result = func(*args, **kwargs)
+
+            # Write result under disk lock to avoid race conditions
+            with disk_lock:
+                if not osp.exists(cache_path):
+                    dump_json_or_pickle(result, cache_path)
+            return result
+        except Exception as e:
+            logger.opt(depth=1).warning(
+                f"Failed to cache {func.__name__}: {e}, continue to recompute without cache"
+            )
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def _memory_memoize(func, size, keys, ignore_self):
+    global LRU_MEM_CACHE
+    if LRU_MEM_CACHE.maxsize != size:
+        LRU_MEM_CACHE = cachetools.LRUCache(maxsize=size)
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        func_source, sub_dir, key_id = compute_func_id(
+            func, args, kwargs, ignore_self, keys
+        )
+        if func_source is None:
+            return func(*args, **kwargs)
+        name = identify((func_source, sub_dir, key_id))
+
+        if not hasattr(func, "_mem_cache"):
+            func._mem_cache = LRU_MEM_CACHE
+
+        with mem_lock:
+            if name in func._mem_cache:
+                # logger.debug(f"Cache HIT (memory) for {func.__name__}, key={name}")
+                return func._mem_cache[name]
+
+        result = func(*args, **kwargs)
+
+        with mem_lock:
+            if name not in func._mem_cache:
+                func._mem_cache[name] = result
+        return result
+
+    return wrapper
+
+
+def both_memoize(func, keys, cache_dir, ignore_self):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        func_source, sub_dir, key_id = compute_func_id(
+            func, args, kwargs, ignore_self, keys
+        )
+        if func_source is None:
+            return func(*args, **kwargs)
+
+        mem_key = identify((func_source, sub_dir, key_id))
+        if not hasattr(func, "_mem_cache"):
+            func._mem_cache = LRU_MEM_CACHE
+
+        with mem_lock:
+            if mem_key in func._mem_cache:
+                # logger.debug(f"Cache HIT (memory) for {func.__name__}, key={mem_key}")
+                return func._mem_cache[mem_key]
+
+        if sub_dir == "funcs":
+            cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
+        else:
+            cache_path = osp.join(cache_dir, sub_dir, key_id)
+        mkdir_or_exist(osp.dirname(cache_path))
+
+        with disk_lock:
+            if osp.exists(cache_path):
+                # logger.debug(f"Cache HIT (disk) for {func.__name__}, key={cache_path}")
+                result = load_json_or_pickle(cache_path)
+                with mem_lock:
+                    func._mem_cache[mem_key] = result
+                return result
+        # logger.debug(f"Cache MISS for {func.__name__}, key={cache_path}")
+        result = func(*args, **kwargs)
+
+        with disk_lock:
+            if not osp.exists(cache_path):
+                dump_json_or_pickle(result, cache_path)
+        with mem_lock:
+            func._mem_cache[mem_key] = result
+        return result
+
+    return wrapper
+
+
+def memoize(
+    _func=None,
+    *,
+    keys=None,
+    cache_dir=SPEED_CACHE_DIR,
+    cache_type: Literal["memory", "disk", "both"] = "disk",
+    size=10240,
+    ignore_self=True,
+    verbose=False,
+):
+    if "~/" in cache_dir:
+        cache_dir = osp.expanduser(cache_dir)
+
+    def decorator(func):
+        if cache_type == "memory":
+            return _memory_memoize(
+                func,
+                size,
+                keys,
+                ignore_self,
+            )
+        elif cache_type == "disk":
+            return _disk_memoize(
+                func,
+                keys,
+                cache_dir,
+                ignore_self,
+                verbose,
+            )
+        return both_memoize(
+            func,
+            keys,
+            cache_dir,
+            verbose,
+        )
+
+    if _func is None:
+        return decorator
+    return decorator(_func)
+
+
+__all__ = ["memoize", "identify", "identify_uuid"]
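The hunk above adds a thread-safe `memoize` decorator with three backends (`memory`, `disk`, `both`); cache keys combine a hash of the stripped function source with either the full argument tuple or, when `keys=` is given, just the named arguments. A minimal usage sketch, assuming the module is importable as `speedy_utils.common.utils_cache` and that results are picklable:

```python
# Sketch only: exercises the decorator added in the hunk above.
import time

from speedy_utils.common.utils_cache import identify, memoize


@memoize(cache_type="memory")               # in-process LRU cache
def slow_square(x: int) -> int:
    time.sleep(0.5)
    return x * x


@memoize(keys=["path"], cache_type="disk")  # disk cache keyed only on `path`
def load_config(path: str, verbose: bool = False) -> dict:
    return {"path": path, "loaded": True}


print(slow_square(4))             # computed once
print(slow_square(4))             # served from the memory cache
print(identify(["a", {"b": 2}]))  # stable hash used for cache keys
```

Because the key includes the function source, editing the function body automatically stops old cache entries from being reused.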
{speedy → speedy_utils}/common/utils_io.py (+66 -19)

@@ -4,13 +4,17 @@ import json
 import os
 import os.path as osp
 import pickle
+import time
 from glob import glob
-from
+from pathlib import Path
+from typing import Any
+
+from json_repair import loads as jloads

 from .utils_misc import mkdir_or_exist


-def dump_jsonl(list_dictionaries:
+def dump_jsonl(list_dictionaries: list[dict], file_name: str = "output.jsonl") -> None:
     """
     Dumps a list of dictionaries to a file in JSON Lines format.
     """
@@ -25,10 +29,20 @@ def dump_json_or_pickle(
     """
     Dump an object to a file, supporting both JSON and pickle formats.
     """
+    if isinstance(fname, Path):
+        fname = str(fname)
     mkdir_or_exist(osp.abspath(os.path.dirname(osp.abspath(fname))))
     if fname.endswith(".json"):
         with open(fname, "w", encoding="utf-8") as f:
-
+            try:
+                json.dump(obj, f, ensure_ascii=ensure_ascii, indent=indent)
+            # TypeError: Object of type datetime is not JSON serializable
+            except TypeError:
+                print(
+                    "Error: Object of type datetime is not JSON serializable",
+                    str(obj)[:1000],
+                )
+                raise
     elif fname.endswith(".jsonl"):
         dump_jsonl(obj, fname)
     elif fname.endswith(".pkl"):
@@ -38,29 +52,45 @@ def dump_json_or_pickle(
         raise NotImplementedError(f"File type {fname} not supported")


-def load_json_or_pickle(fname: str) -> Any:
+def load_json_or_pickle(fname: str, counter=0) -> Any:
     """
     Load an object from a file, supporting both JSON and pickle formats.
     """
     if fname.endswith(".json") or fname.endswith(".jsonl"):
-        with open(fname,
+        with open(fname, encoding="utf-8") as f:
             return json.load(f)
     else:
-
-
-
-
-
-
-
+        try:
+            with open(fname, "rb") as f:
+                return pickle.load(f)
+        # EOFError: Ran out of input
+        except EOFError:
+            time.sleep(1)
+            if counter > 5:
+                print("Error: Ran out of input", fname)
+                os.remove(fname)
+                raise
+            return load_json_or_pickle(fname, counter + 1)
+        except Exception as e:
+            raise ValueError(f"Error {e} while loading {fname}") from e
+
+
+def load_jsonl(path):
+    lines = open(path, encoding="utf-8").read().splitlines()
+    return [json.loads(line) for line in lines]
+
+
+def load_by_ext(fname: str | list[str], do_memoize: bool = False) -> Any:
     """
     Load data based on file extension.
     """
-
-
-
+    if isinstance(fname, Path):
+        fname = str(fname)
+    from speedy_utils import multi_process

-    from
+    from .utils_cache import (  # Adjust import based on your actual multi_worker module
+        memoize,
+    )

     try:
         if isinstance(fname, str) and "*" in fname:
@@ -76,12 +106,14 @@ def load_by_ext(

         return pd.read_csv(path, engine="pyarrow", **pd_kwargs)

-    def load_txt(path: str) ->
-        with open(path,
+    def load_txt(path: str) -> list[str]:
+        with open(path, encoding="utf-8") as f:
             return f.read().splitlines()

     def load_default(path: str) -> Any:
-        if path.endswith(".jsonl")
+        if path.endswith(".jsonl"):
+            return load_jsonl(path)
+        elif path.endswith(".json"):
         try:
             return load_json_or_pickle(path)
         except json.JSONDecodeError as exc:
@@ -109,3 +141,18 @@ def load_by_ext(
         return load_fn(fname)
     except Exception as e:
         raise ValueError(f"Error {e} while loading {fname}") from e
+
+
+def jdumps(obj, ensure_ascii=False, indent=2, **kwargs):
+    return json.dumps(obj, ensure_ascii=ensure_ascii, indent=indent, **kwargs)
+
+
+__all__ = [
+    "dump_json_or_pickle",
+    "dump_jsonl",
+    "load_by_ext",
+    "load_json_or_pickle",
+    "load_jsonl",
+    "jdumps",
+    "jloads",
+]
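The I/O changes above add `pathlib.Path` support, a retry loop for truncated pickle reads, and the new `load_jsonl` and `jdumps` helpers, all exported via `__all__`. A brief usage sketch, under the assumption that the module lives at `speedy_utils.common.utils_io` as in the file list:

```python
# Sketch only; file paths and records are illustrative.
from pathlib import Path

from speedy_utils.common.utils_io import (
    dump_json_or_pickle,
    jdumps,
    load_by_ext,
    load_jsonl,
)

records = [{"id": 1, "ok": True}, {"id": 2, "ok": False}]

dump_json_or_pickle(records, Path("/tmp/records.jsonl"))  # Path objects now accepted
print(load_jsonl("/tmp/records.jsonl"))                   # -> list of dicts
print(load_by_ext("/tmp/records.jsonl"))                  # dispatches on the extension
print(jdumps({"greeting": "héllo"}))                      # non-ASCII preserved, indent=2
```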
{speedy → speedy_utils}/common/utils_misc.py (+25 -11)

@@ -3,32 +3,37 @@
 import inspect
 import os
 import sys
-from
-from
-
+from collections.abc import Callable
+from typing import Any, List
+
+from pydantic import BaseModel


 def mkdir_or_exist(dir_name: str) -> None:
     """Create a directory if it doesn't exist."""
     os.makedirs(dir_name, exist_ok=True)

-
+
+def flatten_list(list_of_lists: list[list[Any]]) -> list[Any]:
     """Flatten a list of lists into a single list."""
     return [item for sublist in list_of_lists for item in sublist]

-
+
+def get_arg_names(func: Callable) -> list[str]:
     """Retrieve argument names of a function."""
     return inspect.getfullargspec(func).args


-
-def is_interactive() -> bool:
-    """Check if the environment is interactive (e.g., Jupyter notebook)."""
+def is_notebook() -> bool:
     try:
-        get_ipython()
-
+        if "get_ipython" in globals().keys():
+            get_ipython = globals()["get_ipython"]
+            shell = get_ipython().__class__.__name__
+            if shell == "ZMQInteractiveShell":
+                return True  # Jupyter notebook or qtconsole
+            return False  # Other type (?)
     except NameError:
-        return
+        return False  # Probably standard Python interpreter


 def convert_to_builtin_python(input_data: Any) -> Any:
@@ -44,3 +49,12 @@ def convert_to_builtin_python(input_data: Any) -> Any:
         return convert_to_builtin_python(data)
     else:
         raise ValueError(f"Unsupported type {type(input_data)}")
+
+
+__all__ = [
+    "mkdir_or_exist",
+    "flatten_list",
+    "get_arg_names",
+    "is_notebook",
+    "convert_to_builtin_python",
+]
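In the hunk above, `is_interactive` is renamed to `is_notebook` and now inspects the running IPython shell class instead of merely calling `get_ipython()`. A small, hedged sketch of the helpers this module exports (import path assumed from the file list):

```python
# Sketch only; demonstrates the utils_misc helpers shown above.
from speedy_utils.common.utils_misc import (
    flatten_list,
    get_arg_names,
    is_notebook,
    mkdir_or_exist,
)


def train(model, lr=1e-3, epochs=10):
    return None


print(flatten_list([[1, 2], [3]]))   # -> [1, 2, 3]
print(get_arg_names(train))          # -> ['model', 'lr', 'epochs']
print(is_notebook())                 # False in a plain interpreter or script
mkdir_or_exist("/tmp/speedy_demo")   # no error if the directory already exists
```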
speedy_utils/common/utils_print.py (new file, +216 lines)

@@ -0,0 +1,216 @@
+# utils/utils_print.py
+
+import copy
+import inspect
+import json
+import pprint
+import re
+import sys
+import textwrap
+import time
+from collections import OrderedDict
+from typing import Annotated, Any, Dict, List, Literal, Optional
+
+from IPython.display import HTML, display
+from loguru import logger
+from tabulate import tabulate
+
+from .utils_misc import is_notebook
+
+
+def display_pretty_table_html(data: dict) -> None:
+    """
+    Display a pretty HTML table in Jupyter notebooks.
+    """
+    table = "<table>"
+    for key, value in data.items():
+        table += f"<tr><td>{key}</td><td>{value}</td></tr>"
+    table += "</table>"
+    display(HTML(table))
+
+
+# Flattening the dictionary using "." notation for keys
+def flatten_dict(d, parent_key="", sep="."):
+    items = []
+    for k, v in d.items():
+        new_key = parent_key + sep + k if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_dict(v, new_key, sep=sep).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
+def fprint(
+    input_data: Any,
+    key_ignore: list[str] | None = None,
+    key_keep: list[str] | None = None,
+    max_width: int = 100,
+    indent: int = 2,
+    depth: int | None = None,
+    table_format: str = "grid",
+    str_wrap_width: int = 80,
+    grep=None,
+    is_notebook=None,
+    f=print,
+) -> None | str:
+    """
+    Pretty print structured data.
+    """
+    if isinstance(input_data, list):
+        for i, item in enumerate(input_data):
+            fprint(
+                item,
+                key_ignore,
+                key_keep,
+                max_width,
+                indent,
+                depth,
+                table_format,
+                str_wrap_width,
+                grep,
+                is_notebook,
+                f,
+            )
+            print("\n" + "-" * 100 + "\n")
+
+    from speedy_utils import is_notebook as is_interactive
+
+    # is_notebook = is_notebook or is_interactive()
+    if is_notebook is None:
+        is_notebook = is_interactive()
+    if isinstance(input_data, list):
+        if all(hasattr(item, "toDict") for item in input_data):
+            input_data = [item.toDict() for item in input_data]
+    elif hasattr(input_data, "toDict"):
+        input_data = input_data.toDict()
+
+    if isinstance(input_data, list):
+        if all(hasattr(item, "to_dict") for item in input_data):
+            input_data = [item.to_dict() for item in input_data]
+    elif hasattr(input_data, "to_dict"):
+        input_data = input_data.to_dict()
+
+    if isinstance(input_data, list):
+        if all(hasattr(item, "model_dump") for item in input_data):
+            input_data = [item.model_dump() for item in input_data]
+    elif hasattr(input_data, "model_dump"):
+        input_data = input_data.model_dump()
+    if not isinstance(input_data, (dict, str)):
+        raise ValueError("Input data must be a dictionary or string")
+
+    if isinstance(input_data, dict):
+        input_data = flatten_dict(input_data)
+
+    if grep is not None and isinstance(input_data, dict):
+        input_data = {k: v for k, v in input_data.items() if grep in str(k)}
+
+    def remove_keys(d: dict, keys: list[str]) -> dict:
+        """Remove specified keys from a dictionary."""
+        for key in keys:
+            parts = key.split(".")
+            sub_dict = d
+            for part in parts[:-1]:
+                sub_dict = sub_dict.get(part, {})
+            sub_dict.pop(parts[-1], None)
+        return d
+
+    def keep_keys(d: dict, keys: list[str]) -> dict:
+        """Keep only specified keys in a dictionary."""
+        result = {}
+        for key in keys:
+            parts = key.split(".")
+            sub_source = d
+            sub_result = result
+            for part in parts[:-1]:
+                if part not in sub_source:
+                    break
+                sub_result = sub_result.setdefault(part, {})
+                sub_source = sub_source[part]
+            else:
+                sub_result[parts[-1]] = copy.deepcopy(sub_source.get(parts[-1]))
+        return result
+
+    if hasattr(input_data, "to_dict") and not isinstance(input_data, str):
+        input_data = input_data.to_dict()
+
+    processed_data = copy.deepcopy(input_data)
+
+    if isinstance(processed_data, dict) and is_notebook:
+        if key_keep is not None:
+            processed_data = keep_keys(processed_data, key_keep)
+        elif key_ignore is not None:
+            processed_data = remove_keys(processed_data, key_ignore)
+
+    if is_notebook:
+        display_pretty_table_html(processed_data)
+        return
+
+    if isinstance(processed_data, dict):
+        table = [[k, v] for k, v in processed_data.items()]
+        f(
+            tabulate(
+                table,
+                headers=["Key", "Value"],
+                tablefmt=table_format,
+                maxcolwidths=[None, max_width],
+            )
+        )
+    elif isinstance(processed_data, str):
+        wrapped_text = textwrap.fill(processed_data, width=str_wrap_width)
+        f(wrapped_text)
+    elif isinstance(processed_data, list):
+        f(tabulate(processed_data, tablefmt=table_format))
+    else:
+        printer = pprint.PrettyPrinter(width=max_width, indent=indent, depth=depth)
+        printer.pprint(processed_data)
+
+
+def print_table(data: Any, use_html: bool = True) -> None:
+    """
+    Print data as a table. If use_html is True, display using IPython HTML.
+    """
+
+    def __get_table(data: Any) -> str:
+        if isinstance(data, str):
+            try:
+                data = json.loads(data)
+            except json.JSONDecodeError as exc:
+                raise ValueError("String input could not be decoded as JSON") from exc
+
+        if isinstance(data, list):
+            if all(isinstance(item, dict) for item in data):
+                headers = list(data[0].keys())
+                rows = [list(item.values()) for item in data]
+                return tabulate(
+                    rows, headers=headers, tablefmt="html" if use_html else "grid"
+                )
+            else:
+                raise ValueError("List must contain dictionaries")
+
+        if isinstance(data, dict):
+            headers = ["Key", "Value"]
+            rows = list(data.items())
+            return tabulate(
+                rows, headers=headers, tablefmt="html" if use_html else "grid"
+            )
+
+        raise TypeError(
+            "Input data must be a list of dictionaries, a dictionary, or a JSON string"
+        )
+
+    table = __get_table(data)
+    if use_html:
+        display(HTML(table))
+    else:
+        print(table)
+
+
+__all__ = [
+    "display_pretty_table_html",
+    "flatten_dict",
+    "fprint",
+    "print_table",
+    # "setup_logger",
+    # "log",
+]
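`utils_print.py` is a new module built around `fprint`, which flattens nested dicts into dotted keys, optionally filters them with `grep`, and renders an HTML table in notebooks or a `tabulate` grid elsewhere, plus the simpler `print_table`. A short, hedged usage sketch; outside a notebook both helpers fall back to plain-text tables:

```python
# Sketch only; shows the two printers added in this release.
from speedy_utils.common.utils_print import fprint, print_table

cfg = {"model": {"name": "llama", "dim": 4096}, "optim": {"lr": 1e-4}}

# Keys are flattened to "model.name", "model.dim", ...; grep keeps matching keys.
fprint(cfg, grep="model")

# print_table accepts a dict, a list of dicts, or a JSON string.
print_table(
    [{"file": "utils_cache.py", "added": 264},
     {"file": "utils_print.py", "added": 216}],
    use_html=False,
)
```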