speedy-utils 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +3 -2
- llm_utils/lm/async_lm/async_llm_task.py +1 -0
- llm_utils/lm/llm_task.py +303 -10
- llm_utils/lm/openai_memoize.py +10 -2
- llm_utils/vector_cache/core.py +250 -234
- speedy_utils/__init__.py +2 -1
- speedy_utils/common/utils_cache.py +38 -19
- speedy_utils/common/utils_io.py +9 -5
- speedy_utils/multi_worker/process.py +91 -10
- speedy_utils/multi_worker/thread.py +94 -2
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/METADATA +34 -13
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/RECORD +19 -19
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/WHEEL +1 -1
- speedy_utils-1.1.20.dist-info/entry_points.txt +5 -0
- speedy_utils-1.1.18.dist-info/entry_points.txt +0 -6
speedy_utils/__init__.py
CHANGED
@@ -138,7 +138,7 @@ from .common.utils_print import (
 
 # Multi-worker processing
 from .multi_worker.process import multi_process
-from .multi_worker.thread import multi_thread
+from .multi_worker.thread import kill_all_thread, multi_thread
 
 # Define __all__ explicitly
 __all__ = [
@@ -224,6 +224,7 @@ __all__ = [
     # Multi-worker processing
     "multi_process",
     "multi_thread",
+    "kill_all_thread",
    # Notebook utilities
     "change_dir",
 ]
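
With this change, `kill_all_thread` is exported from the package root alongside `multi_thread`. A minimal usage sketch, assuming `multi_thread`'s usual `(func, items)` call form (not shown in full in this diff); `slow_task` is a hypothetical placeholder, not part of the package:

```python
# Sketch only: `slow_task` is a hypothetical example function.
import time

from speedy_utils import kill_all_thread, multi_thread


def slow_task(x: int) -> int:
    time.sleep(1)  # stand-in for a long-running job
    return x * 2


results = multi_thread(slow_task, range(4))  # assumed (func, items) call form
print(kill_all_thread())  # signals any still-tracked worker threads, returns the count
```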

speedy_utils/common/utils_cache.py
CHANGED

@@ -258,13 +258,13 @@ def _memory_memoize(
 
         with mem_lock:
             if name in mem_cache:
-                return mem_cache[name]
+                return mem_cache[name]
 
         result = func(*args, **kwargs)
 
         with mem_lock:
             if name not in mem_cache:
-                mem_cache[name] = result
+                mem_cache[name] = result
         return result
 
     return wrapper
@@ -292,7 +292,7 @@ def _async_memory_memoize(
 
         async with alock:
             if name in mem_cache:
-                return mem_cache[name]
+                return mem_cache[name]
         task = inflight.get(name)
         if task is None:
             task = asyncio.create_task(func(*args, **kwargs))  # type: ignore[arg-type]
@@ -305,7 +305,7 @@ def _async_memory_memoize(
             inflight.pop(name, None)
 
         with mem_lock:
-            mem_cache[name] = result
+            mem_cache[name] = result
         return result
 
     return wrapper
@@ -447,7 +447,7 @@ def both_memoize(
         # Memory first
         with mem_lock:
             if mem_key in mem_cache:
-                return mem_cache[mem_key]
+                return mem_cache[mem_key]
 
         # Disk next
         if sub_dir == "funcs":
@@ -468,7 +468,7 @@ def both_memoize(
 
         if disk_result is not None:
             with mem_lock:
-                mem_cache[mem_key] = disk_result
+                mem_cache[mem_key] = disk_result
             return disk_result
 
         # Miss: compute, then write both
@@ -477,7 +477,7 @@ def both_memoize(
         if not osp.exists(cache_path):
             dump_json_or_pickle(result, cache_path)
         with mem_lock:
-            mem_cache[mem_key] = result
+            mem_cache[mem_key] = result
         return result
 
     return wrapper
@@ -506,7 +506,7 @@ def _async_both_memoize(
         # Memory
         async with alock:
             if mem_key in mem_cache:
-                return mem_cache[mem_key]
+                return mem_cache[mem_key]
 
         # Disk
         if sub_dir == "funcs":
@@ -526,7 +526,7 @@ def _async_both_memoize(
 
         if disk_result is not None:
             with mem_lock:
-                mem_cache[mem_key] = disk_result
+                mem_cache[mem_key] = disk_result
             return disk_result
 
         # Avoid duplicate async work for same key
@@ -550,7 +550,7 @@ def _async_both_memoize(
         await loop.run_in_executor(None, write_disk_cache)
 
         with mem_lock:
-            mem_cache[mem_key] = result
+            mem_cache[mem_key] = result
         return result
 
     return wrapper
@@ -561,9 +561,10 @@ def _async_both_memoize(
 # --------------------------------------------------------------------------------------
 
 
+# Define overloads to preserve exact type information
 @overload
 def memoize(
-    _func: Callable[P, R
+    _func: Callable[P, R],
     *,
     keys: Optional[list[str]] = ...,
     key: Optional[Callable[..., Any]] = ...,
@@ -572,7 +573,23 @@ def memoize(
     size: int = ...,
     ignore_self: bool = ...,
     verbose: bool = ...,
-) -> Callable[P, R
+) -> Callable[P, R]: ...
+
+
+@overload
+def memoize(
+    _func: Callable[P, Awaitable[R]],
+    *,
+    keys: Optional[list[str]] = ...,
+    key: Optional[Callable[..., Any]] = ...,
+    cache_dir: str = ...,
+    cache_type: Literal["memory", "disk", "both"] = ...,
+    size: int = ...,
+    ignore_self: bool = ...,
+    verbose: bool = ...,
+) -> Callable[P, Awaitable[R]]: ...
+
+
 @overload
 def memoize(
     _func: None = ...,
@@ -585,8 +602,10 @@ def memoize(
     ignore_self: bool = ...,
     verbose: bool = ...,
 ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
+
+
 @overload
-def memoize(
+def memoize(  # type: ignore
     _func: None = ...,
     *,
     keys: Optional[list[str]] = ...,
@@ -635,24 +654,24 @@ def memoize(
 
    if cache_type == "memory":
        if is_async:
-            return _async_memory_memoize(target_func, size, keys, ignore_self, key)
-        return _memory_memoize(target_func, size, keys, ignore_self, key)
+            return _async_memory_memoize(target_func, size, keys, ignore_self, key)
+        return _memory_memoize(target_func, size, keys, ignore_self, key)
 
    if cache_type == "disk":
        if is_async:
            return _async_disk_memoize(
                target_func, keys, cache_dir, ignore_self, verbose, key
-            )
+            )
        return _disk_memoize(
            target_func, keys, cache_dir, ignore_self, verbose, key
-        )
+        )
 
    # cache_type == "both"
    if is_async:
        return _async_both_memoize(
            target_func, keys, cache_dir, ignore_self, size, key
-        )
-    return both_memoize(target_func, keys, cache_dir, ignore_self, size, key)
+        )
+    return both_memoize(target_func, keys, cache_dir, ignore_self, size, key)
 
    # Support both @memoize and @memoize(...)
    if _func is None:
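
The new overload means `@memoize` now preserves `Awaitable[R]` return types for async callables instead of collapsing them to the sync signature. A minimal sketch of both forms, assuming `memoize` is importable from the package root (it also lives in `speedy_utils.common.utils_cache`); `square` and `fetch_square` are hypothetical example functions:

```python
# Sketch only: `square` and `fetch_square` are hypothetical example functions.
import asyncio

from speedy_utils import memoize


@memoize  # sync overload: Callable[P, R] -> Callable[P, R]
def square(x: int) -> int:
    return x * x


@memoize(cache_type="memory")  # async overload keeps the Awaitable[R] return type
async def fetch_square(x: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for an expensive async call
    return x * x


print(square(3))                     # first call computes, later calls hit the cache
print(asyncio.run(fetch_square(3)))  # repeated awaits with the same args reuse the cache
```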
speedy_utils/common/utils_io.py
CHANGED
@@ -1,13 +1,18 @@
 # utils/utils_io.py
 
+import bz2
+import gzip
+import io
 import json
+import lzma
 import os
 import os.path as osp
 import pickle
 import time
+import warnings
 from glob import glob
 from pathlib import Path
-from typing import Any, Union
+from typing import IO, Any, Iterable, Optional, Union, cast
 
 from json_repair import loads as jloads
 from pydantic import BaseModel
@@ -53,7 +58,7 @@ def dump_json_or_pickle(
     except Exception as e:
         if isinstance(obj, BaseModel):
             data = obj.model_dump()
-            from fastcore.all import
+            from fastcore.all import dict2obj, obj2dict
             obj2 = dict2obj(data)
             with open(fname, "wb") as f:
                 pickle.dump(obj2, f)
@@ -87,8 +92,7 @@ def load_json_or_pickle(fname: str, counter=0) -> Any:
         raise ValueError(f"Error {e} while loading {fname}") from e
 
 
-
-from typing import Iterable, Union, IO, Any, Optional, cast
+
 
 try:
     import orjson  # type: ignore[import-not-found]  # fastest JSON parser when available
@@ -212,7 +216,7 @@ def fast_load_jsonl(
     if line_count > multiworker_threshold:
         # Use multi-worker processing
         from ..multi_worker.thread import multi_thread
-
+
         # Read all lines into chunks
         f = _open_auto(path_or_file)
         all_lines = list(f)
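
The added `bz2`/`gzip`/`lzma`/`io` imports back transparent reading of compressed files around the `_open_auto` helper used by `fast_load_jsonl`; `_open_auto`'s body is not part of this diff. Purely as an illustration of that kind of extension-based dispatch, under stated assumptions and not the library's actual implementation:

```python
# Illustrative sketch only; not the real _open_auto from speedy_utils.
import bz2
import gzip
import lzma
from typing import IO


def open_auto_sketch(path: str) -> IO[bytes]:
    """Open a possibly-compressed file based on its extension (illustrative only)."""
    if path.endswith(".gz"):
        return gzip.open(path, "rb")
    if path.endswith(".bz2"):
        return bz2.open(path, "rb")
    if path.endswith((".xz", ".lzma")):
        return lzma.open(path, "rb")
    return open(path, "rb")
```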

speedy_utils/multi_worker/process.py
CHANGED

@@ -1,13 +1,32 @@
 # ray_multi_process.py
-import time, os, pickle, uuid, datetime
+import time, os, pickle, uuid, datetime, multiprocessing
+import datetime
+import os
+import pickle
+import time
+import uuid
 from pathlib import Path
 from typing import Any, Callable
 from tqdm import tqdm
+import psutil
+import threading
+ray: Any
+try:
+    import ray as ray  # type: ignore
+    _HAS_RAY = True
+except Exception:  # pragma: no cover
+    ray = None  # type: ignore
+    _HAS_RAY = False
+from typing import Any, Callable, Iterable
+
 import ray
 from fastcore.parallel import parallel
+from tqdm import tqdm
+
 
 # ─── cache helpers ──────────────────────────────────────────
 
+
 def _build_cache_dir(func: Callable, items: list[Any]) -> Path:
     """Build cache dir with function name + timestamp."""
     func_name = getattr(func, "__name__", "func")
@@ -18,6 +37,7 @@ def _build_cache_dir(func: Callable, items: list[Any]) -> Path:
     path.mkdir(parents=True, exist_ok=True)
     return path
 
+
 def wrap_dump(func: Callable, cache_dir: Path | None):
     """Wrap a function so results are dumped to .pkl when cache_dir is set."""
     if cache_dir is None:
@@ -29,12 +49,15 @@ def wrap_dump(func: Callable, cache_dir: Path | None):
         with open(p, "wb") as fh:
             pickle.dump(res, fh)
         return str(p)
+
     return wrapped
 
+
 # ─── ray management ─────────────────────────────────────────
 
 RAY_WORKER = None
 
+
 def ensure_ray(workers: int, pbar: tqdm | None = None):
     """Initialize or reinitialize Ray with a given worker count, log to bar postfix."""
     global RAY_WORKER
@@ -49,19 +72,22 @@ def ensure_ray(workers: int, pbar: tqdm | None = None):
         pbar.set_postfix_str(f"ray.init {workers} took {took:.2f}s")
     RAY_WORKER = workers
 
+
 # ─── main API ───────────────────────────────────────────────
 from typing import Literal
 
+
 def multi_process(
     func: Callable[[Any], Any],
-    items:
+    items: Iterable[Any] | None = None,
     *,
-    inputs:
+    inputs: Iterable[Any] | None = None,
     workers: int | None = None,
     lazy_output: bool = False,
     progress: bool = True,
     # backend: str = "ray", # "seq", "ray", or "fastcore"
-    backend: Literal["seq", "ray", "mp", "threadpool"] =
+    backend: Literal["seq", "ray", "mp", "threadpool", "safe"] | None = None,
+    backend: Literal["seq", "ray", "mp", "threadpool"] = "mp",
     # Additional optional knobs (accepted for compatibility)
     batch: int | None = None,
     ordered: bool | None = None,
@@ -75,15 +101,25 @@ def multi_process(
     backend:
       - "seq": run sequentially
      - "ray": run in parallel with Ray
-      - "
+      - "mp": run in parallel with multiprocessing (uses threadpool to avoid fork warnings)
+      - "threadpool": run in parallel with thread pool
+      - "safe": run in parallel with thread pool (explicitly safe for tests)
 
     If lazy_output=True, every result is saved to .pkl and
     the returned list contains file paths.
     """
 
+    # default backend selection
+    if backend is None:
+        backend = "ray" if _HAS_RAY else "mp"
+
     # unify items
+    # unify items and coerce to concrete list so we can use len() and
+    # iterate multiple times. This accepts ranges and other iterables.
     if items is None and inputs is not None:
         items = list(inputs)
+    if items is not None and not isinstance(items, list):
+        items = list(items)
     if items is None:
         raise ValueError("'items' or 'inputs' must be provided")
 
@@ -95,8 +131,9 @@ def multi_process(
     f_wrapped = wrap_dump(func, cache_dir)
 
     total = len(items)
-    with tqdm(
-
+    with tqdm(
+        total=total, desc=f"multi_process [{backend}]", disable=not progress
+    ) as pbar:
         # ---- sequential backend ----
         if backend == "seq":
             pbar.set_postfix_str("backend=seq")
@@ -108,6 +145,13 @@ def multi_process(
 
         # ---- ray backend ----
         if backend == "ray":
+            if not _HAS_RAY:
+                msg = (
+                    "Ray backend requested but 'ray' is not installed. "
+                    "Install extra: pip install 'speedy-utils[ray]' or "
+                    "poetry install -E ray."
+                )
+                raise RuntimeError(msg)
             pbar.set_postfix_str("backend=ray")
             ensure_ray(workers, pbar)
 
@@ -125,10 +169,47 @@ def multi_process(
 
         # ---- fastcore backend ----
         if backend == "mp":
-            results = parallel(
+            results = parallel(
+                f_wrapped, items, n_workers=workers, progress=progress, threadpool=False
+            )
             return list(results)
         if backend == "threadpool":
-            results = parallel(
+            results = parallel(
+                f_wrapped, items, n_workers=workers, progress=progress, threadpool=True
+            )
             return list(results)
-
+        if backend == "safe":
+            # Completely safe backend for tests - no multiprocessing, no external progress bars
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
+                results = list(executor.map(f_wrapped, items))
         raise ValueError(f"Unsupported backend: {backend!r}")
+
+
+
+def cleanup_phantom_workers():
+    """
+    Kill all child processes (phantom workers) without killing the Jupyter kernel itself.
+    Also lists non-daemon threads that remain.
+    """
+    parent = psutil.Process(os.getpid())
+
+    # Kill only children, never the current process
+    for child in parent.children(recursive=True):
+        try:
+            print(f"🔪 Killing child process {child.pid} ({child.name()})")
+            child.kill()
+        except psutil.NoSuchProcess:
+            pass
+
+    # Report stray threads (can't hard-kill them in Python)
+    for t in threading.enumerate():
+        if t is threading.current_thread():
+            continue
+        if not t.daemon:
+            print(f"⚠️ Thread {t.name} is still running (cannot be force-killed).")
+
+    print("✅ Cleaned up child processes (kernel untouched).")
+
+    # Usage: run this anytime after cancelling a cell
+
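
Based on the signature above, `multi_process` now accepts any iterable for `items`, chooses `ray` by default only when it is importable, and otherwise falls back to the multiprocessing backend. A minimal sketch; `double` is a hypothetical example function and the sequential-backend output comment is an expectation, not taken from the diff:

```python
# Sketch only: `double` is a hypothetical example function.
from speedy_utils import multi_process


def double(x: int) -> int:
    return x * 2


# Ranges and other iterables are coerced to a concrete list internally.
results = multi_process(double, range(8), backend="seq", progress=False)
print(results)  # expected: [0, 2, 4, 6, 8, 10, 12, 14]

# Leaving backend unset picks "ray" when ray is importable, otherwise "mp";
# forcing backend="ray" without ray installed raises RuntimeError pointing
# at `pip install 'speedy-utils[ray]'`.
```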

speedy_utils/multi_worker/thread.py
CHANGED

@@ -77,7 +77,9 @@
 # ============================================================================= #
 """
 
+import ctypes
 import os
+import threading
 import time
 import traceback
 from collections.abc import Callable, Iterable
@@ -98,6 +100,42 @@ DEFAULT_WORKERS = (os.cpu_count() or 4) * 2
 T = TypeVar("T")
 R = TypeVar("R")
 
+SPEEDY_RUNNING_THREADS: list[threading.Thread] = []
+_SPEEDY_THREADS_LOCK = threading.Lock()
+
+_PY_SET_ASYNC_EXC = ctypes.pythonapi.PyThreadState_SetAsyncExc
+try:
+    _PY_SET_ASYNC_EXC.argtypes = (ctypes.c_ulong, ctypes.py_object)  # type: ignore[attr-defined]
+    _PY_SET_ASYNC_EXC.restype = ctypes.c_int  # type: ignore[attr-defined]
+except AttributeError:  # pragma: no cover - platform specific
+    pass
+
+
+def _prune_dead_threads() -> None:
+    with _SPEEDY_THREADS_LOCK:
+        SPEEDY_RUNNING_THREADS[:] = [t for t in SPEEDY_RUNNING_THREADS if t.is_alive()]
+
+
+def _track_threads(threads: Iterable[threading.Thread]) -> None:
+    if not threads:
+        return
+    with _SPEEDY_THREADS_LOCK:
+        living = [t for t in SPEEDY_RUNNING_THREADS if t.is_alive()]
+        for candidate in threads:
+            if not candidate.is_alive():
+                continue
+            if any(existing is candidate for existing in living):
+                continue
+            living.append(candidate)
+        SPEEDY_RUNNING_THREADS[:] = living
+
+
+def _track_executor_threads(pool: ThreadPoolExecutor) -> None:
+    thread_set = getattr(pool, "_threads", None)
+    if not thread_set:
+        return
+    _track_threads(tuple(thread_set))
+
 
 def _group_iter(src: Iterable[T], size: int) -> Iterable[list[T]]:
     """Yield successive chunks from iterable of specified size."""
@@ -273,11 +311,13 @@ def multi_thread(
                 fut.idx = next_logical_idx  # type: ignore[attr-defined]
                 inflight.add(fut)
                 next_logical_idx += len(arg)
+                _track_executor_threads(pool)
             else:
                 fut = pool.submit(_worker, arg, func, fixed_kwargs)
                 fut.idx = next_logical_idx  # type: ignore[attr-defined]
                 inflight.add(fut)
                 next_logical_idx += 1
+                _track_executor_threads(pool)
 
         try:
             # Process futures as they complete and add new ones to keep the pool busy
@@ -347,11 +387,13 @@ def multi_thread(
                     fut2.idx = next_logical_idx  # type: ignore[attr-defined]
                     inflight.add(fut2)
                     next_logical_idx += len(arg)
+                    _track_executor_threads(pool)
                 else:
                     fut2 = pool.submit(_worker, arg, func, fixed_kwargs)
                     fut2.idx = next_logical_idx  # type: ignore[attr-defined]
                     inflight.add(fut2)
                     next_logical_idx += 1
+                    _track_executor_threads(pool)
             except StopIteration:
                 pass
 
@@ -370,6 +412,7 @@ def multi_thread(
     bar.close()
     if store_output_pkl_file:
         dump_json_or_pickle(results, store_output_pkl_file)
+    _prune_dead_threads()
     return results
 
 
@@ -396,9 +439,58 @@ def multi_thread_standard(
         Results in same order as input items.
     """
     with ThreadPoolExecutor(max_workers=workers) as executor:
-        futures = [
+        futures = []
+        for item in items:
+            futures.append(executor.submit(fn, item))
+            _track_executor_threads(executor)
        results = [fut.result() for fut in futures]
+    _prune_dead_threads()
     return results
 
 
-
+def _async_raise(thread_id: int, exc_type: type[BaseException]) -> bool:
+    if thread_id <= 0:
+        return False
+    if not issubclass(exc_type, BaseException):
+        raise TypeError("exc_type must derive from BaseException")
+    res = _PY_SET_ASYNC_EXC(ctypes.c_ulong(thread_id), ctypes.py_object(exc_type))
+    if res == 0:
+        return False
+    if res > 1:  # pragma: no cover - defensive branch
+        _PY_SET_ASYNC_EXC(ctypes.c_ulong(thread_id), None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+    return True
+
+
+def kill_all_thread(exc_type: type[BaseException] = SystemExit, join_timeout: float = 0.1) -> int:
+    """Forcefully stop tracked worker threads. Returns number of threads signalled."""
+    _prune_dead_threads()
+    current = threading.current_thread()
+    with _SPEEDY_THREADS_LOCK:
+        targets = [t for t in SPEEDY_RUNNING_THREADS if t.is_alive()]
+
+    terminated = 0
+    for thread in targets:
+        if thread is current:
+            continue
+        ident = thread.ident
+        if ident is None:
+            continue
+        try:
+            if _async_raise(ident, exc_type):
+                terminated += 1
+                thread.join(timeout=join_timeout)
+            else:
+                logger.warning("Unable to signal thread %s", thread.name)
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.error("Failed to stop thread %s: %s", thread.name, exc)
+    _prune_dead_threads()
+    return terminated
+
+
+__all__ = [
+    "SPEEDY_RUNNING_THREADS",
+    "multi_thread",
+    "multi_thread_standard",
+    "kill_all_thread",
+]
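
Taken together, these additions register `ThreadPoolExecutor` workers in `SPEEDY_RUNNING_THREADS` and let `kill_all_thread` inject `SystemExit` into each tracked thread via `PyThreadState_SetAsyncExc`. A rough sketch of how it might be exercised; `stuck_worker` is hypothetical, and the `workers` keyword of `multi_thread_standard` is assumed from the hunk above rather than shown in full:

```python
# Sketch only: `stuck_worker` is a hypothetical example function.
import time

from speedy_utils.multi_worker.thread import (
    SPEEDY_RUNNING_THREADS,
    kill_all_thread,
    multi_thread_standard,
)


def stuck_worker(x: int) -> int:
    time.sleep(0.2)  # stand-in for a call that might hang
    return x


results = multi_thread_standard(stuck_worker, list(range(4)), workers=2)

# Dead workers are pruned after each run; anything left here is still alive.
print(len(SPEEDY_RUNNING_THREADS))

# SystemExit is raised asynchronously inside each tracked thread; threads
# blocked in C extension calls only see it at the next bytecode boundary.
print(kill_all_thread())  # number of threads signalled
```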

{speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/METADATA
CHANGED

@@ -1,10 +1,14 @@
 Metadata-Version: 2.4
 Name: speedy-utils
-Version: 1.1.18
+Version: 1.1.20
 Summary: Fast and easy-to-use package for data science
-
-
-
+Project-URL: Homepage, https://github.com/anhvth/speedy
+Project-URL: Repository, https://github.com/anhvth/speedy
+Author-email: AnhVTH <anhvth.226@gmail.com>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
@@ -13,29 +17,34 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.8
+Requires-Dist: aiohttp
 Requires-Dist: bump2version
 Requires-Dist: cachetools
 Requires-Dist: debugpy
 Requires-Dist: fastcore
 Requires-Dist: fastprogress
-Requires-Dist: freezegun
+Requires-Dist: freezegun
 Requires-Dist: ipdb
 Requires-Dist: ipywidgets
-Requires-Dist: json-repair
+Requires-Dist: json-repair
 Requires-Dist: jupyterlab
 Requires-Dist: loguru
 Requires-Dist: matplotlib
 Requires-Dist: numpy
-Requires-Dist: openai
-Requires-Dist: packaging
+Requires-Dist: openai
+Requires-Dist: packaging
 Requires-Dist: pandas
 Requires-Dist: pydantic
+Requires-Dist: pytest
+Requires-Dist: ray
 Requires-Dist: requests
 Requires-Dist: scikit-learn
 Requires-Dist: tabulate
 Requires-Dist: tqdm
 Requires-Dist: xxhash
-
+Provides-Extra: ray
+Requires-Dist: ray>=2.49.1; (python_version >= '3.9') and extra == 'ray'
 Description-Content-Type: text/markdown
 
 # Speedy Utils
@@ -84,6 +93,19 @@ cd speedy-utils
 pip install .
 ```
 
+### Extras
+
+Optional dependencies can be installed via extras. For the `ray` backend
+support (requires Python >= 3.9):
+
+```bash
+# pip
+pip install 'speedy-utils[ray]'
+
+# Poetry (for developing this repo)
+poetry install -E ray
+```
+
 ## Updating from previous versions
 
 To update from previous versions or switch to v1.x, first uninstall any old
@@ -282,9 +304,8 @@ python speedy_utils/common/dataclass_parser.py
 
 Example output:
 
-| Field
-
-| from_peft
+| Field     | Value                                 |
+| --------- | ------------------------------------- |
+| from_peft | ./outputs/llm_hn_qw32b/hn_results_r3/ |
 
 Please ensure your code adheres to the project's coding standards and includes appropriate tests.
-