speedy-utils 1.1.23__py3-none-any.whl → 1.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,9 +29,7 @@ def dump_jsonl(list_dictionaries: list[dict], file_name: str = "output.jsonl") -
         file.write(json.dumps(dictionary, ensure_ascii=False) + "\n")
 
 
-def dump_json_or_pickle(
-    obj: Any, fname: str, ensure_ascii: bool = False, indent: int = 4
-) -> None:
+def dump_json_or_pickle(obj: Any, fname: str, ensure_ascii: bool = False, indent: int = 4) -> None:
     """
     Dump an object to a file, supporting both JSON and pickle formats.
     """
@@ -59,6 +57,7 @@ def dump_json_or_pickle(
     if isinstance(obj, BaseModel):
         data = obj.model_dump()
         from fastcore.all import dict2obj, obj2dict
+
         obj2 = dict2obj(data)
         with open(fname, "wb") as f:
             pickle.dump(obj2, f)
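A quick usage sketch of the function this hunk touches. The top-level import and the extension-based choice between JSON and pickle are assumptions drawn from the docstring, and `User` is a hypothetical model:

```python
from pydantic import BaseModel

from speedy_utils import dump_json_or_pickle  # assumed top-level re-export


class User(BaseModel):
    name: str
    age: int


dump_json_or_pickle({"a": 1}, "out.json")  # plain data -> JSON
dump_json_or_pickle(User(name="ann", age=30), "out.pkl")  # BaseModel -> dict2obj -> pickle
```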
@@ -84,7 +83,8 @@ def load_json_or_pickle(fname: str, counter=0) -> Any:
     except EOFError:
         time.sleep(1)
         if counter > 5:
-            print("Error: Ran out of input", fname)
+            # Keep message concise and actionable
+            print(f"Corrupted cache file {fname} removed; it will be regenerated on next access")
             os.remove(fname)
             raise
         return load_json_or_pickle(fname, counter + 1)
@@ -92,8 +92,6 @@ def load_json_or_pickle(fname: str, counter=0) -> Any:
         raise ValueError(f"Error {e} while loading {fname}") from e
 
 
-
-
 try:
     import orjson  # type: ignore[import-not-found]  # fastest JSON parser when available
 except Exception:
@@ -113,11 +111,11 @@ def fast_load_jsonl(
     use_orjson: bool = True,
     encoding: str = "utf-8",
     errors: str = "strict",
-    on_error: str = "raise", # 'raise' | 'warn' | 'skip'
+    on_error: str = "raise",  # 'raise' | 'warn' | 'skip'
     skip_empty: bool = True,
     max_lines: Optional[int] = None,
     use_multiworker: bool = True,
-    multiworker_threshold: int = 50000,
+    multiworker_threshold: int = 1000000,
     workers: Optional[int] = None,
 ) -> Iterable[Any]:
     """
@@ -127,7 +125,7 @@ def fast_load_jsonl(
     - Optional tqdm progress over bytes (compressed size if gz/bz2/xz/zst).
     - Auto-detects compression by extension: .gz, .bz2, .xz/.lzma, .zst/.zstd.
     - Uses orjson if available (use_orjson=True), falls back to json.
-    - Automatically uses multi-worker processing for large files (>50k lines).
+    - Automatically uses multi-worker processing for large files (>100k lines).
 
     Args:
         path_or_file: Path-like or file-like object. File-like can be binary or text.
@@ -140,11 +138,12 @@ def fast_load_jsonl(
         max_lines: Stop after reading this many lines (useful for sampling).
         use_multiworker: Enable multi-worker processing for large files.
         multiworker_threshold: Line count threshold to trigger multi-worker processing.
-        workers: Number of worker threads (defaults to CPU count).
+        workers: Number of worker threads (defaults to 80% of CPU count, max 8).
 
     Yields:
         Parsed Python objects per line.
     """
+
     def _open_auto(pth_or_f) -> IO[Any]:
         if hasattr(pth_or_f, "read"):
             # ensure binary buffer for consistent byte-length progress
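A usage sketch of the updated signature; the import path and file name are illustrative, and the keyword arguments are the ones shown in this hunk:

```python
from speedy_utils import fast_load_jsonl  # assumed top-level re-export

# Compression is auto-detected from the .gz extension; malformed lines are
# skipped instead of raising, and max_lines keeps this a quick sample (it
# also disables the multi-worker path, per the check in the next hunk).
for record in fast_load_jsonl(
    "events.jsonl.gz",
    on_error="skip",
    max_lines=1_000,
):
    print(record)
```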
@@ -206,39 +205,47 @@ def fast_load_jsonl(
 
     # Check if we should use multi-worker processing
     should_use_multiworker = (
-        use_multiworker
+        use_multiworker
         and not hasattr(path_or_file, "read")  # Only for file paths, not file objects
         and max_lines is None  # Don't use multiworker if we're limiting lines
     )
-
+
     if should_use_multiworker:
         line_count = _count_lines_fast(cast(Union[str, os.PathLike], path_or_file))
         if line_count > multiworker_threshold:
             # Use multi-worker processing
             from ..multi_worker.thread import multi_thread
 
+            # Calculate optimal worker count: 80% of CPU count, capped at 8
+            cpu_count = os.cpu_count() or 4
+            default_workers = min(int(cpu_count * 0.8), 8)
+            num_workers = workers if workers is not None else default_workers
+            num_workers = max(1, num_workers)  # At least 1 worker
+
             # Read all lines into chunks
             f = _open_auto(path_or_file)
             all_lines = list(f)
             f.close()
-
-            # Split into chunks for workers
-            num_workers = workers or os.cpu_count() or 4
-            chunk_size = max(len(all_lines) // num_workers, 1000)
+
+            # Split into chunks - aim for ~10k-20k lines per chunk minimum
+            min_chunk_size = 10000
+            chunk_size = max(len(all_lines) // num_workers, min_chunk_size)
             chunks = []
             for i in range(0, len(all_lines), chunk_size):
-                chunks.append(all_lines[i:i + chunk_size])
-
+                chunks.append(all_lines[i : i + chunk_size])
+
             # Process chunks in parallel
             if progress:
-                print(f"Processing {line_count} lines with {num_workers} workers...")
-
+                print(f"Processing {line_count} lines with {num_workers} workers ({len(chunks)} chunks)...")
+
             chunk_results = multi_thread(_process_chunk, chunks, workers=num_workers, progress=progress)
-
+
             # Flatten results and yield
-            for chunk_result in chunk_results:
-                for obj in chunk_result:
-                    yield obj
+            if chunk_results:
+                for chunk_result in chunk_results:
+                    if chunk_result:
+                        for obj in chunk_result:
+                            yield obj
             return
 
     # Single-threaded processing (original logic)
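To make the new sizing arithmetic concrete, a worked example assuming a 16-core machine and a 1.2M-line file (both figures illustrative):

```python
cpu_count = 16                                          # assumed os.cpu_count()
default_workers = min(int(cpu_count * 0.8), 8)          # int(12.8) = 12, capped at 8
chunk_size = max(1_200_000 // default_workers, 10_000)  # 150_000 lines per chunk
num_chunks = -(-1_200_000 // chunk_size)                # ceiling division -> 8 chunks
print(default_workers, chunk_size, num_chunks)          # 8 150000 8
```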
@@ -266,7 +273,11 @@ def fast_load_jsonl(
             line_no += 1
             if pbar is not None:
                 # raw_line is bytes here; if not, compute byte length
-                nbytes = len(raw_line) if isinstance(raw_line, (bytes, bytearray)) else len(str(raw_line).encode(encoding, errors))
+                nbytes = (
+                    len(raw_line)
+                    if isinstance(raw_line, (bytes, bytearray))
+                    else len(str(raw_line).encode(encoding, errors))
+                )
                 pbar.update(nbytes)
 
             # Normalize to bytes -> str only if needed
@@ -322,7 +333,6 @@ def fast_load_jsonl(
             pass
 
 
-
 def load_by_ext(fname: Union[str, list[str]], do_memoize: bool = False) -> Any:
     """
     Load data based on file extension.
@@ -3,10 +3,12 @@
 import inspect
 import os
 from collections.abc import Callable
-from typing import Any
+from typing import Any, TypeVar
 
 from pydantic import BaseModel
 
+T = TypeVar("T")
+
 
 def mkdir_or_exist(dir_name: str) -> None:
     """Create a directory if it doesn't exist."""
@@ -50,10 +52,32 @@ def convert_to_builtin_python(input_data: Any) -> Any:
     raise ValueError(f"Unsupported type {type(input_data)}")
 
 
+def dedup(items: list[T], key: Callable[[T], Any]) -> list[T]:
+    """
+    Deduplicate items in a list based on a key function.
+
+    Args:
+        items: The list of items.
+        key: A function that takes an item and returns a hashable key.
+
+    Returns:
+        A list with duplicates removed, preserving the first occurrence.
+    """
+    seen = set()
+    result = []
+    for item in items:
+        k = key(item)
+        if k not in seen:
+            seen.add(k)
+            result.append(item)
+    return result
+
+
 __all__ = [
     "mkdir_or_exist",
     "flatten_list",
     "get_arg_names",
     "is_notebook",
     "convert_to_builtin_python",
+    "dedup",
 ]
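Usage sketch for the new `dedup` helper; the records are illustrative and the top-level re-export is assumed from the `__all__` update above:

```python
from speedy_utils import dedup  # assumed top-level re-export

rows = [{"id": 1, "v": "a"}, {"id": 2, "v": "b"}, {"id": 1, "v": "c"}]
unique = dedup(rows, key=lambda r: r["id"])
print(unique)  # [{'id': 1, 'v': 'a'}, {'id': 2, 'v': 'b'}] - first occurrence wins
```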
@@ -80,8 +80,10 @@
 
 import ctypes
 import os
+import sys
 import threading
 import time
+import traceback
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from heapq import heappop, heappush
@@ -99,12 +101,42 @@ except ImportError:  # pragma: no cover
 # Sensible defaults
 DEFAULT_WORKERS = (os.cpu_count() or 4) * 2
 
-T = TypeVar('T')
-R = TypeVar('R')
+T = TypeVar("T")
+R = TypeVar("R")
 
 SPEEDY_RUNNING_THREADS: list[threading.Thread] = []  # cooperative shutdown tracking
 _SPEEDY_THREADS_LOCK = threading.Lock()
 
+
+class UserFunctionError(Exception):
+    """Exception wrapper that highlights user function errors."""
+
+    def __init__(
+        self,
+        original_exception: Exception,
+        func_name: str,
+        input_value: Any,
+        user_traceback: list[traceback.FrameSummary],
+    ) -> None:
+        self.original_exception = original_exception
+        self.func_name = func_name
+        self.input_value = input_value
+        self.user_traceback = user_traceback
+
+        # Create a focused error message
+        tb_str = "".join(traceback.format_list(user_traceback))
+        msg = (
+            f'\nError in function "{func_name}" with input: {input_value!r}\n'
+            f"\nUser code traceback:\n{tb_str}"
+            f"{type(original_exception).__name__}: {original_exception}"
+        )
+        super().__init__(msg)
+
+    def __str__(self) -> str:
+        # Return focused error without infrastructure frames
+        return super().__str__()
+
+
 _PY_SET_ASYNC_EXC = ctypes.pythonapi.PyThreadState_SetAsyncExc
 try:
     _PY_SET_ASYNC_EXC.argtypes = (ctypes.c_ulong, ctypes.py_object)  # type: ignore[attr-defined]
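A small sketch of the message format this class produces. The module path is inferred from this diff, and the frame contents are hypothetical:

```python
import traceback

from speedy_utils.multi_worker.thread import UserFunctionError  # inferred path

frame = traceback.FrameSummary(
    "pipeline.py", 42, "transform", line="return 1 / row['count']"
)
err = UserFunctionError(
    ZeroDivisionError("division by zero"), "transform", {"count": 0}, [frame]
)
print(err)
# Error in function "transform" with input: {'count': 0}
#
# User code traceback:
#   File "pipeline.py", line 42, in transform
#     return 1 / row['count']
# ZeroDivisionError: division by zero
```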
@@ -133,7 +165,7 @@ def _track_threads(threads: Iterable[threading.Thread]) -> None:
 
 
 def _track_executor_threads(pool: ThreadPoolExecutor) -> None:
-    thread_set = getattr(pool, '_threads', None)
+    thread_set = getattr(pool, "_threads", None)
     if not thread_set:
         return
     _track_threads(tuple(thread_set))
@@ -152,7 +184,48 @@ def _worker(
     fixed_kwargs: Mapping[str, Any],
 ) -> R:
     """Execute the function with an item and fixed kwargs."""
-    return func(item, **fixed_kwargs)
+    # Validate func is callable before attempting to call it
+    if not callable(func):
+        func_type = type(func).__name__
+        raise TypeError(
+            f"\nmulti_thread: func parameter must be callable, "
+            f"got {func_type}: {func!r}\n"
+            f"Hint: Did you accidentally pass a {func_type} instead of a function?"
+        )
+
+    try:
+        return func(item, **fixed_kwargs)
+    except Exception as exc:
+        # Extract user code traceback (filter out infrastructure)
+        exc_tb = sys.exc_info()[2]
+
+        if exc_tb is not None:
+            tb_list = traceback.extract_tb(exc_tb)
+
+            # Filter to keep only user code frames
+            user_frames = []
+            skip_patterns = [
+                "multi_worker/thread.py",
+                "concurrent/futures/",
+                "threading.py",
+            ]
+
+            for frame in tb_list:
+                if not any(pattern in frame.filename for pattern in skip_patterns):
+                    user_frames.append(frame)
+
+            # If we have user frames, wrap in our custom exception
+            if user_frames:
+                func_name = getattr(func, "__name__", repr(func))
+                raise UserFunctionError(
+                    exc,
+                    func_name,
+                    item,
+                    user_frames,
+                ) from exc
+
+        # Fallback: re-raise original if we couldn't extract frames
+        raise
 
 
 def _run_batch(
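What this buys the caller, end to end. The import path is inferred from the relative import earlier in this diff, and the `stop_on_error` default is an assumption:

```python
from speedy_utils.multi_worker.thread import multi_thread  # inferred path


def parse_age(rec: dict) -> int:
    return int(rec["age"])  # KeyError on the malformed record


try:
    multi_thread(parse_age, [{"age": "31"}, {"name": "no-age"}])
except KeyError as exc:
    # With stop_on_error active, the re-raised KeyError carries only the
    # user-code frames, roughly:
    #   Error in "parse_age" with input: {'name': 'no-age'}
    #   ...
    #   KeyError: 'age'
    print(exc)
```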
@@ -164,14 +237,14 @@ def _run_batch(
 
 
 def _attach_metadata(fut: Future[Any], idx: int, logical_size: int) -> None:
-    setattr(fut, '_speedy_idx', idx)
-    setattr(fut, '_speedy_size', logical_size)
+    setattr(fut, "_speedy_idx", idx)
+    setattr(fut, "_speedy_size", logical_size)
 
 
 def _future_meta(fut: Future[Any]) -> tuple[int, int]:
     return (
-        getattr(fut, '_speedy_idx'),
-        getattr(fut, '_speedy_size'),
+        getattr(fut, "_speedy_idx"),
+        getattr(fut, "_speedy_size"),
     )
 
 
@@ -219,7 +292,7 @@ def _resolve_worker_count(workers: int | None) -> int:
     if workers is None:
         return DEFAULT_WORKERS
     if workers <= 0:
-        raise ValueError('workers must be a positive integer')
+        raise ValueError("workers must be a positive integer")
     return workers
 
 
@@ -227,18 +300,18 @@ def _normalize_batch_result(result: Any, logical_size: int) -> list[Any]:
     if logical_size == 1:
         return [result]
     if result is None:
-        raise ValueError('batched callable returned None for a batch result')
+        raise ValueError("batched callable returned None for a batch result")
     if isinstance(result, (str, bytes, bytearray)):
-        raise TypeError('batched callable must not return str/bytes when batching')
+        raise TypeError("batched callable must not return str/bytes when batching")
     if isinstance(result, Sequence):
         out = list(result)
     elif isinstance(result, Iterable):
         out = list(result)
     else:
-        raise TypeError('batched callable must return an iterable of results')
+        raise TypeError("batched callable must return an iterable of results")
     if len(out) != logical_size:
         raise ValueError(
-            f'batched callable returned {len(out)} items, expected {logical_size}',
+            f"batched callable returned {len(out)} items, expected {logical_size}",
         )
     return out
 
@@ -325,7 +398,7 @@ def multi_thread(
         results: list[R | None] = []
 
         for proc_idx, chunk in enumerate(chunks):
-            with tempfile.NamedTemporaryFile(delete=False, suffix='multi_thread.pkl') as fh:
+            with tempfile.NamedTemporaryFile(delete=False, suffix="multi_thread.pkl") as fh:
                 file_pkl = fh.name
             assert isinstance(in_process_multi_thread, Callable)
             proc = in_process_multi_thread(
@@ -347,28 +420,28 @@ def multi_thread(
 
         for proc, file_pkl in procs:
             proc.join()
-            logger.info('process finished: %s', proc)
+            logger.info("process finished: %s", proc)
             try:
                 results.extend(load_by_ext(file_pkl))
             finally:
                 try:
                     os.unlink(file_pkl)
                 except OSError as exc:  # pragma: no cover - best effort cleanup
-                    logger.warning('failed to remove temp file %s: %s', file_pkl, exc)
+                    logger.warning("failed to remove temp file %s: %s", file_pkl, exc)
         return results
 
     try:
         import pandas as pd
 
         if isinstance(inputs, pd.DataFrame):
-            inputs = cast(Iterable[T], inputs.to_dict(orient='records'))
+            inputs = cast(Iterable[T], inputs.to_dict(orient="records"))
     except ImportError:  # pragma: no cover - optional dependency
         pass
 
     if batch <= 0:
-        raise ValueError('batch must be a positive integer')
+        raise ValueError("batch must be a positive integer")
     if prefetch_factor <= 0:
-        raise ValueError('prefetch_factor must be a positive integer')
+        raise ValueError("prefetch_factor must be a positive integer")
 
     workers_val = _resolve_worker_count(workers)
     progress_update = max(progress_update, 1)
@@ -390,20 +463,12 @@ def multi_thread(
 
     bar = None
     last_bar_update = 0
-    if (
-        progress
-        and tqdm is not None
-        and logical_total is not None
-        and logical_total > 0
-    ):
+    if progress and tqdm is not None and logical_total is not None and logical_total > 0:
         bar = tqdm(
             total=logical_total,
             ncols=128,
-            colour='green',
-            bar_format=(
-                '{l_bar}{bar}| {n_fmt}/{total_fmt}'
-                ' [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
-            ),
+            colour="green",
+            bar_format=("{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"),
         )
 
     deadline = time.monotonic() + timeout if timeout is not None else None
@@ -417,11 +482,12 @@ def multi_thread(
     inflight: set[Future[Any]] = set()
     pool = ThreadPoolExecutor(
         max_workers=workers_val,
-        thread_name_prefix='speedy-thread',
+        thread_name_prefix="speedy-thread",
     )
-    shutdown_kwargs: dict[str, Any] = {'wait': True}
+    shutdown_kwargs: dict[str, Any] = {"wait": True}
 
     try:
+
         def submit_arg(arg: Any) -> None:
             nonlocal next_logical_idx
             if batch > 1:
@@ -451,7 +517,7 @@ def multi_thread(
                 if remaining <= 0:
                     _cancel_futures(inflight)
                     raise TimeoutError(
-                        f'multi_thread timed out after {timeout} seconds',
+                        f"multi_thread timed out after {timeout} seconds",
                     )
                 wait_timeout = max(remaining, 0.0)
 
@@ -464,7 +530,7 @@ def multi_thread(
             if not done:
                 _cancel_futures(inflight)
                 raise TimeoutError(
-                    f'multi_thread timed out after {timeout} seconds',
+                    f"multi_thread timed out after {timeout} seconds",
                 )
 
             for fut in done:
@@ -472,11 +538,37 @@ def multi_thread(
                 idx, logical_size = _future_meta(fut)
                 try:
                     result = fut.result()
+                except UserFunctionError as exc:
+                    # User function error - already has clean traceback
+                    logger.error(str(exc))
+
+                    if stop_on_error:
+                        _cancel_futures(inflight)
+                        # Create a clean exception without infrastructure frames
+                        # by re-creating the traceback
+                        orig_exc = exc.original_exception
+
+                        # Build new traceback from user frames only
+                        tb_str = "".join(traceback.format_list(exc.user_traceback))
+                        clean_msg = (
+                            f'\nError in "{exc.func_name}" '
+                            f"with input: {exc.input_value!r}\n\n{tb_str}"
+                            f"{type(orig_exc).__name__}: {orig_exc}"
+                        )
+
+                        # Raise a new instance of the original exception type
+                        # with our clean message
+                        new_exc = type(orig_exc)(clean_msg)
+                        # Suppress the "from" chain to avoid showing infrastructure
+                        raise new_exc from None
+
+                    out_items = [None] * logical_size
                 except Exception as exc:
+                    # Other errors (infrastructure, batching, etc.)
                     if stop_on_error:
                         _cancel_futures(inflight)
                         raise
-                    logger.exception('multi_thread task failed', exc_info=exc)
+                    logger.exception("multi_thread task failed", exc_info=exc)
                     out_items = [None] * logical_size
                 else:
                     try:
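The `raise new_exc from None` above suppresses implicit exception chaining. A minimal standalone illustration of that Python mechanism:

```python
def reraise_clean() -> None:
    try:
        raise RuntimeError("original failure inside a worker")
    except RuntimeError as exc:
        # "from None" sets __suppress_context__, so the traceback shows only
        # the summarized error, not the frames that handled the original one.
        raise ValueError(f"clean summary: {exc}") from None


# reraise_clean() prints a single ValueError traceback, without the
# "During handling of the above exception, another exception occurred" chain.
```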
@@ -484,7 +576,7 @@ def multi_thread(
                     except Exception as exc:
                         _cancel_futures(inflight)
                         raise RuntimeError(
-                            'batched callable returned an unexpected shape',
+                            "batched callable returned an unexpected shape",
                         ) from exc
 
                 collector.add(idx, out_items)
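For context, the shape contract that `_normalize_batch_result` enforces on batched callables, illustrated with a hypothetical function:

```python
def double_batch(xs: list[int]) -> list[int]:
    # One output per input, in the same order.
    return [x * 2 for x in xs]


# _normalize_batch_result(double_batch([1, 2, 3]), logical_size=3) -> [2, 4, 6]
# Returning str/bytes raises TypeError; a length mismatch raises ValueError.
```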
@@ -496,14 +588,10 @@ def multi_thread(
                         bar.update(delta)
                         last_bar_update = completed_items
                     submitted = next_logical_idx
-                    pending = (
-                        max(logical_total - submitted, 0)
-                        if logical_total is not None
-                        else '-'
-                    )
+                    pending = max(logical_total - submitted, 0) if logical_total is not None else "-"
                     postfix = {
-                        'processing': min(len(inflight), workers_val),
-                        'pending': pending,
+                        "processing": min(len(inflight), workers_val),
+                        "pending": pending,
                     }
                     bar.set_postfix(postfix)
 
@@ -516,7 +604,7 @@ def multi_thread(
         results = collector.finalize()
 
     except KeyboardInterrupt:
-        shutdown_kwargs = {'wait': False, 'cancel_futures': True}
+        shutdown_kwargs = {"wait": False, "cancel_futures": True}
         _cancel_futures(inflight)
         kill_all_thread(SystemExit)
         raise KeyboardInterrupt() from None
@@ -524,29 +612,27 @@ def multi_thread(
         try:
             pool.shutdown(**shutdown_kwargs)
         except TypeError:  # pragma: no cover - Python <3.9 fallback
-            pool.shutdown(shutdown_kwargs.get('wait', True))
+            pool.shutdown(shutdown_kwargs.get("wait", True))
         if bar:
             delta = completed_items - last_bar_update
             if delta > 0:
                 bar.update(delta)
             bar.close()
 
-    results = collector.finalize() if 'results' not in locals() else results
+    results = collector.finalize() if "results" not in locals() else results
     if store_output_pkl_file:
         dump_json_or_pickle(results, store_output_pkl_file)
     _prune_dead_threads()
     return results
 
 
-def multi_thread_standard(
-    fn: Callable[[T], R], items: Iterable[T], workers: int = 4
-) -> list[R]:
+def multi_thread_standard(fn: Callable[[T], R], items: Iterable[T], workers: int = 4) -> list[R]:
     """Execute ``fn`` across ``items`` while preserving submission order."""
 
     workers_val = _resolve_worker_count(workers)
     with ThreadPoolExecutor(
         max_workers=workers_val,
-        thread_name_prefix='speedy-thread',
+        thread_name_prefix="speedy-thread",
     ) as executor:
         futures: list[Future[R]] = []
         for item in items:
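Usage sketch for the reformatted helper; inputs are illustrative:

```python
squares = multi_thread_standard(lambda x: x * x, range(8), workers=4)
print(squares)  # [0, 1, 4, 9, 16, 25, 36, 49] - submission order preserved
```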
@@ -561,13 +647,13 @@ def _async_raise(thread_id: int, exc_type: type[BaseException]) -> bool:
     if thread_id <= 0:
         return False
     if not issubclass(exc_type, BaseException):
-        raise TypeError('exc_type must derive from BaseException')
+        raise TypeError("exc_type must derive from BaseException")
     res = _PY_SET_ASYNC_EXC(ctypes.c_ulong(thread_id), ctypes.py_object(exc_type))
     if res == 0:
         return False
     if res > 1:  # pragma: no cover - defensive branch
         _PY_SET_ASYNC_EXC(ctypes.c_ulong(thread_id), None)
-        raise SystemError('PyThreadState_SetAsyncExc failed')
+        raise SystemError("PyThreadState_SetAsyncExc failed")
     return True
 
 
@@ -596,16 +682,17 @@ def kill_all_thread(exc_type: type[BaseException] = SystemExit, join_timeout: fl
                 terminated += 1
                 thread.join(timeout=join_timeout)
             else:
-                logger.warning('Unable to signal thread %s', thread.name)
+                logger.warning("Unable to signal thread %s", thread.name)
         except Exception as exc:  # pragma: no cover - defensive
-            logger.error('Failed to stop thread %s: %s', thread.name, exc)
+            logger.error("Failed to stop thread %s: %s", thread.name, exc)
     _prune_dead_threads()
     return terminated
 
 
 __all__ = [
-    'SPEEDY_RUNNING_THREADS',
-    'multi_thread',
-    'multi_thread_standard',
-    'kill_all_thread',
+    "SPEEDY_RUNNING_THREADS",
+    "UserFunctionError",
+    "multi_thread",
+    "multi_thread_standard",
+    "kill_all_thread",
 ]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: speedy-utils
-Version: 1.1.23
+Version: 1.1.25
 Summary: Fast and easy-to-use package for data science
 Project-URL: Homepage, https://github.com/anhvth/speedy
 Project-URL: Repository, https://github.com/anhvth/speedy