speedy-utils 1.1.12__tar.gz → 1.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/PKG-INFO +2 -1
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/pyproject.toml +2 -1
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/__init__.py +4 -1
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/async_lm.py +2 -1
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/async_lm_base.py +6 -6
- speedy_utils-1.1.14/src/llm_utils/vector_cache/__init__.py +25 -0
- speedy_utils-1.1.14/src/llm_utils/vector_cache/cli.py +200 -0
- speedy_utils-1.1.14/src/llm_utils/vector_cache/core.py +557 -0
- speedy_utils-1.1.14/src/llm_utils/vector_cache/types.py +15 -0
- speedy_utils-1.1.14/src/llm_utils/vector_cache/utils.py +42 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/__init__.py +1 -1
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/logger.py +2 -2
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/utils_io.py +2 -2
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/utils_print.py +5 -5
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/multi_worker/process.py +4 -4
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/multi_worker/thread.py +4 -4
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/README.md +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/chat_format/__init__.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/chat_format/display.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/chat_format/transform.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/chat_format/utils.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/group_messages.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/__init__.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/__init__.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/_utils.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/openai_memoize.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/utils.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/scripts/README.md +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/scripts/vllm_serve.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/all.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/__init__.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/clock.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/function_decorator.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/notebook_utils.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/report_manager.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/utils_cache.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/utils_misc.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/multi_worker/__init__.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/scripts/__init__.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/scripts/mpython.py +0 -0
- {speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
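
The headline change in 1.1.14 is the new `src/llm_utils/vector_cache/` package listed above, which adds a SQLite-backed `VectorCache` exported from `llm_utils`. A minimal usage sketch assembled from the docstring examples in the new modules; the model names, texts, and API key are illustrative placeholders, and the vLLM or OpenAI dependencies must be available for the corresponding backend:

```python
# Sketch based on the docstring examples shipped in 1.1.14; values are placeholders.
from llm_utils import VectorCache

# Local model (vLLM is the default backend for non-URL identifiers).
# Embeddings are cached in SQLite, so repeated calls only recompute new texts.
cache = VectorCache("Qwen/Qwen3-Embedding-0.6B")
embeddings = cache.embeds(["Hello world", "How are you?"])  # numpy array, one row per text

# OpenAI-compatible HTTP endpoints are detected from the URL prefix.
api_cache = VectorCache("https://api.openai.com/v1", api_key="your-key")
api_embeddings = api_cache.embeds(["Hello world"])
```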
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: speedy-utils
-Version: 1.1.12
+Version: 1.1.14
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com
@@ -25,6 +25,7 @@ Requires-Dist: jupyterlab
 Requires-Dist: loguru
 Requires-Dist: matplotlib
 Requires-Dist: numpy
+Requires-Dist: openai (>=1.106.0,<2.0.0)
 Requires-Dist: packaging (>=23.2,<25)
 Requires-Dist: pandas
 Requires-Dist: pydantic
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "speedy-utils"
-version = "1.1.12"
+version = "1.1.14"
 description = "Fast and easy-to-use package for data science"
 authors = ["AnhVTH <anhvth.226@gmail.com>"]
 readme = "README.md"
@@ -58,6 +58,7 @@ json-repair = ">=0.25.0,<0.31.0"
 fastprogress = "*"
 freezegun = "^1.5.1"
 packaging = ">=23.2,<25"
+openai = "^1.106.0"

 [tool.poetry.scripts]
 mpython = "speedy_utils.scripts.mpython:main"
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/__init__.py
@@ -1,4 +1,6 @@
 from llm_utils.lm.openai_memoize import MOpenAI
+from llm_utils.vector_cache import VectorCache
+
 from .chat_format import (
     build_chatml_input,
     display_chat_messages_as_html,
@@ -24,5 +26,6 @@ __all__ = [
     "display_chat_messages_as_html",
     "AsyncLM",
     "AsyncLLMTask",
-    "MOpenAI"
+    "MOpenAI",
+    "VectorCache",
 ]
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/async_lm.py
@@ -5,6 +5,7 @@ from typing import (
     Literal,
     Optional,
     Type,
+    Union,
     cast,
 )

@@ -49,7 +50,7 @@ class AsyncLM(AsyncLMBase):
         temperature: float = 0.0,
         max_tokens: int = 2_000,
         host: str = "localhost",
-        port: Optional[int
+        port: Optional[Union[int, str]] = None,
         base_url: Optional[str] = None,
         api_key: Optional[str] = None,
         cache: bool = True,
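
Both `AsyncLM` and `AsyncLMBase` (next hunk) now accept `port` as an int or a string. A hedged sketch of what the widened annotation allows; constructor arguments outside the hunks shown here (for example a model identifier) are omitted and would need to be supplied in real use:

```python
# Hypothetical sketch: a port read from the environment is a str and can now be
# passed through without an int() conversion. Other required AsyncLM arguments
# not visible in this diff are omitted.
import os
from llm_utils import AsyncLM

lm = AsyncLM(host="localhost", port=os.environ.get("LLM_PORT", "8000"))
```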
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/llm_utils/lm/async_lm/async_lm_base.py
@@ -41,7 +41,7 @@ class AsyncLMBase:
         self,
         *,
         host: str = "localhost",
-        port: Optional[int
+        port: Optional[Union[int, str]] = None,
         base_url: Optional[str] = None,
         api_key: Optional[str] = None,
         cache: bool = True,
@@ -81,8 +81,8 @@ class AsyncLMBase:
     async def __call__( # type: ignore
         self,
         *,
-        prompt: str
-        messages: RawMsgs
+        prompt: Optional[str] = ...,
+        messages: Optional[RawMsgs] = ...,
         response_format: type[str] = str,
         return_openai_response: bool = ...,
         **kwargs: Any,
@@ -92,8 +92,8 @@ class AsyncLMBase:
     async def __call__(
         self,
         *,
-        prompt: str
-        messages: RawMsgs
+        prompt: Optional[str] = ...,
+        messages: Optional[RawMsgs] = ...,
         response_format: Type[TModel],
         return_openai_response: bool = ...,
         **kwargs: Any,
@@ -137,7 +137,7 @@ class AsyncLMBase:
     @staticmethod
     def _parse_output(
         raw_response: Any, response_format: Union[type[str], Type[BaseModel]]
-    ) -> str
+    ) -> Union[str, BaseModel]:
         if hasattr(raw_response, "model_dump"):
             raw_response = raw_response.model_dump()

speedy_utils-1.1.14/src/llm_utils/vector_cache/__init__.py
@@ -0,0 +1,25 @@
+"""
+Efficient embedding caching system using vLLM for offline embeddings.
+
+This package provides a fast, SQLite-backed caching layer for text embeddings,
+supporting both OpenAI API and local models via vLLM.
+
+Classes:
+    VectorCache: Main class for embedding computation and caching
+
+Example:
+    # Using local model
+    cache = VectorCache("Qwen/Qwen3-Embedding-0.6B")
+    embeddings = cache.embeds(["Hello world", "How are you?"])
+
+    # Using OpenAI API
+    cache = VectorCache("https://api.openai.com/v1")
+    embeddings = cache.embeds(["Hello world", "How are you?"])
+"""
+
+from .core import VectorCache
+from .utils import get_default_cache_path, validate_model_name, estimate_cache_size
+
+__version__ = "0.1.0"
+__author__ = "AnhVTH <anhvth.226@gmail.com>"
+__all__ = ["VectorCache", "get_default_cache_path", "validate_model_name", "estimate_cache_size"]
speedy_utils-1.1.14/src/llm_utils/vector_cache/cli.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""Command-line interface for embed_cache package."""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import List
+
+from llm_utils.vector_cache import VectorCache, estimate_cache_size, validate_model_name
+
+
+def main():
+    """Main CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Command-line interface for embed_cache package"
+    )
+
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Embed command
+    embed_parser = subparsers.add_parser("embed", help="Generate embeddings for texts")
+    embed_parser.add_argument("model", help="Model name or API URL")
+    embed_parser.add_argument("--texts", nargs="+", help="Texts to embed")
+    embed_parser.add_argument("--file", help="File containing texts (one per line)")
+    embed_parser.add_argument("--output", help="Output file for embeddings (JSON)")
+    embed_parser.add_argument(
+        "--cache-db", default="embed_cache.sqlite", help="Cache database path"
+    )
+    embed_parser.add_argument(
+        "--backend",
+        choices=["vllm", "transformers", "openai", "auto"],
+        default="auto",
+        help="Backend to use",
+    )
+    embed_parser.add_argument(
+        "--gpu-memory",
+        type=float,
+        default=0.5,
+        help="GPU memory utilization for vLLM (0.0-1.0)",
+    )
+    embed_parser.add_argument(
+        "--batch-size", type=int, default=32, help="Batch size for transformers"
+    )
+    embed_parser.add_argument(
+        "--verbose", action="store_true", help="Enable verbose output"
+    )
+
+    # Cache stats command
+    stats_parser = subparsers.add_parser("stats", help="Show cache statistics")
+    stats_parser.add_argument(
+        "--cache-db", default="embed_cache.sqlite", help="Cache database path"
+    )
+
+    # Clear cache command
+    clear_parser = subparsers.add_parser("clear", help="Clear cache")
+    clear_parser.add_argument(
+        "--cache-db", default="embed_cache.sqlite", help="Cache database path"
+    )
+    clear_parser.add_argument(
+        "--confirm", action="store_true", help="Skip confirmation prompt"
+    )
+
+    # Validate model command
+    validate_parser = subparsers.add_parser("validate", help="Validate model name")
+    validate_parser.add_argument("model", help="Model name to validate")
+
+    # Estimate command
+    estimate_parser = subparsers.add_parser("estimate", help="Estimate cache size")
+    estimate_parser.add_argument("num_texts", type=int, help="Number of texts")
+    estimate_parser.add_argument(
+        "--embed-dim", type=int, default=1024, help="Embedding dimension"
+    )
+
+    args = parser.parse_args()
+
+    if not args.command:
+        parser.print_help()
+        return
+
+    try:
+        if args.command == "embed":
+            handle_embed(args)
+        elif args.command == "stats":
+            handle_stats(args)
+        elif args.command == "clear":
+            handle_clear(args)
+        elif args.command == "validate":
+            handle_validate(args)
+        elif args.command == "estimate":
+            handle_estimate(args)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def handle_embed(args):
+    """Handle embed command."""
+    # Get texts
+    texts = []
+    if args.texts:
+        texts.extend(args.texts)
+
+    if args.file:
+        file_path = Path(args.file)
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {args.file}")
+
+        with open(file_path, "r", encoding="utf-8") as f:
+            texts.extend([line.strip() for line in f if line.strip()])
+
+    if not texts:
+        raise ValueError("No texts provided. Use --texts or --file")
+
+    print(f"Embedding {len(texts)} texts using model: {args.model}")
+
+    # Initialize cache and get embeddings
+    cache = VectorCache(args.model, db_path=args.cache_db)
+    embeddings = cache.embeds(texts)
+
+    print(f"Generated embeddings with shape: {embeddings.shape}")
+
+    # Output results
+    if args.output:
+        output_data = {
+            "texts": texts,
+            "embeddings": embeddings.tolist(),
+            "shape": list(embeddings.shape),
+            "model": args.model,
+        }
+
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(output_data, f, indent=2)
+
+        print(f"Results saved to: {args.output}")
+    else:
+        print(f"Embeddings shape: {embeddings.shape}")
+        print(f"Sample embedding (first 5 dims): {embeddings[0][:5].tolist()}")
+
+
+def handle_stats(args):
+    """Handle stats command."""
+    cache_path = Path(args.cache_db)
+    if not cache_path.exists():
+        print(f"Cache database not found: {args.cache_db}")
+        return
+
+    cache = VectorCache("dummy", db_path=args.cache_db)
+    stats = cache.get_cache_stats()
+
+    print("Cache Statistics:")
+    print(f" Database: {args.cache_db}")
+    print(f" Total cached embeddings: {stats['total_cached']}")
+    print(f" Database size: {cache_path.stat().st_size / (1024 * 1024):.2f} MB")
+
+
+def handle_clear(args):
+    """Handle clear command."""
+    cache_path = Path(args.cache_db)
+    if not cache_path.exists():
+        print(f"Cache database not found: {args.cache_db}")
+        return
+
+    if not args.confirm:
+        response = input(
+            f"Are you sure you want to clear cache at {args.cache_db}? [y/N]: "
+        )
+        if response.lower() != "y":
+            print("Cancelled.")
+            return
+
+    cache = VectorCache("dummy", db_path=args.cache_db)
+    stats_before = cache.get_cache_stats()
+    cache.clear_cache()
+
+    print(
+        f"Cleared {stats_before['total_cached']} cached embeddings from {args.cache_db}"
+    )
+
+
+def handle_validate(args):
+    """Handle validate command."""
+    is_valid = validate_model_name(args.model)
+    if is_valid:
+        print(f"✓ Valid model: {args.model}")
+    else:
+        print(f"✗ Invalid model: {args.model}")
+        sys.exit(1)
+
+
+def handle_estimate(args):
+    """Handle estimate command."""
+    size_estimate = estimate_cache_size(args.num_texts, args.embed_dim)
+    print(
+        f"Estimated cache size for {args.num_texts} texts ({args.embed_dim}D embeddings): {size_estimate}"
+    )
+
+
+if __name__ == "__main__":
+    main()
speedy_utils-1.1.14/src/llm_utils/vector_cache/core.py
@@ -0,0 +1,557 @@
+from __future__ import annotations
+
+import hashlib
+import os
+import sqlite3
+from pathlib import Path
+from time import time
+from typing import Any, Dict, Literal, Optional, cast
+
+import numpy as np
+
+
+class VectorCache:
+    """
+    A caching layer for text embeddings with support for multiple backends.
+
+    Examples:
+        # OpenAI API
+        from llm_utils import VectorCache
+        cache = VectorCache("https://api.openai.com/v1", api_key="your-key")
+        embeddings = cache.embeds(["Hello world", "How are you?"])
+
+        # Custom OpenAI-compatible server (auto-detects model)
+        cache = VectorCache("http://localhost:8000/v1", api_key="abc")
+
+        # Transformers (Sentence Transformers)
+        cache = VectorCache("sentence-transformers/all-MiniLM-L6-v2")
+
+        # vLLM (local model)
+        cache = VectorCache("/path/to/model")
+
+        # Explicit backend specification
+        cache = VectorCache("model-name", backend="transformers")
+
+        # Lazy loading (default: True) - load model only when needed
+        cache = VectorCache("model-name", lazy=True)
+
+        # Eager loading - load model immediately
+        cache = VectorCache("model-name", lazy=False)
+    """
+    def __init__(
+        self,
+        url_or_model: str,
+        backend: Optional[Literal["vllm", "transformers", "openai"]] = None,
+        embed_size: Optional[int] = None,
+        db_path: Optional[str] = None,
+        # OpenAI API parameters
+        api_key: Optional[str] = "abc",
+        model_name: Optional[str] = None,
+        # vLLM parameters
+        vllm_gpu_memory_utilization: float = 0.5,
+        vllm_tensor_parallel_size: int = 1,
+        vllm_dtype: str = "auto",
+        vllm_trust_remote_code: bool = False,
+        vllm_max_model_len: Optional[int] = None,
+        # Transformers parameters
+        transformers_device: str = "auto",
+        transformers_batch_size: int = 32,
+        transformers_normalize_embeddings: bool = True,
+        transformers_trust_remote_code: bool = False,
+        # SQLite parameters
+        sqlite_chunk_size: int = 999,
+        sqlite_cache_size: int = 10000,
+        sqlite_mmap_size: int = 268435456,
+        # Other parameters
+        verbose: bool = True,
+        lazy: bool = True,
+    ) -> None:
+        self.url_or_model = url_or_model
+        self.embed_size = embed_size
+        self.verbose = verbose
+        self.lazy = lazy
+
+        self.backend = self._determine_backend(backend)
+        if self.verbose and backend is None:
+            print(f"Auto-detected backend: {self.backend}")
+
+        # Store all configuration parameters
+        self.config = {
+            # OpenAI
+            "api_key": api_key or os.getenv("OPENAI_API_KEY"),
+            "model_name": self._try_infer_model_name(model_name),
+            # vLLM
+            "vllm_gpu_memory_utilization": vllm_gpu_memory_utilization,
+            "vllm_tensor_parallel_size": vllm_tensor_parallel_size,
+            "vllm_dtype": vllm_dtype,
+            "vllm_trust_remote_code": vllm_trust_remote_code,
+            "vllm_max_model_len": vllm_max_model_len,
+            # Transformers
+            "transformers_device": transformers_device,
+            "transformers_batch_size": transformers_batch_size,
+            "transformers_normalize_embeddings": transformers_normalize_embeddings,
+            "transformers_trust_remote_code": transformers_trust_remote_code,
+            # SQLite
+            "sqlite_chunk_size": sqlite_chunk_size,
+            "sqlite_cache_size": sqlite_cache_size,
+            "sqlite_mmap_size": sqlite_mmap_size,
+        }
+
+        # Auto-detect model_name for OpenAI if using custom URL and default model
+        if (self.backend == "openai" and
+            model_name == "text-embedding-3-small" and
+            self.url_or_model != "https://api.openai.com/v1"):
+            if self.verbose:
+                print(f"Attempting to auto-detect model from {self.url_or_model}...")
+            try:
+                import openai
+                client = openai.OpenAI(
+                    base_url=self.url_or_model,
+                    api_key=self.config["api_key"]
+                )
+                models = client.models.list()
+                if models.data:
+                    detected_model = models.data[0].id
+                    self.config["model_name"] = detected_model
+                    model_name = detected_model  # Update for db_path computation
+                    if self.verbose:
+                        print(f"Auto-detected model: {detected_model}")
+                else:
+                    if self.verbose:
+                        print("No models found, using default model")
+            except Exception as e:
+                if self.verbose:
+                    print(f"Model auto-detection failed: {e}, using default model")
+                # Fallback to default if auto-detection fails
+                pass
+
+        # Set default db_path if not provided
+        if db_path is None:
+            if self.backend == "openai":
+                model_id = self.config["model_name"] or "openai-default"
+            else:
+                model_id = self.url_or_model
+            safe_name = hashlib.sha1(model_id.encode("utf-8")).hexdigest()[:16]
+            self.db_path = Path.home() / ".cache" / "embed" / f"{self.backend}_{safe_name}.sqlite"
+        else:
+            self.db_path = Path(db_path)
+
+        # Ensure the directory exists
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+
+        self.conn = sqlite3.connect(self.db_path)
+        self._optimize_connection()
+        self._ensure_schema()
+        self._model = None  # Lazy loading
+        self._client = None  # For OpenAI client
+
+        # Load model/client if not lazy
+        if not self.lazy:
+            if self.backend == "openai":
+                self._load_openai_client()
+            elif self.backend in ["vllm", "transformers"]:
+                self._load_model()
+
+    def _determine_backend(self, backend: Optional[Literal["vllm", "transformers", "openai"]]) -> str:
+        """Determine the appropriate backend based on url_or_model and user preference."""
+        if backend is not None:
+            valid_backends = ["vllm", "transformers", "openai"]
+            if backend not in valid_backends:
+                raise ValueError(f"Invalid backend '{backend}'. Must be one of: {valid_backends}")
+            return backend
+
+        if self.url_or_model.startswith("http"):
+            return "openai"
+
+        # Default to vllm for local models
+        return "vllm"
+    def _try_infer_model_name(self, model_name: Optional[str]) -> Optional[str]:
+        """Infer model name for OpenAI backend if not explicitly provided."""
+        # if self.backend != "openai":
+        #     return model_name
+        if model_name:
+            return model_name
+        if 'https://' in self.url_or_model:
+            model_name = "text-embedding-3-small"
+        if 'http://localhost' in self.url_or_model:
+            from openai import OpenAI
+            client = OpenAI(base_url=self.url_or_model, api_key='abc')
+            model_name = client.models.list().data[0].id
+
+        # Default model name
+        print('Infer model name:', model_name)
+        return model_name
+    def _optimize_connection(self) -> None:
+        """Optimize SQLite connection for bulk operations."""
+        # Performance optimizations for bulk operations
+        self.conn.execute(
+            "PRAGMA journal_mode=WAL"
+        )  # Write-Ahead Logging for better concurrency
+        self.conn.execute("PRAGMA synchronous=NORMAL")  # Faster writes, still safe
+        self.conn.execute(f"PRAGMA cache_size={self.config['sqlite_cache_size']}")  # Configurable cache
+        self.conn.execute("PRAGMA temp_store=MEMORY")  # Use memory for temp storage
+        self.conn.execute(f"PRAGMA mmap_size={self.config['sqlite_mmap_size']}")  # Configurable memory mapping
+
+    def _ensure_schema(self) -> None:
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS cache (
+                hash TEXT PRIMARY KEY,
+                text TEXT,
+                embedding BLOB
+            )
+        """)
+        # Add index for faster lookups if it doesn't exist
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_cache_hash ON cache(hash)
+        """)
+        self.conn.commit()
+
+    def _load_openai_client(self) -> None:
+        """Load OpenAI client."""
+        import openai
+        self._client = openai.OpenAI(
+            base_url=self.url_or_model,
+            api_key=self.config["api_key"]
+        )
+
+    def _load_model(self) -> None:
+        """Load the model for vLLM or Transformers."""
+        if self.backend == "vllm":
+            from vllm import LLM
+
+            gpu_memory_utilization = cast(float, self.config["vllm_gpu_memory_utilization"])
+            tensor_parallel_size = cast(int, self.config["vllm_tensor_parallel_size"])
+            dtype = cast(str, self.config["vllm_dtype"])
+            trust_remote_code = cast(bool, self.config["vllm_trust_remote_code"])
+            max_model_len = cast(Optional[int], self.config["vllm_max_model_len"])
+
+            vllm_kwargs = {
+                "model": self.url_or_model,
+                "task": "embed",
+                "gpu_memory_utilization": gpu_memory_utilization,
+                "tensor_parallel_size": tensor_parallel_size,
+                "dtype": dtype,
+                "trust_remote_code": trust_remote_code,
+            }
+
+            if max_model_len is not None:
+                vllm_kwargs["max_model_len"] = max_model_len
+
+            try:
+                self._model = LLM(**vllm_kwargs)
+            except (ValueError, AssertionError, RuntimeError) as e:
+                error_msg = str(e).lower()
+                if ("kv cache" in error_msg and "gpu_memory_utilization" in error_msg) or \
+                   ("memory" in error_msg and ("gpu" in error_msg or "insufficient" in error_msg)) or \
+                   ("free memory" in error_msg and "initial" in error_msg) or \
+                   ("engine core initialization failed" in error_msg):
+                    raise ValueError(
+                        f"Insufficient GPU memory for vLLM model initialization. "
+                        f"Current vllm_gpu_memory_utilization ({gpu_memory_utilization}) may be too low. "
+                        f"Try one of the following:\n"
+                        f"1. Increase vllm_gpu_memory_utilization (e.g., 0.5, 0.8, or 0.9)\n"
+                        f"2. Decrease vllm_max_model_len (e.g., 4096, 8192)\n"
+                        f"3. Use a smaller model\n"
+                        f"4. Ensure no other processes are using GPU memory during initialization\n"
+                        f"Original error: {e}"
+                    ) from e
+                else:
+                    raise
+        elif self.backend == "transformers":
+            from transformers import AutoTokenizer, AutoModel
+            import torch
+
+            device = self.config["transformers_device"]
+            # Handle "auto" device selection - default to CPU for transformers to avoid memory conflicts
+            if device == "auto":
+                device = "cpu"  # Default to CPU to avoid GPU memory conflicts with vLLM
+
+            tokenizer = AutoTokenizer.from_pretrained(self.url_or_model, padding_side='left', trust_remote_code=self.config["transformers_trust_remote_code"])
+            model = AutoModel.from_pretrained(self.url_or_model, trust_remote_code=self.config["transformers_trust_remote_code"])
+
+            # Move model to device
+            model.to(device)
+            model.eval()
+
+            self._model = {"tokenizer": tokenizer, "model": model, "device": device}
+
+    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get embeddings using the configured backend."""
+        if self.backend == "openai":
+            return self._get_openai_embeddings(texts)
+        elif self.backend == "vllm":
+            return self._get_vllm_embeddings(texts)
+        elif self.backend == "transformers":
+            return self._get_transformers_embeddings(texts)
+        else:
+            raise ValueError(f"Unsupported backend: {self.backend}")
+
+    def _get_openai_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get embeddings using OpenAI API."""
+        # Assert valid model_name for OpenAI backend
+        model_name = self.config["model_name"]
+        assert model_name is not None and model_name.strip(), f"Invalid model_name for OpenAI backend: {model_name}. Model name must be provided and non-empty."
+
+        if self._client is None:
+            self._load_openai_client()
+
+        response = self._client.embeddings.create( # type: ignore
+            model=model_name,
+            input=texts
+        )
+        embeddings = [item.embedding for item in response.data]
+        return embeddings
+
+    def _get_vllm_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get embeddings using vLLM."""
+        if self._model is None:
+            self._load_model()
+
+        outputs = self._model.embed(texts) # type: ignore
+        embeddings = [o.outputs.embedding for o in outputs]
+        return embeddings
+
+    def _get_transformers_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get embeddings using transformers directly."""
+        if self._model is None:
+            self._load_model()
+
+        if not isinstance(self._model, dict):
+            raise ValueError("Model not loaded properly for transformers backend")
+
+        tokenizer = self._model["tokenizer"]
+        model = self._model["model"]
+        device = self._model["device"]
+
+        normalize_embeddings = cast(bool, self.config["transformers_normalize_embeddings"])
+
+        # For now, use a default max_length
+        max_length = 8192
+
+        # Tokenize
+        batch_dict = tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            max_length=max_length,
+            return_tensors="pt",
+        )
+
+        # Move to device
+        batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
+
+        # Run model
+        import torch
+        with torch.no_grad():
+            outputs = model(**batch_dict)
+
+        # Apply last token pooling
+        embeddings = self._last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+
+        # Normalize if needed
+        if normalize_embeddings:
+            import torch.nn.functional as F
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+
+        return embeddings.cpu().numpy().tolist()
+
+    def _last_token_pool(self, last_hidden_states, attention_mask):
+        """Apply last token pooling to get embeddings."""
+        import torch
+        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+        if left_padding:
+            return last_hidden_states[:, -1]
+        else:
+            sequence_lengths = attention_mask.sum(dim=1) - 1
+            batch_size = last_hidden_states.shape[0]
+            return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+
+    def _hash_text(self, text: str) -> str:
+        return hashlib.sha1(text.encode("utf-8")).hexdigest()
+
+    def embeds(self, texts: list[str], cache: bool = True) -> np.ndarray:
+        """
+        Return embeddings for all texts.
+
+        If cache=True, compute and cache missing embeddings.
+        If cache=False, force recompute all embeddings and update cache.
+
+        This method processes lookups and embedding generation in chunks to
+        handle very large input lists. A tqdm progress bar is shown while
+        computing missing embeddings.
+        """
+        if not texts:
+            return np.empty((0, 0), dtype=np.float32)
+        t = time()
+        hashes = [self._hash_text(t) for t in texts]
+
+        # Helper to yield chunks
+        def _chunks(lst: list[str], n: int) -> list[list[str]]:
+            return [lst[i : i + n] for i in range(0, len(lst), n)]
+
+        # Fetch known embeddings in bulk with optimized chunk size
+        hit_map: dict[str, np.ndarray] = {}
+        chunk_size = self.config["sqlite_chunk_size"]
+
+        # Use bulk lookup with optimized query
+        hash_chunks = _chunks(hashes, chunk_size)
+        for chunk in hash_chunks:
+            placeholders = ",".join("?" * len(chunk))
+            rows = self.conn.execute(
+                f"SELECT hash, embedding FROM cache WHERE hash IN ({placeholders})",
+                chunk,
+            ).fetchall()
+            for h, e in rows:
+                hit_map[h] = np.frombuffer(e, dtype=np.float32)
+
+        # Determine which texts are missing
+        if cache:
+            missing_items: list[tuple[str, str]] = [
+                (t, h) for t, h in zip(texts, hashes) if h not in hit_map
+            ]
+        else:
+            missing_items: list[tuple[str, str]] = [
+                (t, h) for t, h in zip(texts, hashes)
+            ]
+
+        if missing_items:
+            if self.verbose:
+                print(f"Computing embeddings for {len(missing_items)} missing texts...")
+            missing_texts = [t for t, _ in missing_items]
+            embeds = self._get_embeddings(missing_texts)
+
+            # Prepare batch data for bulk insert
+            bulk_insert_data: list[tuple[str, str, bytes]] = []
+            for (text, h), vec in zip(missing_items, embeds):
+                arr = np.asarray(vec, dtype=np.float32)
+                bulk_insert_data.append((h, text, arr.tobytes()))
+                hit_map[h] = arr
+
+            self._bulk_insert(bulk_insert_data)
+
+        # Return embeddings in the original order
+        elapsed = time() - t
+        if self.verbose:
+            print(f"Retrieved {len(texts)} embeddings in {elapsed:.2f} seconds")
+        return np.vstack([hit_map[h] for h in hashes])
+
+    def __call__(self, texts: list[str], cache: bool = True) -> np.ndarray:
+        return self.embeds(texts, cache)
+
+    def _bulk_insert(self, data: list[tuple[str, str, bytes]]) -> None:
+        """Perform bulk insert of embedding data."""
+        if not data:
+            return
+
+        self.conn.executemany(
+            "INSERT OR REPLACE INTO cache (hash, text, embedding) VALUES (?, ?, ?)",
+            data,
+        )
+        self.conn.commit()
+
+    def precompute_embeddings(self, texts: list[str]) -> None:
+        """
+        Precompute embeddings for a large list of texts efficiently.
+        This is optimized for bulk operations when you know all texts upfront.
+        """
+        if not texts:
+            return
+
+        # Remove duplicates while preserving order
+        unique_texts = list(dict.fromkeys(texts))
+        if self.verbose:
+            print(f"Precomputing embeddings for {len(unique_texts)} unique texts...")
+
+        # Check which ones are already cached
+        hashes = [self._hash_text(t) for t in unique_texts]
+        existing_hashes = set()
+
+        # Bulk check for existing embeddings
+        chunk_size = self.config["sqlite_chunk_size"]
+        for i in range(0, len(hashes), chunk_size):
+            chunk = hashes[i : i + chunk_size]
+            placeholders = ",".join("?" * len(chunk))
+            rows = self.conn.execute(
+                f"SELECT hash FROM cache WHERE hash IN ({placeholders})",
+                chunk,
+            ).fetchall()
+            existing_hashes.update(h[0] for h in rows)
+
+        # Find missing texts
+        missing_items = [
+            (t, h) for t, h in zip(unique_texts, hashes) if h not in existing_hashes
+        ]
+
+        if not missing_items:
+            if self.verbose:
+                print("All texts already cached!")
+            return
+
+        if self.verbose:
+            print(f"Computing {len(missing_items)} missing embeddings...")
+        missing_texts = [t for t, _ in missing_items]
+        embeds = self._get_embeddings(missing_texts)
+
+        # Prepare batch data for bulk insert
+        bulk_insert_data: list[tuple[str, str, bytes]] = []
+        for (text, h), vec in zip(missing_items, embeds):
+            arr = np.asarray(vec, dtype=np.float32)
+            bulk_insert_data.append((h, text, arr.tobytes()))
+
+        self._bulk_insert(bulk_insert_data)
+        if self.verbose:
+            print(f"Successfully cached {len(missing_items)} new embeddings!")
+
+    def get_cache_stats(self) -> dict[str, int]:
+        """Get statistics about the cache."""
+        cursor = self.conn.execute("SELECT COUNT(*) FROM cache")
+        count = cursor.fetchone()[0]
+        return {"total_cached": count}
+
+    def clear_cache(self) -> None:
+        """Clear all cached embeddings."""
+        self.conn.execute("DELETE FROM cache")
+        self.conn.commit()
+
+    def get_config(self) -> Dict[str, Any]:
+        """Get current configuration."""
+        return {
+            "url_or_model": self.url_or_model,
+            "backend": self.backend,
+            "embed_size": self.embed_size,
+            "db_path": str(self.db_path),
+            "verbose": self.verbose,
+            "lazy": self.lazy,
+            **self.config
+        }
+
+    def update_config(self, **kwargs) -> None:
+        """Update configuration parameters."""
+        for key, value in kwargs.items():
+            if key in self.config:
+                self.config[key] = value
+            elif key == "verbose":
+                self.verbose = value
+            elif key == "lazy":
+                self.lazy = value
+            else:
+                raise ValueError(f"Unknown configuration parameter: {key}")
+
+        # Reset model if backend-specific parameters changed
+        backend_params = {
+            "vllm": ["vllm_gpu_memory_utilization", "vllm_tensor_parallel_size", "vllm_dtype",
+                     "vllm_trust_remote_code", "vllm_max_model_len"],
+            "transformers": ["transformers_device", "transformers_batch_size",
+                             "transformers_normalize_embeddings", "transformers_trust_remote_code"],
+            "openai": ["api_key", "model_name"]
+        }
+
+        if any(param in kwargs for param in backend_params.get(self.backend, [])):
+            self._model = None  # Force reload on next use
+            if self.backend == "openai":
+                self._client = None
+
+    def __del__(self) -> None:
+        """Clean up database connection."""
+        if hasattr(self, "conn"):
+            self.conn.close()
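
`embeds()` keys the cache on a SHA-1 of each text, so only inputs missing from the SQLite database are recomputed, and `precompute_embeddings()` deduplicates its input before bulk-inserting. A usage sketch under the assumption that the transformers backend and its dependencies are installed; the model, database path, and texts are placeholders:

```python
from llm_utils import VectorCache

cache = VectorCache(
    "sentence-transformers/all-MiniLM-L6-v2",
    backend="transformers",
    db_path="embed_cache.sqlite",
)
corpus = ["first document", "second document", "first document"]  # one duplicate
cache.precompute_embeddings(corpus)   # computes and stores only the missing unique texts
print(cache.get_cache_stats())        # {'total_cached': 2} if the database started empty
vectors = cache.embeds(corpus)        # served from the cache, shape (3, embed_dim)
```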
speedy_utils-1.1.14/src/llm_utils/vector_cache/types.py
@@ -0,0 +1,15 @@
+"""Type definitions for the embed_cache package."""
+
+from typing import List, Dict, Any, Union, Optional, Tuple
+import numpy as np
+from numpy.typing import NDArray
+
+# Type aliases
+TextList = List[str]
+EmbeddingArray = NDArray[np.float32]
+EmbeddingList = List[List[float]]
+CacheStats = Dict[str, int]
+ModelIdentifier = str  # Either URL or model name/path
+
+# For backwards compatibility
+Embeddings = Union[EmbeddingArray, EmbeddingList]
speedy_utils-1.1.14/src/llm_utils/vector_cache/utils.py
@@ -0,0 +1,42 @@
+"""Utility functions for the embed_cache package."""
+
+import os
+from typing import Optional
+
+def get_default_cache_path() -> str:
+    """Get the default cache path based on environment."""
+    cache_dir = os.getenv("EMBED_CACHE_DIR", ".")
+    return os.path.join(cache_dir, "embed_cache.sqlite")
+
+def validate_model_name(model_name: str) -> bool:
+    """Validate if a model name is supported."""
+    # Check if it's a URL
+    if model_name.startswith("http"):
+        return True
+
+    # Check if it's a valid model path/name
+    supported_prefixes = [
+        "Qwen/",
+        "sentence-transformers/",
+        "BAAI/",
+        "intfloat/",
+        "microsoft/",
+        "nvidia/",
+    ]
+
+    return any(model_name.startswith(prefix) for prefix in supported_prefixes) or os.path.exists(model_name)
+
+def estimate_cache_size(num_texts: int, embedding_dim: int = 1024) -> str:
+    """Estimate cache size for given number of texts."""
+    # Rough estimate: hash (40 bytes) + text (avg 100 bytes) + embedding (embedding_dim * 4 bytes)
+    bytes_per_entry = 40 + 100 + (embedding_dim * 4)
+    total_bytes = num_texts * bytes_per_entry
+
+    if total_bytes < 1024:
+        return f"{total_bytes} bytes"
+    elif total_bytes < 1024 * 1024:
+        return f"{total_bytes / 1024:.1f} KB"
+    elif total_bytes < 1024 * 1024 * 1024:
+        return f"{total_bytes / (1024 * 1024):.1f} MB"
+    else:
+        return f"{total_bytes / (1024 * 1024 * 1024):.1f} GB"
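
`estimate_cache_size()` budgets roughly 140 bytes of row overhead plus 4 bytes per embedding dimension, so 100,000 texts at the default 1,024 dimensions come to a little over 400 MB. A quick check of that arithmetic:

```python
from llm_utils.vector_cache import estimate_cache_size

# 100_000 * (40 + 100 + 1024 * 4) bytes = 423,600,000 bytes -> about "404.0 MB"
print(estimate_cache_size(100_000, embedding_dim=1024))
```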
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/__init__.py
@@ -18,7 +18,7 @@
 # • memoize(func) -> Callable - Function result caching decorator
 # • identify(obj: Any) -> str - Generate unique object identifier
 # • identify_uuid(obj: Any) -> str - Generate UUID-based object identifier
-# • load_by_ext(fname: str
+# • load_by_ext(fname: Union[str, list[str]]) -> Any - Auto-detect file format loader
 # • dump_json_or_pickle(obj: Any, fname: str) -> None - Smart file serializer
 # • load_json_or_pickle(fname: str) -> Any - Smart file deserializer
 # • multi_thread(func, items, **kwargs) -> list - Parallel thread execution
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/logger.py
@@ -5,7 +5,7 @@ import re
 import sys
 import time
 from collections import OrderedDict
-from typing import Annotated, Literal
+from typing import Annotated, Literal, Union

 from loguru import logger

@@ -166,7 +166,7 @@ def log(
     *,
     level: Literal["info", "warning", "error", "critical", "success"] = "info",
     once: bool = False,
-    interval: float
+    interval: Union[float, None] = None,
 ) -> None:
     """
     Log a message using loguru with optional `once` and `interval` control.
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/utils_io.py
@@ -7,7 +7,7 @@ import pickle
 import time
 from glob import glob
 from pathlib import Path
-from typing import Any
+from typing import Any, Union

 from json_repair import loads as jloads
 from pydantic import BaseModel
@@ -92,7 +92,7 @@ def load_jsonl(path):
     return [json.loads(line) for line in lines]


-def load_by_ext(fname: str
+def load_by_ext(fname: Union[str, list[str]], do_memoize: bool = False) -> Any:
     """
     Load data based on file extension.
     """
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/common/utils_print.py
@@ -3,7 +3,7 @@
 import copy
 import pprint
 import textwrap
-from typing import Any
+from typing import Any, Union

 from tabulate import tabulate

@@ -24,17 +24,17 @@ def flatten_dict(d, parent_key="", sep="."):

 def fprint(
     input_data: Any,
-    key_ignore: list[str]
-    key_keep: list[str]
+    key_ignore: Union[list[str], None] = None,
+    key_keep: Union[list[str], None] = None,
     max_width: int = 100,
     indent: int = 2,
-    depth: int
+    depth: Union[int, None] = None,
     table_format: str = "grid",
     str_wrap_width: int = 80,
     grep=None,
     is_notebook=None,
     f=print,
-) -> None
+) -> Union[None, str]:
     """
     Pretty print structured data.
     """
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/multi_worker/process.py
@@ -4,7 +4,7 @@ import traceback
 from collections.abc import Callable, Iterable, Iterator
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from itertools import islice
-from typing import Any, TypeVar, cast
+from typing import Any, TypeVar, Union, cast

 T = TypeVar("T")

@@ -65,12 +65,12 @@ def multi_process(
     func: Callable[[Any], Any],
     inputs: Iterable[Any],
     *,
-    workers: int
+    workers: Union[int, None] = None,
     batch: int = 1,
     ordered: bool = True,
     progress: bool = False,
-    inflight: int
-    timeout: float
+    inflight: Union[int, None] = None,
+    timeout: Union[float, None] = None,
     stop_on_error: bool = True,
     process_update_interval=10,
     for_loop: bool = False,
{speedy_utils-1.1.12 → speedy_utils-1.1.14}/src/speedy_utils/multi_worker/thread.py
@@ -83,7 +83,7 @@ import traceback
 from collections.abc import Callable, Iterable
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from itertools import islice
-from typing import Any, TypeVar
+from typing import Any, TypeVar, Union

 from loguru import logger

@@ -125,16 +125,16 @@ def multi_thread(
     func: Callable,
     inputs: Iterable[Any],
     *,
-    workers: int
+    workers: Union[int, None] = DEFAULT_WORKERS,
     batch: int = 1,
     ordered: bool = True,
     progress: bool = True,
     progress_update: int = 10,
     prefetch_factor: int = 4,
-    timeout: float
+    timeout: Union[float, None] = None,
     stop_on_error: bool = True,
     n_proc=0,
-    store_output_pkl_file: str
+    store_output_pkl_file: Union[str, None] = None,
     **fixed_kwargs,
 ) -> list[Any]:
     """