speedy-utils 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +3 -2
- llm_utils/lm/async_lm/async_llm_task.py +1 -0
- llm_utils/lm/llm_task.py +303 -10
- llm_utils/lm/openai_memoize.py +10 -2
- llm_utils/vector_cache/core.py +250 -234
- speedy_utils/__init__.py +2 -1
- speedy_utils/common/utils_cache.py +38 -19
- speedy_utils/common/utils_io.py +9 -5
- speedy_utils/multi_worker/process.py +91 -10
- speedy_utils/multi_worker/thread.py +94 -2
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/METADATA +34 -13
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/RECORD +19 -19
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/WHEEL +1 -1
- speedy_utils-1.1.20.dist-info/entry_points.txt +5 -0
- speedy_utils-1.1.18.dist-info/entry_points.txt +0 -6
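
The per-file summary above is the registry's own added/removed line count report. As a rough illustration only, a comparable summary can be produced locally from the two wheels using nothing but Python's standard library; the wheel file paths below are assumptions and not part of the registry output.

```python
# Hypothetical local reproduction of the per-file +N -M summary above.
# Wheels are plain zip archives, so zipfile + difflib from the stdlib suffice.
import difflib
import zipfile

OLD_WHEEL = "speedy_utils-1.1.18-py3-none-any.whl"  # assumed local path
NEW_WHEEL = "speedy_utils-1.1.20-py3-none-any.whl"  # assumed local path


def wheel_texts(path: str) -> dict[str, list[str]]:
    """Read every text member of a wheel as a list of lines."""
    out: dict[str, list[str]] = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            try:
                out[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                continue  # skip binary members
    return out


old, new = wheel_texts(OLD_WHEEL), wheel_texts(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    diff = list(difflib.unified_diff(old.get(name, []), new.get(name, []),
                                     fromfile=name, tofile=name))
    added = sum(1 for ln in diff if ln.startswith("+") and not ln.startswith("+++"))
    removed = sum(1 for ln in diff if ln.startswith("-") and not ln.startswith("---"))
    if added or removed:
        print(f"{name} +{added} -{removed}")
```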
llm_utils/vector_cache/core.py
CHANGED
@@ -13,50 +13,51 @@ import numpy as np
class VectorCache:
    """
    A caching layer for text embeddings with support for multiple backends.
-
+
    This cache is designed to be safe for multi-process environments where multiple
    processes may access the same cache file simultaneously. It uses SQLite WAL mode
    and retry logic with exponential backoff to handle concurrent access.
-
+
    Examples:
        # OpenAI API
        from llm_utils import VectorCache
        cache = VectorCache("https://api.openai.com/v1", api_key="your-key")
        embeddings = cache.embeds(["Hello world", "How are you?"])
-
+
        # Custom OpenAI-compatible server (auto-detects model)
        cache = VectorCache("http://localhost:8000/v1", api_key="abc")
-
+
        # Transformers (Sentence Transformers)
        cache = VectorCache("sentence-transformers/all-MiniLM-L6-v2")
-
+
        # vLLM (local model)
        cache = VectorCache("/path/to/model")
-
+
        # Explicit backend specification
        cache = VectorCache("model-name", backend="transformers")
-
+
        # Eager loading (default: False) - load model immediately for better performance
        cache = VectorCache("model-name", lazy=False)
-
+
        # Lazy loading - load model only when needed (may cause performance issues)
        cache = VectorCache("model-name", lazy=True)
-
+
    Multi-Process Safety:
    The cache uses SQLite WAL (Write-Ahead Logging) mode and implements retry logic
    with exponential backoff to handle database locks. Multiple processes can safely
    read and write to the same cache file simultaneously.
-
+
    Race Condition Protection:
    - Uses INSERT OR IGNORE to prevent overwrites when multiple processes compute the same text
    - The first process to successfully cache a text wins, subsequent attempts are ignored
    - This ensures deterministic results even with non-deterministic embedding models
-
+
    For best performance in multi-process scenarios, consider:
    - Using separate cache files per process if cache hits are low
    - Coordinating cache warm-up to avoid redundant computation
    - Monitor for excessive lock contention in high-concurrency scenarios
    """
+
    def __init__(
        self,
        url_or_model: str,
@@ -80,7 +81,7 @@ class VectorCache:
        # SQLite parameters
        sqlite_chunk_size: int = 999,
        sqlite_cache_size: int = 10000,
-        sqlite_mmap_size: int = 268435456,
+        sqlite_mmap_size: int = 268435456,  # 256MB
        # Processing parameters
        embedding_batch_size: int = 20_000,
        # Other parameters
@@ -91,11 +92,11 @@
        self.embed_size = embed_size
        self.verbose = verbose
        self.lazy = lazy
-
+
        self.backend = self._determine_backend(backend)
        if self.verbose and backend is None:
            print(f"Auto-detected backend: {self.backend}")
-
+
        # Store all configuration parameters
        self.config = {
            # OpenAI
@@ -119,18 +120,20 @@
            # Processing
            "embedding_batch_size": embedding_batch_size,
        }
-
+
        # Auto-detect model_name for OpenAI if using custom URL and default model
-        if (
-
-
+        if (
+            self.backend == "openai"
+            and model_name == "text-embedding-3-small"
+            and self.url_or_model != "https://api.openai.com/v1"
+        ):
            if self.verbose:
                print(f"Attempting to auto-detect model from {self.url_or_model}...")
            try:
                import openai
+
                client = openai.OpenAI(
-                    base_url=self.url_or_model,
-                    api_key=self.config["api_key"]
+                    base_url=self.url_or_model, api_key=self.config["api_key"]
                )
                models = client.models.list()
                if models.data:
@@ -147,7 +150,7 @@
                    print(f"Model auto-detection failed: {e}, using default model")
                # Fallback to default if auto-detection fails
                pass
-
+
        # Set default db_path if not provided
        if db_path is None:
            if self.backend == "openai":
@@ -155,19 +158,21 @@
            else:
                model_id = self.url_or_model
            safe_name = hashlib.sha1(model_id.encode("utf-8")).hexdigest()[:16]
-            self.db_path =
+            self.db_path = (
+                Path.home() / ".cache" / "embed" / f"{self.backend}_{safe_name}.sqlite"
+            )
        else:
            self.db_path = Path(db_path)
-
+
        # Ensure the directory exists
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
-
+
        self.conn = sqlite3.connect(self.db_path)
        self._optimize_connection()
        self._ensure_schema()
        self._model = None  # Lazy loading
        self._client = None  # For OpenAI client
-
+
        # Load model/client if not lazy
        if not self.lazy:
            if self.verbose:
@@ -179,34 +184,41 @@
            if self.verbose:
                print(f"✓ {self.backend.upper()} model/client loaded successfully")

-    def _determine_backend(
+    def _determine_backend(
+        self, backend: Optional[Literal["vllm", "transformers", "openai"]]
+    ) -> str:
        """Determine the appropriate backend based on url_or_model and user preference."""
        if backend is not None:
            valid_backends = ["vllm", "transformers", "openai"]
            if backend not in valid_backends:
-                raise ValueError(
+                raise ValueError(
+                    f"Invalid backend '{backend}'. Must be one of: {valid_backends}"
+                )
            return backend
-
+
        if self.url_or_model.startswith("http"):
            return "openai"
-
+
        # Default to vllm for local models
        return "vllm"
+
    def _try_infer_model_name(self, model_name: Optional[str]) -> Optional[str]:
        """Infer model name for OpenAI backend if not explicitly provided."""
        if model_name:
            return model_name
-        if
-            model_name =
-
-        if
+        if "https://" in self.url_or_model:
+            model_name = "text-embedding-3-small"
+
+        if "http://localhost" in self.url_or_model:
            from openai import OpenAI
-
-
+
+            client = OpenAI(base_url=self.url_or_model, api_key="abc")
+            model_name = client.models.list().data[0].id

        # Default model name
-        print(
+        print("Infer model name:", model_name)
        return model_name
+
    def _optimize_connection(self) -> None:
        """Optimize SQLite connection for bulk operations and multi-process safety."""
        # Performance optimizations for bulk operations
@@ -214,13 +226,21 @@
            "PRAGMA journal_mode=WAL"
        )  # Write-Ahead Logging for better concurrency
        self.conn.execute("PRAGMA synchronous=NORMAL")  # Faster writes, still safe
-        self.conn.execute(
+        self.conn.execute(
+            f"PRAGMA cache_size={self.config['sqlite_cache_size']}"
+        )  # Configurable cache
        self.conn.execute("PRAGMA temp_store=MEMORY")  # Use memory for temp storage
-        self.conn.execute(
-
+        self.conn.execute(
+            f"PRAGMA mmap_size={self.config['sqlite_mmap_size']}"
+        )  # Configurable memory mapping
+
        # Multi-process safety improvements
-        self.conn.execute(
-
+        self.conn.execute(
+            "PRAGMA busy_timeout=30000"
+        )  # Wait up to 30 seconds for locks
+        self.conn.execute(
+            "PRAGMA wal_autocheckpoint=1000"
+        )  # Checkpoint WAL every 1000 pages

    def _ensure_schema(self) -> None:
        self.conn.execute("""
@@ -239,22 +259,24 @@
    def _load_openai_client(self) -> None:
        """Load OpenAI client."""
        import openai
+
        self._client = openai.OpenAI(
-            base_url=self.url_or_model,
-            api_key=self.config["api_key"]
+            base_url=self.url_or_model, api_key=self.config["api_key"]
        )

    def _load_model(self) -> None:
        """Load the model for vLLM or Transformers."""
        if self.backend == "vllm":
            from vllm import LLM  # type: ignore[import-not-found]
-
-            gpu_memory_utilization = cast(
+
+            gpu_memory_utilization = cast(
+                float, self.config["vllm_gpu_memory_utilization"]
+            )
            tensor_parallel_size = cast(int, self.config["vllm_tensor_parallel_size"])
            dtype = cast(str, self.config["vllm_dtype"])
            trust_remote_code = cast(bool, self.config["vllm_trust_remote_code"])
            max_model_len = cast(Optional[int], self.config["vllm_max_model_len"])
-
+
            vllm_kwargs = {
                "model": self.url_or_model,
                "task": "embed",
@@ -263,18 +285,23 @@
                "dtype": dtype,
                "trust_remote_code": trust_remote_code,
            }
-
+
            if max_model_len is not None:
                vllm_kwargs["max_model_len"] = max_model_len
-
+
            try:
                self._model = LLM(**vllm_kwargs)
            except (ValueError, AssertionError, RuntimeError) as e:
                error_msg = str(e).lower()
-                if (
-
-
-
+                if (
+                    ("kv cache" in error_msg and "gpu_memory_utilization" in error_msg)
+                    or (
+                        "memory" in error_msg
+                        and ("gpu" in error_msg or "insufficient" in error_msg)
+                    )
+                    or ("free memory" in error_msg and "initial" in error_msg)
+                    or ("engine core initialization failed" in error_msg)
+                ):
                    raise ValueError(
                        f"Insufficient GPU memory for vLLM model initialization. "
                        f"Current vllm_gpu_memory_utilization ({gpu_memory_utilization}) may be too low. "
@@ -288,27 +315,39 @@
                else:
                    raise
        elif self.backend == "transformers":
-
-            import
-
+            import torch  # type: ignore[import-not-found]  # noqa: F401
+            from transformers import (  # type: ignore[import-not-found]
+                AutoModel,
+                AutoTokenizer,
+            )
+
            device = self.config["transformers_device"]
            # Handle "auto" device selection - default to CPU for transformers to avoid memory conflicts
            if device == "auto":
                device = "cpu"  # Default to CPU to avoid GPU memory conflicts with vLLM
-
-            tokenizer = AutoTokenizer.from_pretrained(
-
-
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                self.url_or_model,
+                padding_side="left",
+                trust_remote_code=self.config["transformers_trust_remote_code"],
+            )
+            model = AutoModel.from_pretrained(
+                self.url_or_model,
+                trust_remote_code=self.config["transformers_trust_remote_code"],
+            )
+
            # Move model to device
            model.to(device)
            model.eval()
-
+
            self._model = {"tokenizer": tokenizer, "model": model, "device": device}

    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Get embeddings using the configured backend."""
        assert isinstance(texts, list), "texts must be a list"
-        assert all(isinstance(t, str) for t in texts),
+        assert all(isinstance(t, str) for t in texts), (
+            "all elements in texts must be strings"
+        )
        if self.backend == "openai":
            return self._get_openai_embeddings(texts)
        elif self.backend == "vllm":
@@ -321,10 +360,14 @@
    def _get_openai_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Get embeddings using OpenAI API."""
        assert isinstance(texts, list), "texts must be a list"
-        assert all(isinstance(t, str) for t in texts),
+        assert all(isinstance(t, str) for t in texts), (
+            "all elements in texts must be strings"
+        )
        # Assert valid model_name for OpenAI backend
        model_name = self.config["model_name"]
-        assert model_name is not None and model_name.strip(),
+        assert model_name is not None and model_name.strip(), (
+            f"Invalid model_name for OpenAI backend: {model_name}. Model name must be provided and non-empty."
+        )

        if self._client is None:
            if self.verbose:
@@ -332,10 +375,9 @@
            self._load_openai_client()
            if self.verbose:
                print("✓ OpenAI client loaded successfully")
-
+
        response = self._client.embeddings.create(  # type: ignore
-            model=model_name,
-            input=texts
+            model=model_name, input=texts
        )
        embeddings = [item.embedding for item in response.data]
        return embeddings
@@ -343,14 +385,16 @@
    def _get_vllm_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Get embeddings using vLLM."""
        assert isinstance(texts, list), "texts must be a list"
-        assert all(isinstance(t, str) for t in texts),
+        assert all(isinstance(t, str) for t in texts), (
+            "all elements in texts must be strings"
+        )
        if self._model is None:
            if self.verbose:
                print("🔧 Loading vLLM model...")
            self._load_model()
            if self.verbose:
                print("✓ vLLM model loaded successfully")
-
+
        outputs = self._model.embed(texts)  # type: ignore
        embeddings = [o.outputs.embedding for o in outputs]
        return embeddings
@@ -358,26 +402,30 @@
    def _get_transformers_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Get embeddings using transformers directly."""
        assert isinstance(texts, list), "texts must be a list"
-        assert all(isinstance(t, str) for t in texts),
+        assert all(isinstance(t, str) for t in texts), (
+            "all elements in texts must be strings"
+        )
        if self._model is None:
            if self.verbose:
                print("🔧 Loading Transformers model...")
            self._load_model()
            if self.verbose:
                print("✓ Transformers model loaded successfully")
-
+
        if not isinstance(self._model, dict):
            raise ValueError("Model not loaded properly for transformers backend")
-
+
        tokenizer = self._model["tokenizer"]
        model = self._model["model"]
        device = self._model["device"]
-
-        normalize_embeddings = cast(
-
+
+        normalize_embeddings = cast(
+            bool, self.config["transformers_normalize_embeddings"]
+        )
+
        # For now, use a default max_length
        max_length = 8192
-
+
        # Tokenize
        batch_dict = tokenizer(
            texts,
@@ -386,35 +434,43 @@
            max_length=max_length,
            return_tensors="pt",
        )
-
+
        # Move to device
        batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
-
+
        # Run model
        import torch  # type: ignore[import-not-found]
+
        with torch.no_grad():
            outputs = model(**batch_dict)
-
+
        # Apply last token pooling
-        embeddings = self._last_token_pool(
-
+        embeddings = self._last_token_pool(
+            outputs.last_hidden_state, batch_dict["attention_mask"]
+        )
+
        # Normalize if needed
        if normalize_embeddings:
            import torch.nn.functional as F  # type: ignore[import-not-found]
+
            embeddings = F.normalize(embeddings, p=2, dim=1)
-
+
        return embeddings.cpu().numpy().tolist()

    def _last_token_pool(self, last_hidden_states, attention_mask):
        """Apply last token pooling to get embeddings."""
        import torch  # type: ignore[import-not-found]
-
+
+        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
        if left_padding:
            return last_hidden_states[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden_states.shape[0]
-            return last_hidden_states[
+            return last_hidden_states[
+                torch.arange(batch_size, device=last_hidden_states.device),
+                sequence_lengths,
+            ]

    def _hash_text(self, text: str) -> str:
        return hashlib.sha1(text.encode("utf-8")).hexdigest()
@@ -423,33 +479,36 @@
        """Execute SQLite query with retry logic for multi-process safety."""
        max_retries = 3
        base_delay = 0.05  # 50ms base delay for reads (faster than writes)
-
+
        last_exception = None
-
+
        for attempt in range(max_retries + 1):
            try:
                if params is None:
                    return self.conn.execute(query)
                else:
                    return self.conn.execute(query, params)
-
+
            except sqlite3.OperationalError as e:
                last_exception = e
                if "database is locked" in str(e).lower() and attempt < max_retries:
                    # Exponential backoff: 0.05s, 0.1s, 0.2s
-                    delay = base_delay * (2
+                    delay = base_delay * (2**attempt)
                    if self.verbose:
-                        print(
+                        print(
+                            f"⚠️ Database locked on read, retrying in {delay:.2f}s (attempt {attempt + 1}/{max_retries + 1})"
+                        )
                    import time
+
                    time.sleep(delay)
                    continue
                else:
                    # Re-raise if not a lock error or max retries exceeded
                    raise
-            except Exception
+            except Exception:
                # Re-raise any other exceptions
                raise
-
+
        # This should never be reached, but satisfy the type checker
        raise last_exception or RuntimeError("Failed to execute query after retries")
@@ -465,7 +524,9 @@
        computing missing embeddings.
        """
        assert isinstance(texts, list), "texts must be a list"
-        assert all(isinstance(t, str) for t in texts),
+        assert all(isinstance(t, str) for t in texts), (
+            "all elements in texts must be strings"
+        )
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        t = time()
@@ -502,7 +563,9 @@

        if missing_items:
            if self.verbose:
-                print(
+                print(
+                    f"Computing {len(missing_items)}/{len(texts)} missing embeddings..."
+                )
            self._process_missing_items_with_batches(missing_items, hit_map)

        # Return embeddings in the original order
@@ -511,92 +574,81 @@
            print(f"Retrieved {len(texts)} embeddings in {elapsed:.2f} seconds")
        return np.vstack([hit_map[h] for h in hashes])

-    def _process_missing_items_with_batches(
+    def _process_missing_items_with_batches(
+        self, missing_items: list[tuple[str, str]], hit_map: dict[str, np.ndarray]
+    ) -> None:
        """
-        Process missing items in batches with progress
+        Process missing items in batches with simple progress tracking.
        """
        t = time()  # Track total processing time
-
-        # Try to import tqdm, fall back to simple progress if not available
-        tqdm = None  # avoid "possibly unbound" in type checker
-        use_tqdm = False
-        try:
-            from tqdm import tqdm as _tqdm  # type: ignore[import-not-found]
-            tqdm = _tqdm
-            use_tqdm = True
-        except ImportError:
-            use_tqdm = False
-            if self.verbose:
-                print("tqdm not available, using simple progress reporting")
-
+
        batch_size = self.config["embedding_batch_size"]
        total_items = len(missing_items)
-
+
        if self.verbose:
-            print(
+            print(
+                f"Computing embeddings for {total_items} missing texts in batches of {batch_size}..."
+            )
            if self.backend in ["vllm", "transformers"] and self._model is None:
-                print(
+                print("⚠️ Model will be loaded on first batch (lazy loading enabled)")
            elif self.backend in ["vllm", "transformers"]:
-                print(
-
-        # Create progress bar
-        pbar = None
-        processed_count = 0
-        if use_tqdm and tqdm is not None:
-            pbar = tqdm(total=total_items, desc="Computing embeddings", unit="texts")
-
+                print("✓ Model already loaded, ready for efficient batch processing")
+
        # Track total committed items
        total_committed = 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    pbar.update(batch_size_actual)
-                else:
-                    processed_count += batch_size_actual
-                    if self.verbose:
-                        print(f"Progress: {processed_count}/{total_items} embeddings computed, {total_committed} committed")
-
-        finally:
-            # Clean up progress bar
-            if pbar is not None:
-                pbar.close()
-
+        processed_count = 0
+
+        # Process in batches
+        for i in range(0, total_items, batch_size):
+            batch_items = missing_items[i : i + batch_size]
+            batch_texts = [text for text, _ in batch_items]
+
+            # Get embeddings for this batch
+            batch_embeds = self._get_embeddings(batch_texts)
+
+            # Prepare batch data for immediate insert
+            batch_data: list[tuple[str, str, bytes]] = []
+            for (text, h), vec in zip(batch_items, batch_embeds):
+                arr = np.asarray(vec, dtype=np.float32)
+                batch_data.append((h, text, arr.tobytes()))
+                hit_map[h] = arr
+
+            # Immediate commit after each batch
+            self._bulk_insert(batch_data)
+            total_committed += len(batch_data)
+
+            # Update progress - simple single line
+            batch_size_actual = len(batch_items)
+            processed_count += batch_size_actual
            if self.verbose:
-
-                rate =
-
-                print(
+                elapsed = time() - t
+                rate = processed_count / elapsed if elapsed > 0 else 0
+                progress_pct = (processed_count / total_items) * 100
+                print(
+                    f"\rProgress: {processed_count}/{total_items} ({progress_pct:.1f}%) | {rate:.0f} texts/sec",
+                    end="",
+                    flush=True,
+                )
+
+        if self.verbose:
+            total_time = time() - t
+            rate = total_items / total_time if total_time > 0 else 0
+            print(
+                f"\n✅ Completed: {total_items} embeddings computed and {total_committed} items committed to database"
+            )
+            print(f"   Total time: {total_time:.2f}s | Rate: {rate:.1f} embeddings/sec")

    def __call__(self, texts: list[str], cache: bool = True) -> np.ndarray:
        assert isinstance(texts, list), "texts must be a list"
-        assert all(isinstance(t, str) for t in texts),
+        assert all(isinstance(t, str) for t in texts), (
+            "all elements in texts must be strings"
+        )
        return self.embeds(texts, cache)

    def _bulk_insert(self, data: list[tuple[str, str, bytes]]) -> None:
        """
        Perform bulk insert of embedding data with retry logic for multi-process safety.
-
+
        Uses INSERT OR IGNORE to prevent race conditions where multiple processes
        might try to insert the same text hash. The first process to successfully
        insert wins, subsequent attempts are ignored. This ensures deterministic
@@ -607,7 +659,7 @@

        max_retries = 3
        base_delay = 0.1  # 100ms base delay
-
+
        for attempt in range(max_retries + 1):
            try:
                cursor = self.conn.executemany(
@@ -615,82 +667,34 @@
                    data,
                )
                self.conn.commit()
-
+
                # Check if some insertions were ignored due to existing entries
-                if self.verbose and cursor.rowcount < len(data):
-
-
-
+                # if self.verbose and cursor.rowcount < len(data):
+                #     ignored_count = len(data) - cursor.rowcount
+                #     if ignored_count > 0:
+                #         print(f"ℹ️ {ignored_count}/{len(data)} embeddings already existed in cache (computed by another process)")
+
                return  # Success, exit the retry loop
-
+
            except sqlite3.OperationalError as e:
                if "database is locked" in str(e).lower() and attempt < max_retries:
                    # Exponential backoff: 0.1s, 0.2s, 0.4s
-                    delay = base_delay * (2
+                    delay = base_delay * (2**attempt)
                    if self.verbose:
-                        print(
+                        print(
+                            f"⚠️ Database locked, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries + 1})"
+                        )
                    import time
+
                    time.sleep(delay)
                    continue
                else:
                    # Re-raise if not a lock error or max retries exceeded
                    raise
-            except Exception
+            except Exception:
                # Re-raise any other exceptions
                raise

-    # def precompute_embeddings(self, texts: list[str]) -> None:
-    #     """
-    #     Precompute embeddings for a large list of texts efficiently.
-    #     This is optimized for bulk operations when you know all texts upfront.
-    #     """
-    #     assert isinstance(texts, list), "texts must be a list"
-    #     assert all(isinstance(t, str) for t in texts), "all elements in texts must be strings"
-    #     if not texts:
-    #         return
-
-    #     # Remove duplicates while preserving order
-    #     unique_texts = list(dict.fromkeys(texts))
-    #     if self.verbose:
-    #         print(f"Precomputing embeddings for {len(unique_texts)} unique texts...")
-
-    #     # Check which ones are already cached
-    #     hashes = [self._hash_text(t) for t in unique_texts]
-    #     existing_hashes = set()
-
-    #     # Bulk check for existing embeddings
-    #     chunk_size = self.config["sqlite_chunk_size"]
-    #     for i in range(0, len(hashes), chunk_size):
-    #         chunk = hashes[i : i + chunk_size]
-    #         placeholders = ",".join("?" * len(chunk))
-    #         rows = self._execute_with_retry(
-    #             f"SELECT hash FROM cache WHERE hash IN ({placeholders})",
-    #             chunk,
-    #         ).fetchall()
-    #         existing_hashes.update(h[0] for h in rows)
-
-    #     # Find missing texts
-    #     missing_items = [
-    #         (t, h) for t, h in zip(unique_texts, hashes) if h not in existing_hashes
-    #     ]
-
-    #     if not missing_items:
-    #         if self.verbose:
-    #             print("All texts already cached!")
-    #         return
-
-    #     if self.verbose:
-    #         print(f"Computing {len(missing_items)} missing embeddings...")
-
-    #     # Process missing items with batches
-    #     missing_texts = [t for t, _ in missing_items]
-    #     missing_items_tupled = [(t, h) for t, h in zip(missing_texts, [self._hash_text(t) for t in missing_texts])]
-    #     hit_map_temp: dict[str, np.ndarray] = {}
-    #     self._process_missing_items_with_batches(missing_items_tupled, hit_map_temp)
-    #     if self.verbose:
-    #         print(f"Successfully cached {len(missing_items)} new embeddings!")
-
    def get_cache_stats(self) -> dict[str, int]:
        """Get statistics about the cache."""
        cursor = self._execute_with_retry("SELECT COUNT(*) FROM cache")
@@ -701,24 +705,27 @@
        """Clear all cached embeddings."""
        max_retries = 3
        base_delay = 0.1  # 100ms base delay
-
+
        for attempt in range(max_retries + 1):
            try:
                self.conn.execute("DELETE FROM cache")
                self.conn.commit()
                return  # Success
-
+
            except sqlite3.OperationalError as e:
                if "database is locked" in str(e).lower() and attempt < max_retries:
-                    delay = base_delay * (2
+                    delay = base_delay * (2**attempt)
                    if self.verbose:
-                        print(
+                        print(
+                            f"⚠️ Database locked during clear, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries + 1})"
+                        )
                    import time
+
                    time.sleep(delay)
                    continue
                else:
                    raise
-            except Exception
+            except Exception:
                raise

    def get_config(self) -> Dict[str, Any]:
@@ -730,7 +737,7 @@
            "db_path": str(self.db_path),
            "verbose": self.verbose,
            "lazy": self.lazy,
-            **self.config
+            **self.config,
        }

    def update_config(self, **kwargs) -> None:
@@ -744,17 +751,26 @@
                self.lazy = value
            else:
                raise ValueError(f"Unknown configuration parameter: {key}")
-
+
        # Reset model if backend-specific parameters changed
        backend_params = {
-            "vllm": [
-
-
-
+            "vllm": [
+                "vllm_gpu_memory_utilization",
+                "vllm_tensor_parallel_size",
+                "vllm_dtype",
+                "vllm_trust_remote_code",
+                "vllm_max_model_len",
+            ],
+            "transformers": [
+                "transformers_device",
+                "transformers_batch_size",
+                "transformers_normalize_embeddings",
+                "transformers_trust_remote_code",
+            ],
            "openai": ["api_key", "model_name"],
-            "processing": ["embedding_batch_size"]
+            "processing": ["embedding_batch_size"],
        }
-
+
        if any(param in kwargs for param in backend_params.get(self.backend, [])):
            self._model = None  # Force reload on next use
        if self.backend == "openai":