embedkit 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {embedkit-0.1.4 → embedkit-0.1.6}/PKG-INFO +18 -12
- {embedkit-0.1.4 → embedkit-0.1.6}/README.md +16 -10
- {embedkit-0.1.4 → embedkit-0.1.6}/main.py +36 -28
- {embedkit-0.1.4 → embedkit-0.1.6}/pyproject.toml +2 -2
- {embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/__init__.py +10 -12
- embedkit-0.1.6/src/embedkit/base.py +122 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/classes.py +1 -6
- embedkit-0.1.6/src/embedkit/models.py +18 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/providers/cohere.py +31 -44
- embedkit-0.1.6/src/embedkit/providers/colpali.py +162 -0
- embedkit-0.1.6/src/embedkit/utils.py +142 -0
- embedkit-0.1.6/tests/fixtures/2407.01449v6_p1_p5.pdf +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/tests/test_embedkit.py +2 -2
- embedkit-0.1.6/tests/test_utils.py +52 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/uv.lock +2 -2
- embedkit-0.1.4/src/embedkit/base.py +0 -53
- embedkit-0.1.4/src/embedkit/models.py +0 -12
- embedkit-0.1.4/src/embedkit/providers/colpali.py +0 -160
- embedkit-0.1.4/src/embedkit/utils.py +0 -48
- {embedkit-0.1.4 → embedkit-0.1.6}/.gitignore +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/.python-version +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/LICENSE +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/config.py +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/providers/__init__.py +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/tests/conftest.py +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/tests/fixtures/2407.01449v6_p1.pdf +0 -0
- {embedkit-0.1.4 → embedkit-0.1.6}/tests/fixtures/2407.01449v6_p1.png +0 -0
{embedkit-0.1.4 → embedkit-0.1.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: embedkit
-Version: 0.1.4
+Version: 0.1.6
 Summary: A simple toolkit for generating vector embeddings across multiple providers and models
 Author-email: JP Hwang <me@jphwang.com>
 License: MIT
@@ -22,7 +22,7 @@ Requires-Dist: colpali-engine<0.4.0,>=0.3.0
 Requires-Dist: pdf2image>=1.17.0
 Requires-Dist: pillow>=11.2.1
 Requires-Dist: torch<=2.5
-Requires-Dist: transformers
+Requires-Dist: transformers>=4.46.2
 Description-Content-Type: text/markdown
 
 # EmbedKit
@@ -45,7 +45,7 @@ from embedkit.classes import Model, CohereInputType
 
 # Initialize with ColPali
 kit = EmbedKit.colpali(
-    model=Model.ColPali.V1_3,
+    model=Model.ColPali.COLPALI_V1_3,  # or COLSMOL_256M, COLSMOL_500M
     text_batch_size=16,  # Optional: process text in batches of 16
     image_batch_size=8,  # Optional: process images in batches of 8
 )
@@ -54,7 +54,7 @@ kit = EmbedKit.colpali(
 result = kit.embed_text("Hello world")
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
+print(result.objects[0].embedding.shape)  # Returns 2D array for ColPali
 print(result.objects[0].source_b64)
 
 # Initialize with Cohere
@@ -70,7 +70,7 @@ kit = EmbedKit.cohere(
 result = kit.embed_text("Hello world")
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
+print(result.objects[0].embedding.shape)  # Returns 1D array for Cohere
 print(result.objects[0].source_b64)
 ```
 
@@ -85,8 +85,8 @@ result = kit.embed_image(image_path)
 
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
-print(result.objects[0].source_b64)
+print(result.objects[0].embedding.shape)  # 2D for ColPali, 1D for Cohere
+print(result.objects[0].source_b64)  # Base64 encoded image
 ```
 
 ### PDF Embeddings
@@ -100,8 +100,8 @@ result = kit.embed_pdf(pdf_path)
 
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
-print(result.objects[0].source_b64)
+print(result.objects[0].embedding.shape)  # 2D for ColPali, 1D for Cohere
+print(result.objects[0].source_b64)  # Base64 encoded PDF page
 ```
 
 ## Response Format
@@ -116,17 +116,23 @@ class EmbeddingResponse:
     objects: List[EmbeddingObject]
 
 class EmbeddingObject:
-    embedding: np.ndarray
-    source_b64: Optional[str]
+    embedding: np.ndarray  # 1D array for Cohere, 2D array for ColPali
+    source_b64: Optional[str]  # Base64 encoded source for images and PDFs
 ```
 
 ## Supported Models
 
 ### ColPali
-- `Model.ColPali.V1_3`
+- `Model.ColPali.COLPALI_V1_3`
+- `Model.ColPali.COLSMOL_256M`
+- `Model.ColPali.COLSMOL_500M`
 
 ### Cohere
 - `Model.Cohere.EMBED_V4_0`
+- `Model.Cohere.EMBED_ENGLISH_V3_0`
+- `Model.Cohere.EMBED_ENGLISH_LIGHT_V3_0`
+- `Model.Cohere.EMBED_MULTILINGUAL_V3_0`
+- `Model.Cohere.EMBED_MULTILINGUAL_LIGHT_V3_0`
 
 ## Requirements
{embedkit-0.1.4 → embedkit-0.1.6}/README.md

@@ -18,7 +18,7 @@ from embedkit.classes import Model, CohereInputType
 
 # Initialize with ColPali
 kit = EmbedKit.colpali(
-    model=Model.ColPali.V1_3,
+    model=Model.ColPali.COLPALI_V1_3,  # or COLSMOL_256M, COLSMOL_500M
     text_batch_size=16,  # Optional: process text in batches of 16
     image_batch_size=8,  # Optional: process images in batches of 8
 )
@@ -27,7 +27,7 @@ kit = EmbedKit.colpali(
 result = kit.embed_text("Hello world")
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
+print(result.objects[0].embedding.shape)  # Returns 2D array for ColPali
 print(result.objects[0].source_b64)
 
 # Initialize with Cohere
@@ -43,7 +43,7 @@ kit = EmbedKit.cohere(
 result = kit.embed_text("Hello world")
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
+print(result.objects[0].embedding.shape)  # Returns 1D array for Cohere
 print(result.objects[0].source_b64)
 ```
 
@@ -58,8 +58,8 @@ result = kit.embed_image(image_path)
 
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
-print(result.objects[0].source_b64)
+print(result.objects[0].embedding.shape)  # 2D for ColPali, 1D for Cohere
+print(result.objects[0].source_b64)  # Base64 encoded image
 ```
 
 ### PDF Embeddings
@@ -73,8 +73,8 @@ result = kit.embed_pdf(pdf_path)
 
 print(result.model_provider)
 print(result.input_type)
-print(result.objects[0].embedding.shape)
-print(result.objects[0].source_b64)
+print(result.objects[0].embedding.shape)  # 2D for ColPali, 1D for Cohere
+print(result.objects[0].source_b64)  # Base64 encoded PDF page
 ```
 
 ## Response Format
@@ -89,17 +89,23 @@ class EmbeddingResponse:
     objects: List[EmbeddingObject]
 
 class EmbeddingObject:
-    embedding: np.ndarray
-    source_b64: Optional[str]
+    embedding: np.ndarray  # 1D array for Cohere, 2D array for ColPali
+    source_b64: Optional[str]  # Base64 encoded source for images and PDFs
 ```
 
 ## Supported Models
 
 ### ColPali
-- `Model.ColPali.V1_3`
+- `Model.ColPali.COLPALI_V1_3`
+- `Model.ColPali.COLSMOL_256M`
+- `Model.ColPali.COLSMOL_500M`
 
 ### Cohere
 - `Model.Cohere.EMBED_V4_0`
+- `Model.Cohere.EMBED_ENGLISH_V3_0`
+- `Model.Cohere.EMBED_ENGLISH_LIGHT_V3_0`
+- `Model.Cohere.EMBED_MULTILINGUAL_V3_0`
+- `Model.Cohere.EMBED_MULTILINGUAL_LIGHT_V3_0`
 
 ## Requirements
{embedkit-0.1.4 → embedkit-0.1.6}/main.py

@@ -32,30 +32,7 @@ def get_sample_image() -> Path:
 sample_image = get_sample_image()
 
 sample_pdf = Path("tests/fixtures/2407.01449v6_p1.pdf")
-
-
-kit = EmbedKit.colpali(model=Model.ColPali.V1_3, text_batch_size=16, image_batch_size=8)
-
-results = kit.embed_text("Hello world")
-assert len(results.objects) == 1
-assert len(results.objects[0].embedding.shape) == 2
-assert results.objects[0].source_b64 == None
-
-results = kit.embed_image(sample_image)
-assert len(results.objects) == 1
-assert len(results.objects[0].embedding.shape) == 2
-assert type(results.objects[0].source_b64) == str
-
-results = kit.embed_pdf(sample_pdf)
-assert len(results.objects) == 1
-assert len(results.objects[0].embedding.shape) == 2
-assert type(results.objects[0].source_b64) == str
-
-# results = kit.embed_pdf(long_pdf)
-# assert len(results.objects) == 26
-# assert len(results.objects[0].embedding.shape) == 2
-# assert type(results.objects[0].source_b64) == str
-
+longer_pdf = Path("tests/fixtures/2407.01449v6_p1_p5.pdf")
 
 kit = EmbedKit.cohere(
     model=Model.Cohere.EMBED_V4_0,
@@ -65,6 +42,7 @@ kit = EmbedKit.cohere(
     text_input_type=CohereInputType.SEARCH_QUERY,
 )
 
+print(f"Trying out Cohere")
 results = kit.embed_text("Hello world")
 assert len(results.objects) == 1
 assert len(results.objects[0].embedding.shape) == 1
@@ -93,7 +71,37 @@ assert len(results.objects) == 1
 assert len(results.objects[0].embedding.shape) == 1
 assert type(results.objects[0].source_b64) == str
 
-
-
-
-
+results = kit.embed_pdf(longer_pdf)
+assert len(results.objects) == 5
+assert len(results.objects[0].embedding.shape) == 1
+assert type(results.objects[0].source_b64) == str
+
+for colpali_model in [
+    Model.ColPali.COLSMOL_256M,
+    Model.ColPali.COLSMOL_500M,
+    Model.ColPali.COLPALI_V1_3,
+]:
+    print(f"Trying out {colpali_model}")
+    kit = EmbedKit.colpali(
+        model=colpali_model, text_batch_size=16, image_batch_size=8
+    )
+
+    results = kit.embed_text("Hello world")
+    assert len(results.objects) == 1
+    assert len(results.objects[0].embedding.shape) == 2
+    assert results.objects[0].source_b64 == None
+
+    results = kit.embed_image(sample_image)
+    assert len(results.objects) == 1
+    assert len(results.objects[0].embedding.shape) == 2
+    assert type(results.objects[0].source_b64) == str
+
+    results = kit.embed_pdf(sample_pdf)
+    assert len(results.objects) == 1
+    assert len(results.objects[0].embedding.shape) == 2
+    assert type(results.objects[0].source_b64) == str
+
+    results = kit.embed_pdf(longer_pdf)
+    assert len(results.objects) == 5
+    assert len(results.objects[0].embedding.shape) == 2
+    assert type(results.objects[0].source_b64) == str
{embedkit-0.1.4 → embedkit-0.1.6}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "embedkit"
-version = "0.1.4"
+version = "0.1.6"
 description = "A simple toolkit for generating vector embeddings across multiple providers and models"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -11,7 +11,7 @@ dependencies = [
     "pdf2image>=1.17.0",
     "pillow>=11.2.1",
     "torch<=2.5",
-    "transformers",
+    "transformers>=4.46.2",
 ]
 authors = [
     {name = "JP Hwang", email = "me@jphwang.com"},
{embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/__init__.py

@@ -5,7 +5,6 @@ EmbedKit: A unified toolkit for generating vector embeddings.
 
 from typing import Union, List, Optional
 from pathlib import Path
-import numpy as np
 
 from .models import Model
 from .base import EmbeddingError, EmbeddingResponse
@@ -28,7 +27,7 @@ class EmbedKit:
     @classmethod
     def colpali(
         cls,
-        model: Model = Model.ColPali.V1_3,
+        model: Model = Model.ColPali.COLPALI_V1_3,
         device: Optional[str] = None,
         text_batch_size: int = 32,
         image_batch_size: int = 8,
@@ -42,13 +41,13 @@ class EmbedKit:
             text_batch_size: Batch size for text embedding generation
             image_batch_size: Batch size for image embedding generation
         """
-        if model
-
-
-
+        if not isinstance(model, Model.ColPali):
+            raise ValueError(
+                f"Unsupported model: {model}. Must be a Model.ColPali enum value."
+            )
 
         provider = ColPaliProvider(
-
+            model=model,
             device=device,
             text_batch_size=text_batch_size,
             image_batch_size=image_batch_size,
@@ -77,16 +76,15 @@ class EmbedKit:
         if not api_key:
             raise ValueError("API key is required")
 
-        if model == Model.Cohere.EMBED_V4_0:
-            model_name = "embed-v4.0"
-        else:
+        if not isinstance(model, Model.Cohere):
             raise ValueError(f"Unsupported model: {model}")
 
         provider = CohereProvider(
-            api_key=api_key,
+            api_key=api_key,
+            model=model,
             text_batch_size=text_batch_size,
             image_batch_size=image_batch_size,
-            text_input_type=text_input_type
+            text_input_type=text_input_type,
         )
         return cls(provider)
 
embedkit-0.1.6/src/embedkit/base.py (new)

@@ -0,0 +1,122 @@
+# ./src/embedkit/base.py
+"""Base classes for EmbedKit."""
+
+from abc import ABC, abstractmethod
+from typing import Union, List, Optional
+from pathlib import Path
+import numpy as np
+from dataclasses import dataclass
+
+from .models import Model
+from .utils import with_pdf_cleanup
+
+
+@dataclass
+class EmbeddingObject:
+    embedding: np.ndarray
+    source_b64: str = None
+    source_content_type: str = None  # e.g., "image/png", "image/jpeg"
+
+
+@dataclass
+class EmbeddingResponse:
+    model_name: str
+    model_provider: str
+    input_type: str
+    objects: List[EmbeddingObject]
+
+    @property
+    def shape(self) -> tuple:
+        return self.objects[0].embedding.shape
+
+
+class EmbeddingProvider(ABC):
+    """Abstract base class for embedding providers."""
+
+    def __init__(
+        self,
+        model_name: str,
+        text_batch_size: int,
+        image_batch_size: int,
+        provider_name: str,
+    ):
+        self.model_name = model_name
+        self.provider_name = provider_name
+        self.text_batch_size = text_batch_size
+        self.image_batch_size = image_batch_size
+
+    def _normalize_text_input(self, texts: Union[str, List[str]]) -> List[str]:
+        """Normalize text input to a list of strings."""
+        if isinstance(texts, str):
+            return [texts]
+        return texts
+
+    def _normalize_image_input(
+        self, images: Union[Path, str, List[Union[Path, str]]]
+    ) -> List[Path]:
+        """Normalize image input to a list of Path objects."""
+        if isinstance(images, (str, Path)):
+            return [Path(images)]
+        return [Path(img) for img in images]
+
+    def _create_text_response(
+        self, embeddings: List[np.ndarray], input_type: str = "text"
+    ) -> EmbeddingResponse:
+        """Create a standardized text embedding response."""
+        return EmbeddingResponse(
+            model_name=self.model_name,
+            model_provider=self.provider_name,
+            input_type=input_type,
+            objects=[EmbeddingObject(embedding=e) for e in embeddings],
+        )
+
+    def _create_image_response(
+        self,
+        embeddings: List[np.ndarray],
+        b64_data: List[str],
+        content_types: List[str],
+        input_type: str = "image",
+    ) -> EmbeddingResponse:
+        """Create a standardized image embedding response."""
+        return EmbeddingResponse(
+            model_name=self.model_name,
+            model_provider=self.provider_name,
+            input_type=input_type,
+            objects=[
+                EmbeddingObject(
+                    embedding=embedding,
+                    source_b64=b64_data,
+                    source_content_type=content_type,
+                )
+                for embedding, b64_data, content_type in zip(
+                    embeddings, b64_data, content_types
+                )
+            ],
+        )
+
+    @abstractmethod
+    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
+        """Generate document text embeddings using the configured provider."""
+        pass
+
+    @abstractmethod
+    def embed_image(
+        self, images: Union[Path, str, List[Union[Path, str]]]
+    ) -> EmbeddingResponse:
+        """Generate image embeddings using the configured provider."""
+        pass
+
+    def embed_pdf(self, pdf_path: Path) -> EmbeddingResponse:
+        """Generate embeddings for a PDF file."""
+        return self._embed_pdf_impl(pdf_path)
+
+    @with_pdf_cleanup
+    def _embed_pdf_impl(self, pdf_path: List[Path]) -> EmbeddingResponse:
+        """Internal implementation of PDF embedding with cleanup handled by decorator."""
+        return self.embed_image(pdf_path)
+
+
+class EmbeddingError(Exception):
+    """Base exception for embedding-related errors."""
+
+    pass
{embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/classes.py

@@ -13,9 +13,4 @@ from . import EmbeddingResponse, EmbeddingError
 from .models import Model
 from .providers.cohere import CohereInputType
 
-__all__ = [
-    "EmbeddingResponse",
-    "EmbeddingError",
-    "Model",
-    "CohereInputType"
-]
+__all__ = ["EmbeddingResponse", "EmbeddingError", "Model", "CohereInputType"]
embedkit-0.1.6/src/embedkit/models.py (new)

@@ -0,0 +1,18 @@
+# ./src/embedkit/models.py
+"""Model definitions and enum for EmbedKit."""
+
+from enum import Enum
+
+
+class Model:
+    class ColPali(Enum):
+        COLPALI_V1_3 = "vidore/colpali-v1.3"
+        COLSMOL_500M = "vidore/colSmol-500M"
+        COLSMOL_256M = "vidore/colSmol-256M"
+
+    class Cohere(Enum):
+        EMBED_V4_0 = "embed-v4.0"
+        EMBED_ENGLISH_V3_0 = "embed-english-v3.0"
+        EMBED_ENGLISH_LIGHT_V3_0 = "embed-english-light-v3.0"
+        EMBED_MULTILINGUAL_V3_0 = "embed-multilingual-v3.0"
+        EMBED_MULTILINGUAL_LIGHT_V3_0 = "embed-multilingual-light-v3.0"
{embedkit-0.1.4 → embedkit-0.1.6}/src/embedkit/providers/cohere.py

@@ -5,9 +5,13 @@ from typing import Union, List
 from pathlib import Path
 import numpy as np
 from enum import Enum
+import logging
 
-from ..utils import pdf_to_images, image_to_base64
-from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResponse, EmbeddingObject
+from ..models import Model
+from ..utils import image_to_base64
+from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResponse
+
+logger = logging.getLogger(__name__)
 
 
 class CohereInputType(Enum):
@@ -23,18 +27,20 @@ class CohereProvider(EmbeddingProvider):
     def __init__(
         self,
         api_key: str,
-        model_name: str,
+        model: Model.Cohere,
         text_batch_size: int,
        image_batch_size: int,
         text_input_type: CohereInputType = CohereInputType.SEARCH_DOCUMENT,
     ):
+        super().__init__(
+            model_name=model.value,
+            text_batch_size=text_batch_size,
+            image_batch_size=image_batch_size,
+            provider_name="Cohere",
+        )
         self.api_key = api_key
-        self.model_name = model_name
-        self.text_batch_size = text_batch_size
-        self.image_batch_size = image_batch_size
         self.input_type = text_input_type
         self._client = None
-        self.provider_name = "Cohere"
 
     def _get_client(self):
         """Lazy load the Cohere client."""
@@ -54,9 +60,7 @@ class CohereProvider(EmbeddingProvider):
     def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
         """Generate text embeddings using the Cohere API."""
         client = self._get_client()
-
-        if isinstance(texts, str):
-            texts = [texts]
+        texts = self._normalize_text_input(texts)
 
         try:
             all_embeddings = []
@@ -72,16 +76,7 @@ class CohereProvider(EmbeddingProvider):
                 )
                 all_embeddings.extend(np.array(response.embeddings.float_))
 
-            return EmbeddingResponse(
-                model_name=self.model_name,
-                model_provider=self.provider_name,
-                input_type=self.input_type.value,
-                objects=[
-                    EmbeddingObject(
-                        embedding=e,
-                    ) for e in all_embeddings
-                ]
-            )
+            return self._create_text_response(all_embeddings, self.input_type.value)
 
         except Exception as e:
             raise EmbeddingError(f"Failed to embed text with Cohere: {e}") from e
@@ -92,12 +87,9 @@ class CohereProvider(EmbeddingProvider):
     ) -> EmbeddingResponse:
         """Generate embeddings for images using Cohere API."""
         client = self._get_client()
-
-
-        if isinstance(images, (str, Path)):
-            images = [Path(images)]
-        else:
-            images = [Path(img) for img in images]
+        images = self._normalize_image_input(images)
+        total_images = len(images)
+        logger.info(f"Starting to process {total_images} images")
 
         try:
             all_embeddings = []
@@ -106,12 +98,17 @@ class CohereProvider(EmbeddingProvider):
             # Process images in batches
             for i in range(0, len(images), self.image_batch_size):
                 batch_images = images[i : i + self.image_batch_size]
+                logger.info(f"Processing batch {i//self.image_batch_size + 1} of {(total_images + self.image_batch_size - 1)//self.image_batch_size} ({len(batch_images)} images)")
                 b64_images = []
 
                 for image in batch_images:
                     if not image.exists():
                         raise EmbeddingError(f"Image not found: {image}")
-                    b64_images.append(image_to_base64(image))
+                    b64_data, content_type = image_to_base64(image)
+                    # Construct full data URI for API
+                    data_uri = f"data:{content_type};base64,{b64_data}"
+                    b64_images.append(data_uri)
+                    all_b64_images.append((b64_data, content_type))
 
                 response = client.embed(
                     model=self.model_name,
@@ -121,24 +118,14 @@ class CohereProvider(EmbeddingProvider):
                 )
 
                 all_embeddings.extend(np.array(response.embeddings.float_))
-
-
-            return EmbeddingResponse(
-                model_name=self.model_name,
-                model_provider=self.provider_name,
-                input_type="image",
-                objects=[
-                    EmbeddingObject(
-                        embedding=all_embeddings[i],
-                        source_b64=all_b64_images[i]
-                    ) for i in range(len(all_embeddings))
-                ]
+
+            logger.info(f"Successfully processed all {total_images} images")
+            return self._create_image_response(
+                all_embeddings,
+                [b64 for b64, _ in all_b64_images],
+                [content_type for _, content_type in all_b64_images],
             )
 
         except Exception as e:
+            logger.error(f"Failed to embed images: {e}")
             raise EmbeddingError(f"Failed to embed image with Cohere: {e}") from e
-
-    def embed_pdf(self, pdf_path: Path) -> EmbeddingResponse:
-        """Generate embeddings for a PDF file using Cohere API."""
-        image_paths = pdf_to_images(pdf_path)
-        return self.embed_image(image_paths)
embedkit-0.1.6/src/embedkit/providers/colpali.py (new)

@@ -0,0 +1,162 @@
+# ./src/embedkit/providers/colpali.py
+"""ColPali embedding provider."""
+
+from typing import Union, List, Optional
+from pathlib import Path
+import logging
+import numpy as np
+import torch
+from PIL import Image
+
+from ..models import Model
+from ..utils import image_to_base64
+from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResponse
+
+logger = logging.getLogger(__name__)
+
+
+class ColPaliProvider(EmbeddingProvider):
+    """ColPali embedding provider for document understanding."""
+
+    def __init__(
+        self,
+        model: Model.ColPali,
+        text_batch_size: int,
+        image_batch_size: int,
+        device: Optional[str] = None,
+    ):
+        super().__init__(
+            model_name=model.value,
+            text_batch_size=text_batch_size,
+            image_batch_size=image_batch_size,
+            provider_name="ColPali",
+        )
+
+        # Auto-detect device
+        if device is None:
+            if torch.cuda.is_available():
+                device = "cuda"
+            elif torch.backends.mps.is_available():
+                device = "mps"
+            else:
+                device = "cpu"
+
+        self._hf_device = device
+        self._hf_model = None
+        self._hf_processor = None
+
+    def _load_model(self):
+        """Lazy load the model."""
+        if self._hf_model is None:
+            try:
+                if self.model_name in [Model.ColPali.COLPALI_V1_3.value]:
+                    from colpali_engine.models import ColPali, ColPaliProcessor
+
+                    self._hf_model = ColPali.from_pretrained(
+                        self.model_name,
+                        torch_dtype=torch.bfloat16,
+                        device_map=self._hf_device,
+                    ).eval()
+
+                    self._hf_processor = ColPaliProcessor.from_pretrained(self.model_name)
+
+                elif self.model_name in [
+                    Model.ColPali.COLSMOL_500M.value,
+                    Model.ColPali.COLSMOL_256M.value,
+                ]:
+                    from colpali_engine.models import ColIdefics3, ColIdefics3Processor
+
+                    self._hf_model = ColIdefics3.from_pretrained(
+                        self.model_name,
+                        torch_dtype=torch.bfloat16,
+                        device_map=self._hf_device,
+                    ).eval()
+                    self._hf_processor = ColIdefics3Processor.from_pretrained(
+                        self.model_name
+                    )
+                else:
+                    raise ValueError(f"Unable to load model for: {self.model_name}.")
+
+                logger.info(f"Loaded {self.model_name} on {self._hf_device}")
+
+            except ImportError as e:
+                raise EmbeddingError(
+                    "ColPali not installed. Run: pip install colpali-engine"
+                ) from e
+            except Exception as e:
+                raise EmbeddingError(f"Failed to load model: {e}") from e
+
+    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
+        """Generate embeddings for text inputs."""
+        self._load_model()
+        texts = self._normalize_text_input(texts)
+
+        try:
+            # Process texts in batches
+            all_embeddings: List[np.ndarray] = []
+
+            for i in range(0, len(texts), self.text_batch_size):
+                batch_texts = texts[i : i + self.text_batch_size]
+                processed = self._hf_processor.process_queries(batch_texts).to(self._hf_device)
+
+                with torch.no_grad():
+                    batch_embeddings = self._hf_model(**processed)
+                    all_embeddings.append(batch_embeddings.cpu().float().numpy())
+
+            # Concatenate all batch embeddings
+            final_embeddings = np.concatenate(all_embeddings, axis=0)
+            return self._create_text_response(final_embeddings)
+
+        except Exception as e:
+            raise EmbeddingError(f"Failed to embed text: {e}") from e
+
+    def embed_image(
+        self, images: Union[Path, str, List[Union[Path, str]]]
+    ) -> EmbeddingResponse:
+        """Generate embeddings for images."""
+        self._load_model()
+        images = self._normalize_image_input(images)
+        total_images = len(images)
+        logger.info(f"Starting to process {total_images} images")
+
+        try:
+            # Process images in batches
+            all_embeddings: List[np.ndarray] = []
+            all_b64_data: List[str] = []
+            all_content_types: List[str] = []
+
+            for i in range(0, len(images), self.image_batch_size):
+                batch_images = images[i : i + self.image_batch_size]
+                logger.info(f"Processing batch {i//self.image_batch_size + 1} of {(total_images + self.image_batch_size - 1)//self.image_batch_size} ({len(batch_images)} images)")
+                pil_images = []
+                batch_b64_data = []
+                batch_content_types = []
+
+                for img_path in batch_images:
+                    if not img_path.exists():
+                        raise EmbeddingError(f"Image not found: {img_path}")
+
+                    with Image.open(img_path) as img:
+                        pil_images.append(img.convert("RGB"))
+                    b64, content_type = image_to_base64(img_path)
+                    batch_b64_data.append(b64)
+                    batch_content_types.append(content_type)
+
+                processed = self._hf_processor.process_images(pil_images).to(self._hf_device)
+
+                with torch.no_grad():
+                    batch_embeddings = self._hf_model(**processed)
+                    all_embeddings.append(batch_embeddings.cpu().float().numpy())
+                all_b64_data.extend(batch_b64_data)
+                all_content_types.extend(batch_content_types)
+
+            # Concatenate all batch embeddings
+            final_embeddings = np.concatenate(all_embeddings, axis=0)
+            logger.info(f"Successfully processed all {total_images} images")
+            return self._create_image_response(
+                final_embeddings, all_b64_data, all_content_types
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to embed images: {e}")
+            raise EmbeddingError(f"Failed to embed images: {e}") from e
embedkit-0.1.6/src/embedkit/utils.py (new)

@@ -0,0 +1,142 @@
+import tempfile
+import shutil
+import logging
+from contextlib import contextmanager
+from pdf2image import convert_from_path
+from pathlib import Path
+from .config import get_temp_dir
+from typing import Union, List, Iterator, Callable, TypeVar, Any
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+
+
+@contextmanager
+def temporary_directory() -> Iterator[Path]:
+    """Create a temporary directory that is automatically cleaned up when done.
+
+    Yields:
+        Path: Path to the temporary directory
+    """
+    temp_dir = Path(tempfile.mkdtemp())
+    try:
+        yield temp_dir
+    finally:
+        shutil.rmtree(temp_dir)
+
+
+def pdf_to_images(pdf_path: Path) -> List[Path]:
+    """Convert a PDF file to a list of images.
+
+    The images are stored in a temporary directory that will be automatically
+    cleaned up when the process exits.
+
+    Args:
+        pdf_path: Path to the PDF file
+
+    Returns:
+        List[Path]: List of paths to the generated images
+
+    Note:
+        The temporary files will be automatically cleaned up when the process exits.
+        Do not rely on these files persisting after the function returns.
+    """
+    with temporary_directory() as temp_dir:
+        images = convert_from_path(pdf_path=str(pdf_path), output_folder=str(temp_dir))
+        image_paths = []
+
+        for i, image in enumerate(images):
+            output_path = temp_dir / f"{pdf_path.stem}_{i}.png"
+            image.save(output_path)
+            final_path = Path(tempfile.mktemp(suffix=".png"))
+            shutil.move(output_path, final_path)
+            image_paths.append(final_path)
+
+        return image_paths
+
+
+def image_to_base64(image_path: Union[str, Path]) -> tuple[str, str]:
+    """Convert an image to base64 and return the base64 data and content type.
+
+    Args:
+        image_path: Path to the image file
+
+    Returns:
+        tuple[str, str]: (base64_data, content_type)
+
+    Raises:
+        ValueError: If the image cannot be read or has an unsupported format
+    """
+    import base64
+
+    try:
+        base64_data = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
+    except Exception as e:
+        raise ValueError(f"Failed to read image {image_path}: {e}") from e
+
+    if isinstance(image_path, Path):
+        image_path_str = str(image_path)
+    else:
+        image_path_str = image_path
+
+    if image_path_str.lower().endswith(".png"):
+        content_type = "image/png"
+    elif image_path_str.lower().endswith((".jpg", ".jpeg")):
+        content_type = "image/jpeg"
+    elif image_path_str.lower().endswith(".gif"):
+        content_type = "image/gif"
+    else:
+        raise ValueError(
+            f"Unsupported image format for {image_path}; expected .png, .jpg, .jpeg, or .gif"
+        )
+
+    return base64_data, content_type
+
+
+def with_pdf_cleanup(embed_func: Callable[..., T]) -> Callable[..., T]:
+    """Decorator to handle PDF to image conversion with automatic cleanup.
+
+    This decorator handles the common pattern of:
+    1. Converting PDF to images
+    2. Passing images to an embedding function
+    3. Cleaning up temporary files
+
+    Args:
+        embed_func: Function that takes a list of image paths and returns embeddings
+
+    Returns:
+        Callable that takes a PDF path and returns embeddings
+    """
+
+    def wrapper(*args, **kwargs) -> T:
+        # First argument is self for instance methods
+        pdf_path = args[-1] if args else kwargs.get("pdf_path")
+        if not pdf_path:
+            raise ValueError(
+                "PDF path must be provided as the last positional argument or as 'pdf_path' keyword argument"
+            )
+
+        images = []  # Initialize images as empty list
+        try:
+            images = pdf_to_images(pdf_path)
+            # Call the original function with the images instead of pdf_path
+            if args:
+                # For instance methods, replace the last argument (pdf_path) with images
+                args = list(args)
+                args[-1] = images
+            else:
+                kwargs["pdf_path"] = images
+            return embed_func(*args, **kwargs)
+        finally:
+            # Clean up temporary files created by pdf_to_images
+            for img_path in images:
+                try:
+                    if img_path.exists() and str(img_path).startswith(
+                        tempfile.gettempdir()
+                    ):
+                        img_path.unlink()
+                except Exception as e:
+                    logger.warning(f"Failed to clean up temporary file {img_path}: {e}")
+
+    return wrapper
embedkit-0.1.6/tests/fixtures/2407.01449v6_p1_p5.pdf (new)

Binary file
{embedkit-0.1.4 → embedkit-0.1.6}/tests/test_embedkit.py

@@ -114,7 +114,7 @@ def test_cohere_missing_api_key():
 # ===============================
 def test_colpali_text_embedding():
     """Test text embedding with Colpali model."""
-    kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
+    kit = EmbedKit.colpali(model=Model.ColPali.COLPALI_V1_3)
     result = kit.embed_text("Hello world")
 
     assert len(result.objects) == 1
@@ -133,7 +133,7 @@ def test_colpali_text_embedding():
 )
 def test_colpali_file_embedding(request, embed_method, file_fixture):
     """Test file embedding with Colpali model."""
-    kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
+    kit = EmbedKit.colpali(model=Model.ColPali.COLPALI_V1_3)
     file_path = request.getfixturevalue(file_fixture)
     embed_func = getattr(kit, embed_method)
     result = embed_func(file_path)
embedkit-0.1.6/tests/test_utils.py (new)

@@ -0,0 +1,52 @@
+import pytest
+from pathlib import Path
+import tempfile
+from embedkit.utils import temporary_directory, pdf_to_images
+
+
+@pytest.fixture
+def sample_pdf_path():
+    """Fixture to provide a sample PDF for testing."""
+    path = Path("tests/fixtures/2407.01449v6_p1.pdf")
+    if not path.exists():
+        pytest.skip(f"Test fixture not found: {path}")
+    return path
+
+
+def test_temporary_directory_cleanup():
+    """Test that temporary directory is properly cleaned up after use."""
+    temp_dir = None
+    with temporary_directory() as temp_path:
+        temp_dir = temp_path
+        # Create a test file in the temp directory
+        test_file = temp_path / "test.txt"
+        test_file.write_text("test content")
+        assert test_file.exists()
+        assert temp_path.exists()
+
+    # After the context manager exits, both the file and directory should be gone
+    assert not temp_dir.exists()
+    assert not test_file.exists()
+
+
+def test_pdf_to_images_temporary_files(sample_pdf_path):
+    """Test that PDF to images conversion creates and cleans up temporary files properly."""
+    # Convert PDF to images
+    image_paths = pdf_to_images(sample_pdf_path)
+
+    # Check that we got image paths
+    assert len(image_paths) > 0
+
+    # Verify all images exist and are in temp directory
+    for img_path in image_paths:
+        assert img_path.exists()
+        assert str(img_path).startswith(tempfile.gettempdir())
+        assert img_path.suffix == ".png"
+
+        # Verify the image is readable
+        assert img_path.stat().st_size > 0
+
+    # Clean up the temporary files
+    for img_path in image_paths:
+        img_path.unlink()
+        assert not img_path.exists()
{embedkit-0.1.4 → embedkit-0.1.6}/uv.lock

@@ -349,7 +349,7 @@ wheels = [
 
 [[package]]
 name = "embedkit"
-version = "0.1.
+version = "0.1.5"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -377,7 +377,7 @@ requires-dist = [
     { name = "pdf2image", specifier = ">=1.17.0" },
     { name = "pillow", specifier = ">=11.2.1" },
     { name = "torch", specifier = "<=2.5" },
-    { name = "transformers" },
+    { name = "transformers", specifier = ">=4.46.2" },
 ]
 
 [package.metadata.requires-dev]
embedkit-0.1.4/src/embedkit/base.py (deleted)

@@ -1,53 +0,0 @@
-# ./src/embedkit/base.py
-"""Base classes for EmbedKit."""
-
-from abc import ABC, abstractmethod
-from typing import Union, List, Optional
-from pathlib import Path
-import numpy as np
-from dataclasses import dataclass
-
-
-@dataclass
-class EmbeddingObject:
-    embedding: np.ndarray
-    source_b64: str = None
-
-
-@dataclass
-class EmbeddingResponse:
-    model_name: str
-    model_provider: str
-    input_type: str
-    objects: List[EmbeddingObject]
-
-    @property
-    def shape(self) -> tuple:
-        return self.objects[0].embedding.shape
-
-
-class EmbeddingProvider(ABC):
-    """Abstract base class for embedding providers."""
-
-    @abstractmethod
-    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
-        """Generate document text embeddings using the configured provider."""
-        pass
-
-    @abstractmethod
-    def embed_image(
-        self, images: Union[Path, str, List[Union[Path, str]]]
-    ) -> EmbeddingResponse:
-        """Generate image embeddings using the configured provider."""
-        pass
-
-    @abstractmethod
-    def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResponse:
-        """Generate image embeddings from PDFs using the configured provider. Takes a single PDF file."""
-        pass
-
-
-class EmbeddingError(Exception):
-    """Base exception for embedding-related errors."""
-
-    pass
embedkit-0.1.4/src/embedkit/providers/colpali.py (deleted)

@@ -1,160 +0,0 @@
-# ./src/embedkit/providers/colpali.py
-"""ColPali embedding provider."""
-
-from typing import Union, List, Optional
-from pathlib import Path
-import logging
-import numpy as np
-import torch
-from PIL import Image
-
-from ..utils import pdf_to_images, image_to_base64
-from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResponse, EmbeddingObject
-
-logger = logging.getLogger(__name__)
-
-
-class ColPaliProvider(EmbeddingProvider):
-    """ColPali embedding provider for document understanding."""
-
-    def __init__(
-        self,
-        model_name: str,
-        text_batch_size: int,
-        image_batch_size: int,
-        device: Optional[str] = None,
-    ):
-        self.model_name = model_name
-        self.provider_name = "ColPali"
-        self.text_batch_size = text_batch_size
-        self.image_batch_size = image_batch_size
-
-        # Auto-detect device
-        if device is None:
-            if torch.cuda.is_available():
-                device = "cuda"
-            elif torch.backends.mps.is_available():
-                device = "mps"
-            else:
-                device = "cpu"
-
-        self.device = device
-        self._model = None
-        self._processor = None
-
-    def _load_model(self):
-        """Lazy load the model."""
-        if self._model is None:
-            try:
-                from colpali_engine.models import ColPali, ColPaliProcessor
-
-                self._model = ColPali.from_pretrained(
-                    self.model_name,
-                    torch_dtype=torch.bfloat16,
-                    device_map=self.device,
-                ).eval()
-
-                self._processor = ColPaliProcessor.from_pretrained(self.model_name)
-                logger.info(f"Loaded ColPali model on {self.device}")
-
-            except ImportError as e:
-                raise EmbeddingError(
-                    "ColPali not installed. Run: pip install colpali-engine"
-                ) from e
-            except Exception as e:
-                raise EmbeddingError(f"Failed to load model: {e}") from e
-
-    def embed_text(self, texts: Union[str, List[str]]) -> EmbeddingResponse:
-        """Generate embeddings for text inputs."""
-        self._load_model()
-
-        if isinstance(texts, str):
-            texts = [texts]
-
-        try:
-            # Process texts in batches
-            all_embeddings = []
-
-            for i in range(0, len(texts), self.text_batch_size):
-                batch_texts = texts[i : i + self.text_batch_size]
-                processed = self._processor.process_queries(batch_texts).to(self.device)
-
-                with torch.no_grad():
-                    batch_embeddings = self._model(**processed)
-                    all_embeddings.append(batch_embeddings.cpu().float().numpy())
-
-            # Concatenate all batch embeddings
-            final_embeddings = np.concatenate(all_embeddings, axis=0)
-
-            return EmbeddingResponse(
-                model_name=self.model_name,
-                model_provider=self.provider_name,
-                input_type="text",
-                objects=[
-                    EmbeddingObject(
-                        embedding=e,
-                    ) for e in final_embeddings
-                ]
-            )
-
-        except Exception as e:
-            raise EmbeddingError(f"Failed to embed text: {e}") from e
-
-    def embed_image(
-        self, images: Union[Path, str, List[Union[Path, str]]]
-    ) -> EmbeddingResponse:
-        """Generate embeddings for images."""
-        self._load_model()
-
-        if isinstance(images, (str, Path)):
-            images = [Path(images)]
-        else:
-            images = [Path(img) for img in images]
-
-        try:
-            # Process images in batches
-            all_embeddings = []
-            all_b64_images = []
-
-            for i in range(0, len(images), self.image_batch_size):
-                batch_images = images[i : i + self.image_batch_size]
-                pil_images = []
-                b64_images = []
-
-                for img_path in batch_images:
-                    if not img_path.exists():
-                        raise EmbeddingError(f"Image not found: {img_path}")
-
-                    with Image.open(img_path) as img:
-                        pil_images.append(img.convert("RGB"))
-                        b64_images.append(image_to_base64(img_path))
-
-                processed = self._processor.process_images(pil_images).to(self.device)
-
-                with torch.no_grad():
-                    batch_embeddings = self._model(**processed)
-                    all_embeddings.append(batch_embeddings.cpu().float().numpy())
-                    all_b64_images.extend(b64_images)
-
-            # Concatenate all batch embeddings
-            final_embeddings = np.concatenate(all_embeddings, axis=0)
-
-            return EmbeddingResponse(
-                model_name=self.model_name,
-                model_provider=self.provider_name,
-                input_type="image",
-                objects=[
-                    EmbeddingObject(
-                        embedding=final_embeddings[i],
-                        source_b64=all_b64_images[i]
-                    ) for i in range(len(final_embeddings))
-                ]
-            )
-
-        except Exception as e:
-            raise EmbeddingError(f"Failed to embed images: {e}") from e
-
-    def embed_pdf(self, pdf_path: Path) -> EmbeddingResponse:
-        """Generate embeddings for a PDF file using ColPali API."""
-        images = pdf_to_images(pdf_path)
-        return self.embed_image(images)
embedkit-0.1.4/src/embedkit/utils.py (deleted)

@@ -1,48 +0,0 @@
-from pdf2image import convert_from_path
-from pathlib import Path
-from .config import get_temp_dir
-from typing import Union
-
-
-def pdf_to_images(pdf_path: Path) -> list[Path]:
-    """Convert a PDF file to a list of images."""
-    root_temp_dir = get_temp_dir()
-    img_temp_dir = root_temp_dir / "images"
-    img_temp_dir.mkdir(parents=True, exist_ok=True)
-    images = convert_from_path(pdf_path=str(pdf_path), output_folder=str(img_temp_dir))
-    image_paths = []
-
-    for i, image in enumerate(images):
-        output_path = img_temp_dir / f"{pdf_path.stem}_{i}.png"
-        if output_path.exists():
-            output_path.unlink()
-
-        image.save(output_path)
-        image_paths.append(output_path)
-    return image_paths
-
-
-def image_to_base64(image_path: Union[str, Path]):
-    import base64
-
-    try:
-        base64_only = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
-    except Exception as e:
-        raise ValueError(f"Failed to read image {image_path}: {e}") from e
-
-    if isinstance(image_path, Path):
-        image_path_str = str(image_path)
-
-    if image_path_str.lower().endswith(".png"):
-        content_type = "image/png"
-    elif image_path_str.lower().endswith((".jpg", ".jpeg")):
-        content_type = "image/jpeg"
-    elif image_path_str.lower().endswith(".gif"):
-        content_type = "image/gif"
-    else:
-        raise ValueError(
-            f"Unsupported image format for {image_path}; expected .png, .jpg, .jpeg, or .gif"
-        )
-    base64_image = f"data:{content_type};base64,{base64_only}"
-
-    return base64_image