PyPI - embedkit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

embedkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

embedkit/__init__.py +117 -0
embedkit/base.py +49 -0
embedkit/config.py +8 -0
embedkit/models.py +12 -0
embedkit/providers/__init__.py +7 -0
embedkit/providers/cohere.py +141 -0
embedkit/providers/colpali.py +121 -0
embedkit/utils.py +21 -0
embedkit-0.1.0.dist-info/METADATA +59 -0
embedkit-0.1.0.dist-info/RECORD +12 -0
embedkit-0.1.0.dist-info/WHEEL +4 -0
embedkit-0.1.0.dist-info/licenses/LICENSE +9 -0

embedkit/__init__.py ADDED Viewed

@@ -0,0 +1,117 @@
+# ./src/embedkit/__init__.py
+"""
+EmbedKit: A unified toolkit for generating vector embeddings.
+"""
+from typing import Union, List, Optional
+from pathlib import Path
+import numpy as np
+from .models import Model
+from .base import EmbeddingError, EmbeddingResult
+from .providers import ColPaliProvider, CohereProvider
+from .providers.cohere import CohereInputType
+class EmbedKit:
+    """Main interface for generating embeddings.
+    An instance wraps one provider object and delegates every ``embed_*`` call
+    to it. Use the ``colpali`` / ``cohere`` class methods to build an instance.
+    """
+    def __init__(self, provider_instance):
+        """
+        Initialize EmbedKit with a provider instance.
+        Args:
+            provider_instance: An initialized provider (use class methods to create)
+        """
+        self._provider = provider_instance
+    @classmethod
+    def colpali(
+        cls, model: Model.ColPali = Model.ColPali.V1_3, device: Optional[str] = None
+    ):
+        """
+        Create EmbedKit instance with ColPali provider.
+        Args:
+            model: ColPali model enum
+            device: Device to run on ('cuda', 'mps', 'cpu', or None for auto-detect)
+        Raises:
+            ValueError: If ``model`` is not a supported ColPali model.
+        """
+        if model == Model.ColPali.V1_3:
+            model_name = "vidore/colpali-v1.3"
+        else:
+            raise ValueError(f"Unsupported model: {model}")
+        provider = ColPaliProvider(model_name=model_name, device=device)
+        return cls(provider)
+    @classmethod
+    def cohere(
+        cls,
+        api_key: str,
+        model: Model.Cohere = Model.Cohere.EMBED_V4_0,
+        text_input_type: CohereInputType = CohereInputType.SEARCH_DOCUMENT,
+    ):
+        """
+        Create EmbedKit instance with Cohere provider.
+        Args:
+            api_key: Cohere API key
+            model: Cohere model enum
+            text_input_type: Type of input for text embedding (search_document or search_query)
+        Raises:
+            ValueError: If ``api_key`` is empty or ``model`` is not a supported Cohere model.
+        """
+        if not api_key:
+            raise ValueError("API key is required")
+        if model == Model.Cohere.EMBED_V4_0:
+            model_name = "embed-v4.0"
+        else:
+            raise ValueError(f"Unsupported model: {model}")
+        provider = CohereProvider(
+            api_key=api_key, model_name=model_name, text_input_type=text_input_type
+        )
+        return cls(provider)
+    # Future class methods:
+    # @classmethod
+    # def openai(cls, api_key: str, model_name: str = "text-embedding-3-large"):
+    #     """Create EmbedKit instance with OpenAI provider."""
+    #     provider = OpenAIProvider(api_key=api_key, model_name=model_name)
+    #     return cls(provider)
+    #
+    # @classmethod
+    # def huggingface(cls, model_name: str = "all-MiniLM-L6-v2", device: Optional[str] = None):
+    #     """Create EmbedKit instance with HuggingFace provider."""
+    #     provider = HuggingFaceProvider(model_name=model_name, device=device)
+    #     return cls(provider)
+    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
+        """Generate document text embeddings using the configured provider.
+        Args:
+            texts: Text or list of texts to embed
+            **kwargs: Additional provider-specific arguments
+        Returns:
+            EmbeddingResult containing the embeddings
+        """
+        return self._provider.embed_text(texts, **kwargs)
+    def embed_image(
+        self, images: Union[Path, str, List[Union[Path, str]]]
+    ) -> EmbeddingResult:
+        """Generate image embeddings using the configured provider.
+        Args:
+            images: Path (or list of paths) of the image file(s) to embed
+        Returns:
+            EmbeddingResult containing the embeddings
+        """
+        return self._provider.embed_image(images)
+    def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
+        """Generate image embeddings from a PDF using the configured provider.
+        Takes a single PDF file.
+        Args:
+            pdf: Path of the PDF file, as a ``Path`` or a plain string
+        Returns:
+            EmbeddingResult containing the embeddings
+        """
+        # The PDF helper builds output file names from ``Path.stem``, so a plain
+        # string path must be normalised before it is handed to the provider.
+        return self._provider.embed_pdf(Path(pdf))
+    @property
+    def provider_info(self) -> str:
+        """Get information about the current provider (its class name)."""
+        return self._provider.__class__.__name__
+# Main exports
+__version__ = "0.1.0"
+__all__ = ["EmbedKit", "Model", "EmbeddingError"]

embedkit/base.py ADDED Viewed

@@ -0,0 +1,49 @@
+# ./src/embedkit/base.py
+"""Base classes for EmbedKit."""
+from abc import ABC, abstractmethod
+from typing import Union, List
+from pathlib import Path
+import numpy as np
+from dataclasses import dataclass
+@dataclass
+class EmbeddingResult:
+    """Embeddings returned by a provider, plus a description of their origin."""
+    # Array of embedding values produced by the provider.
+    embeddings: np.ndarray
+    # Name of the model that produced the embeddings.
+    model_name: str
+    # Name of the provider that served the model.
+    model_provider: str
+    # Label for the kind of input that was embedded.
+    input_type: str
+    @property
+    def shape(self) -> tuple:
+        """Shape of the underlying ``embeddings`` array."""
+        array = self.embeddings
+        return array.shape
+class EmbeddingProvider(ABC):
+    """Abstract interface that every embedding provider implements."""
+    @abstractmethod
+    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
+        """Embed a single text or a list of texts."""
+        ...
+    @abstractmethod
+    def embed_image(
+        self, images: Union[Path, str, List[Union[Path, str]]]
+    ) -> EmbeddingResult:
+        """Embed a single image path or a list of image paths."""
+        ...
+    @abstractmethod
+    def embed_pdf(
+        self, pdf: Union[Path, str]
+    ) -> EmbeddingResult:
+        """Embed the contents of a single PDF file as images."""
+        ...
+class EmbeddingError(Exception):
+    """Root exception type for failures raised while producing embeddings."""

embedkit/config.py ADDED Viewed

@@ -0,0 +1,8 @@
+from pathlib import Path
+# Root directory for temporary files. The path is relative, so it resolves
+# against the current working directory of the running process.
+TEMP_DIR = Path("tmp")
+def get_temp_dir() -> Path:
+    """Return the configured temporary directory (``TEMP_DIR``)."""
+    temp_root = TEMP_DIR
+    return temp_root

embedkit/models.py ADDED Viewed

@@ -0,0 +1,12 @@
+# ./src/embedkit/models.py
+"""Model definitions and enum for EmbedKit."""
+from enum import Enum
+class Model:
+    """Namespace that groups the supported model enums by provider."""
+    class ColPali(Enum):
+        """ColPali model identifiers."""
+        V1_3 = "colpali-v1.3"
+    class Cohere(Enum):
+        """Cohere model identifiers."""
+        EMBED_V4_0 = "embed-v4.0"

embedkit/providers/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+# ./src/embedkit/providers/__init__.py
+"""Embedding providers for EmbedKit."""
+from .colpali import ColPaliProvider
+from .cohere import CohereProvider
+__all__ = ["ColPaliProvider", "CohereProvider"]

embedkit/providers/cohere.py ADDED Viewed

@@ -0,0 +1,141 @@
+# ./src/embedkit/providers/cohere.py
+"""Cohere embedding provider."""
+from typing import Union, List
+from pathlib import Path
+import numpy as np
+from enum import Enum
+from ..utils import pdf_to_images
+from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResult
+class CohereInputType(Enum):
+    """Values sent as ``input_type`` when embedding text with Cohere."""
+    # Text that will be stored and searched over.
+    SEARCH_DOCUMENT = "search_document"
+    # Text that is used as the search query.
+    SEARCH_QUERY = "search_query"
+class CohereProvider(EmbeddingProvider):
+    """Cohere embedding provider for text, image, and PDF inputs."""
+    # Lower-case file-name endings mapped to the MIME type used in the data URI.
+    _CONTENT_TYPES = {
+        ".png": "image/png",
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".gif": "image/gif",
+    }
+    def __init__(
+        self,
+        api_key: str,
+        model_name: str,
+        text_input_type: CohereInputType = CohereInputType.SEARCH_DOCUMENT,
+    ):
+        """
+        Store the connection settings; the API client is created on first use.
+        Args:
+            api_key: Cohere API key
+            model_name: Model identifier sent with every embed request
+            text_input_type: Input type used by ``embed_text``
+        """
+        self.api_key = api_key
+        self.model_name = model_name
+        self.input_type = text_input_type
+        self._client = None
+        self.provider_name = "Cohere"
+    def _get_client(self):
+        """Lazy load the Cohere client.
+        Raises:
+            EmbeddingError: If ``cohere`` is not installed or the client cannot be created.
+        """
+        if self._client is None:
+            try:
+                import cohere
+                self._client = cohere.ClientV2(api_key=self.api_key)
+            except ImportError as e:
+                raise EmbeddingError(
+                    "Cohere not installed. Run: pip install cohere"
+                ) from e
+            except Exception as e:
+                raise EmbeddingError(f"Failed to initialize Cohere client: {e}") from e
+        return self._client
+    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
+        """Generate text embeddings using the Cohere API.
+        Args:
+            texts: Text or list of texts to embed
+            **kwargs: Accepted for interface compatibility; not used by this provider
+        Returns:
+            EmbeddingResult whose ``input_type`` is the configured text input type
+        Raises:
+            EmbeddingError: If the client cannot be created or the request fails.
+        """
+        client = self._get_client()
+        if isinstance(texts, str):
+            texts = [texts]
+        try:
+            response = client.embed(
+                texts=texts,
+                model=self.model_name,
+                input_type=self.input_type.value,
+                embedding_types=["float"],
+            )
+            return EmbeddingResult(
+                embeddings=np.array(response.embeddings.float_),
+                model_name=self.model_name,
+                model_provider=self.provider_name,
+                input_type=self.input_type.value,
+            )
+        except Exception as e:
+            raise EmbeddingError(f"Failed to embed text with Cohere: {e}") from e
+    @classmethod
+    def _to_data_uri(cls, image: Union[Path, str]) -> str:
+        """Read one image file and return it as a base64 ``data:`` URI.
+        Raises:
+            EmbeddingError: If ``image`` is not a path, cannot be read, or does
+                not end in a supported extension.
+        """
+        import base64
+        if not isinstance(image, (Path, str)):
+            raise EmbeddingError(f"Unsupported image type: {type(image)}")
+        try:
+            base64_only = base64.b64encode(Path(image).read_bytes()).decode("utf-8")
+        except Exception as e:
+            raise EmbeddingError(f"Failed to read image {image}: {e}") from e
+        file_name = str(image).lower()
+        for ending, content_type in cls._CONTENT_TYPES.items():
+            if file_name.endswith(ending):
+                return f"data:{content_type};base64,{base64_only}"
+        raise EmbeddingError(
+            f"Unsupported image format for {image}; expected .png, .jpg, .jpeg, or .gif"
+        )
+    def embed_image(
+        self,
+        images: Union[Path, str, List[Union[Path, str]]],
+    ) -> EmbeddingResult:
+        """Generate embeddings for images using Cohere API.
+        Args:
+            images: Path (or list of paths) of .png, .jpg, .jpeg, or .gif files
+        Returns:
+            EmbeddingResult with ``input_type`` set to ``"image"``
+        Raises:
+            EmbeddingError: If a file cannot be read, has an unsupported
+                extension, or the request fails.
+        """
+        client = self._get_client()
+        input_type = "image"
+        if isinstance(images, (str, Path)):
+            images = [images]
+        try:
+            b64_images = [self._to_data_uri(image) for image in images]
+            response = client.embed(
+                model=self.model_name,
+                input_type=input_type,
+                images=b64_images,
+                embedding_types=["float"],
+            )
+            return EmbeddingResult(
+                embeddings=np.array(response.embeddings.float_),
+                model_name=self.model_name,
+                model_provider=self.provider_name,
+                input_type=input_type,
+            )
+        except EmbeddingError:
+            # Already a specific library error (unreadable or unsupported file);
+            # re-wrapping it would only duplicate the message.
+            raise
+        except Exception as e:
+            raise EmbeddingError(f"Failed to embed image with Cohere: {e}") from e
+    def embed_pdf(self, pdf_path: Union[Path, str]) -> EmbeddingResult:
+        """Generate embeddings for a PDF file using Cohere API.
+        The PDF is converted to page images, which are then embedded as images.
+        Args:
+            pdf_path: Path of the PDF file, as a ``Path`` or a plain string
+        """
+        # The conversion helper reads ``.stem``, so plain strings are wrapped first.
+        image_paths = pdf_to_images(Path(pdf_path))
+        return self.embed_image(image_paths)

embedkit/providers/colpali.py ADDED Viewed

@@ -0,0 +1,121 @@
+# ./src/embedkit/providers/colpali.py
+"""ColPali embedding provider."""
+from typing import Union, List, Optional
+from pathlib import Path
+import logging
+import numpy as np
+import torch
+from PIL import Image
+from ..utils import pdf_to_images
+from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResult
+logger = logging.getLogger(__name__)
+class ColPaliProvider(EmbeddingProvider):
+    """ColPali embedding provider for document understanding."""
+    def __init__(self, model_name: str, device: Optional[str] = None):
+        """
+        Store the model settings; the model itself is loaded on first use.
+        Args:
+            model_name: Identifier passed to ``from_pretrained``
+            device: Device to run on, or None to pick cuda, then mps, then cpu
+        """
+        self.model_name = model_name
+        self.provider_name = "ColPali"
+        # Auto-detect device
+        if device is None:
+            if torch.cuda.is_available():
+                device = "cuda"
+            elif torch.backends.mps.is_available():
+                device = "mps"
+            else:
+                device = "cpu"
+        self.device = device
+        self._model = None
+        self._processor = None
+    def _load_model(self):
+        """Lazy load the model and its processor.
+        Raises:
+            EmbeddingError: If ``colpali-engine`` is not installed or loading fails.
+        """
+        if self._model is not None and self._processor is not None:
+            return
+        try:
+            from colpali_engine.models import ColPali, ColPaliProcessor
+            model = ColPali.from_pretrained(
+                self.model_name,
+                torch_dtype=torch.bfloat16,
+                device_map=self.device,
+            ).eval()
+            processor = ColPaliProcessor.from_pretrained(self.model_name)
+        except ImportError as e:
+            raise EmbeddingError(
+                "ColPali not installed. Run: pip install colpali-engine"
+            ) from e
+        except Exception as e:
+            raise EmbeddingError(f"Failed to load model: {e}") from e
+        # Publish both objects together so a failed processor load cannot leave
+        # a model without a processor behind for later calls.
+        self._model, self._processor = model, processor
+        logger.info("Loaded ColPali model on %s", self.device)
+    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
+        """Generate embeddings for text inputs.
+        Args:
+            texts: Text or list of texts to embed
+            **kwargs: Accepted for interface compatibility; not used by this provider
+        Returns:
+            EmbeddingResult with ``input_type`` set to ``"text"``
+        Raises:
+            EmbeddingError: If the model cannot be loaded or embedding fails.
+        """
+        self._load_model()
+        if isinstance(texts, str):
+            texts = [texts]
+        try:
+            processed = self._processor.process_queries(texts).to(self.device)
+            with torch.no_grad():
+                embeddings = self._model(**processed)
+            return EmbeddingResult(
+                embeddings=embeddings.cpu().float().numpy(),
+                model_name=self.model_name,
+                model_provider=self.provider_name,
+                input_type="text",
+            )
+        except Exception as e:
+            raise EmbeddingError(f"Failed to embed text: {e}") from e
+    def embed_image(
+        self, images: Union[Path, str, List[Union[Path, str]]]
+    ) -> EmbeddingResult:
+        """Generate embeddings for images.
+        Args:
+            images: Path (or list of paths) of the image file(s) to embed
+        Returns:
+            EmbeddingResult with ``input_type`` set to ``"image"``
+        Raises:
+            EmbeddingError: If a file is missing, the model cannot be loaded,
+                or embedding fails.
+        """
+        self._load_model()
+        if isinstance(images, (str, Path)):
+            images = [Path(images)]
+        else:
+            images = [Path(img) for img in images]
+        try:
+            pil_images = []
+            for img_path in images:
+                if not img_path.exists():
+                    raise EmbeddingError(f"Image not found: {img_path}")
+                with Image.open(img_path) as img:
+                    pil_images.append(img.convert("RGB"))
+            processed = self._processor.process_images(pil_images).to(self.device)
+            with torch.no_grad():
+                embeddings = self._model(**processed)
+            return EmbeddingResult(
+                embeddings=embeddings.cpu().float().numpy(),
+                model_name=self.model_name,
+                model_provider=self.provider_name,
+                input_type="image",
+            )
+        except EmbeddingError:
+            # The "Image not found" error is already specific; do not re-wrap it.
+            raise
+        except Exception as e:
+            raise EmbeddingError(f"Failed to embed images: {e}") from e
+    def embed_pdf(self, pdf_path: Union[Path, str]) -> EmbeddingResult:
+        """Generate embeddings for a PDF file using ColPali.
+        The PDF is converted to page images, which are then embedded as images.
+        Args:
+            pdf_path: Path of the PDF file, as a ``Path`` or a plain string
+        """
+        # The conversion helper reads ``.stem``, so plain strings are wrapped first.
+        images = pdf_to_images(Path(pdf_path))
+        return self.embed_image(images)

embedkit/utils.py ADDED Viewed

@@ -0,0 +1,21 @@
+from pdf2image import convert_from_path
+from pathlib import Path
+from .config import get_temp_dir
+def pdf_to_images(pdf_path: Path | str) -> list[Path]:
+    """Convert a PDF file to a list of images.
+    Each page returned by ``convert_from_path`` is saved as
+    ``<pdf stem>_<page index>.png`` inside ``<temp dir>/images``.
+    Args:
+        pdf_path: Location of the PDF, as a ``Path`` or a plain string.
+    Returns:
+        Paths of the saved page images, in the order the pages were returned.
+    """
+    # Callers may pass a plain string; ``.stem`` below needs a Path.
+    pdf_path = Path(pdf_path)
+    img_temp_dir = get_temp_dir() / "images"
+    img_temp_dir.mkdir(parents=True, exist_ok=True)
+    images = convert_from_path(pdf_path=str(pdf_path), output_folder=str(img_temp_dir))
+    image_paths = []
+    for i, image in enumerate(images):
+        output_path = img_temp_dir / f"{pdf_path.stem}_{i}.png"
+        # Remove any file left by an earlier run before writing this page.
+        output_path.unlink(missing_ok=True)
+        image.save(output_path)
+        image_paths.append(output_path)
+    return image_paths

embedkit-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,59 @@
+Metadata-Version: 2.4
+Name: embedkit
+Version: 0.1.0
+Summary: A simple toolkit for generating vector embeddings across multiple providers and models
+Author-email: JP Hwang <me@jphwang.com>
+License: MIT
+License-File: LICENSE
+Keywords: ai,cohere,colpali,embeddings,machine-learning,vector
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Requires-Dist: accelerate>=1.7.0
+Requires-Dist: cohere>=5.15.0
+Requires-Dist: colpali-engine<0.4.0,>=0.3.0
+Requires-Dist: pdf2image>=1.17.0
+Requires-Dist: pillow>=11.2.1
+Requires-Dist: torch<=2.5
+Requires-Dist: transformers
+Description-Content-Type: text/markdown
+# EmbedKit
+A Python library for generating embeddings from text, images, and PDFs using various models (e.g. from Cohere, ColPali).
+## Usage
+See [main.py](main.py) for examples.
+```python
+from embedkit import EmbedKit
+from embedkit.models import Model
+from embedkit.providers.cohere import CohereInputType
+# Instantiate a kit
+# Using ColPali
+kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
+# Using Cohere
+kit = EmbedKit.cohere(
+    model=Model.Cohere.EMBED_V4_0,
+    api_key="your_api_key",
+    text_input_type=CohereInputType.SEARCH_DOCUMENT,
+)
+# Then - the embedding API is consistent
+# Text and image inputs may be a single item or a list
+embeddings = kit.embed_text("Hello world")
+embeddings = kit.embed_text(["Hello world", "Hello world"])
+embeddings = kit.embed_image("path/to/image.png")
+embeddings = kit.embed_image(["path/to/image1.png", "path/to/image2.png"])
+embeddings = kit.embed_pdf("path/to/pdf.pdf")  # Single PDF only
+```
+## License
+MIT

embedkit-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+embedkit/__init__.py,sha256=vm_dF7i_EGQsNEgBn7WPq-Vbo1xTnqV2devUvY18Z5E,3862
+embedkit/base.py,sha256=ZwCeDnJXVsVVT5l7ybpP5wG2ZU9e19XgV3c9OJp9z2o,1233
+embedkit/config.py,sha256=EVGODSKxQAr46bU8dyORFunsfRuj6dnvtSqa4MxUZCo,138
+embedkit/models.py,sha256=EBIYkyZeIhGaOPL-9bslHHdLaZ7qzOYLd0qxVZ7VX7w,226
+embedkit/utils.py,sha256=TyFyDk6tMx-PaVotixSdJDx8U3JgrPi9nV2j-rW-clw,705
+embedkit/providers/__init__.py,sha256=HaS-HNQabvhn9xLNZCq3VUqPCb7rGG4pvgvpKP4AXcw,201
+embedkit/providers/cohere.py,sha256=u6zoAjXKkjaVfTZk1VgjwRqtQ7Bea1odlVBKWomB_1A,4737
+embedkit/providers/colpali.py,sha256=20YAEeTvkNoexax-KhU7lWjJBdWRHPzE4Zf-6XpP3v0,3896
+embedkit-0.1.0.dist-info/METADATA,sha256=18DAz2h--FOgMSO3VNgm9ZXENSXK9IsVkEYm-xb2a3c,1893
+embedkit-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+embedkit-0.1.0.dist-info/licenses/LICENSE,sha256=-g2Rad7b3rb2oVwOTwfMOIpscHT1zuaJoguamLRCBJs,1072
+embedkit-0.1.0.dist-info/RECORD,,

embedkit-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any

embedkit-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,9 @@
+MIT License
+Copyright © 2025 JP Hwang
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.