embedkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
embedkit/__init__.py ADDED
@@ -0,0 +1,117 @@
1
+ # ./src/embedkit/__init__.py
2
+ """
3
+ EmbedKit: A unified toolkit for generating vector embeddings.
4
+ """
5
+
6
+ from typing import Union, List, Optional
7
+ from pathlib import Path
8
+ import numpy as np
9
+
10
+ from .models import Model
11
+ from .base import EmbeddingError, EmbeddingResult
12
+ from .providers import ColPaliProvider, CohereProvider
13
+ from .providers.cohere import CohereInputType
14
+
15
+
16
class EmbedKit:
    """Main interface for generating embeddings.

    An ``EmbedKit`` wraps a single provider object and forwards every
    ``embed_*`` call to it. Instances are normally created through the
    ``colpali`` / ``cohere`` factory classmethods rather than by calling the
    constructor directly.
    """

    def __init__(self, provider_instance):
        """
        Initialize EmbedKit with a provider instance.

        Args:
            provider_instance: An initialized provider (use class methods to create)
        """
        self._provider = provider_instance

    @classmethod
    def colpali(
        cls,
        model: Model.ColPali = Model.ColPali.V1_3,
        device: Optional[str] = None,
    ):
        """
        Create EmbedKit instance with ColPali provider.

        Args:
            model: ColPali model enum
            device: Device to run on ('cuda', 'mps', 'cpu', or None for auto-detect)

        Returns:
            An EmbedKit wrapping a ColPaliProvider.

        Raises:
            ValueError: If ``model`` is not a supported ColPali model.
        """
        # The enum value is a short label; the provider needs the full
        # checkpoint identifier, so map it explicitly.
        if model == Model.ColPali.V1_3:
            model_name = "vidore/colpali-v1.3"
        else:
            raise ValueError(f"Unsupported model: {model}")

        provider = ColPaliProvider(model_name=model_name, device=device)
        return cls(provider)

    @classmethod
    def cohere(
        cls,
        api_key: str,
        model: Model.Cohere = Model.Cohere.EMBED_V4_0,
        text_input_type: CohereInputType = CohereInputType.SEARCH_DOCUMENT,
    ):
        """
        Create EmbedKit instance with Cohere provider.

        Args:
            api_key: Cohere API key
            model: Cohere model enum
            text_input_type: Type of input for text embedding
                (search_document or search_query)

        Returns:
            An EmbedKit wrapping a CohereProvider.

        Raises:
            ValueError: If ``api_key`` is empty or ``model`` is not a
                supported Cohere model.
        """
        if not api_key:
            raise ValueError("API key is required")

        if model == Model.Cohere.EMBED_V4_0:
            model_name = "embed-v4.0"
        else:
            raise ValueError(f"Unsupported model: {model}")

        provider = CohereProvider(
            api_key=api_key, model_name=model_name, text_input_type=text_input_type
        )
        return cls(provider)

    # TODO: add further factory classmethods following the same pattern, e.g.
    #   openai(api_key, model_name="text-embedding-3-large")
    #   huggingface(model_name="all-MiniLM-L6-v2", device=None)

    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
        """Generate document text embeddings using the configured provider.

        Args:
            texts: Text or list of texts to embed
            **kwargs: Additional provider-specific arguments

        Returns:
            EmbeddingResult containing the embeddings
        """
        return self._provider.embed_text(texts, **kwargs)

    def embed_image(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> EmbeddingResult:
        """Generate image embeddings using the configured provider.

        Args:
            images: Path to an image file, or a list of such paths

        Returns:
            EmbeddingResult containing the embeddings
        """
        return self._provider.embed_image(images)

    def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
        """Generate image embeddings from a PDF using the configured provider.

        Args:
            pdf: Path to a single PDF file

        Returns:
            EmbeddingResult containing the embeddings
        """
        return self._provider.embed_pdf(pdf)

    @property
    def provider_info(self) -> str:
        """Get information about the current provider (its class name)."""
        return self._provider.__class__.__name__
113
+
114
+
115
# Main exports
__version__ = "0.1.0"
# EmbeddingResult is what every embed_* call returns and CohereInputType is an
# argument of EmbedKit.cohere(), so both are part of the public surface.
__all__ = [
    "EmbedKit",
    "Model",
    "EmbeddingError",
    "EmbeddingResult",
    "CohereInputType",
]
embedkit/base.py ADDED
@@ -0,0 +1,49 @@
1
+ # ./src/embedkit/base.py
2
+ """Base classes for EmbedKit."""
3
+
4
+ from abc import ABC, abstractmethod
5
+ from typing import Union, List
6
+ from pathlib import Path
7
+ import numpy as np
8
+ from dataclasses import dataclass
9
+
10
+
11
@dataclass
class EmbeddingResult:
    """Embeddings returned by a provider, together with their provenance."""

    # Embedding values produced by the provider.
    embeddings: np.ndarray
    # Name of the model that produced the embeddings.
    model_name: str
    # Name of the provider that produced the embeddings.
    model_provider: str
    # Kind of input that was embedded, as reported by the provider.
    input_type: str

    @property
    def shape(self) -> tuple:
        """Shape of the ``embeddings`` array."""
        array = self.embeddings
        return array.shape
21
+
22
+
23
class EmbeddingProvider(ABC):
    """Abstract interface that every embedding provider must implement."""

    @abstractmethod
    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
        """Embed a single text or a list of texts.

        Args:
            texts: Text or list of texts to embed.
            **kwargs: Additional provider-specific arguments.
        """

    @abstractmethod
    def embed_image(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> EmbeddingResult:
        """Embed a single image path or a list of image paths."""

    @abstractmethod
    def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
        """Embed the contents of a PDF as images. Takes a single PDF file."""
44
+
45
+
46
class EmbeddingError(Exception):
    """Root of the package's exception hierarchy.

    Raised for any embedding-related failure, so callers can catch this one
    type.
    """
embedkit/config.py ADDED
@@ -0,0 +1,8 @@
1
+ from pathlib import Path
2
+
3
# Relative path: it resolves against the process's current working directory.
TEMP_DIR = Path("tmp")


def get_temp_dir() -> Path:
    """Return the directory used for temporary files (``TEMP_DIR``)."""
    temp_dir = TEMP_DIR
    return temp_dir
embedkit/models.py ADDED
@@ -0,0 +1,12 @@
1
+ # ./src/embedkit/models.py
2
+ """Model definitions and enum for EmbedKit."""
3
+
4
+ from enum import Enum
5
+
6
+
7
class Model:
    """Namespace grouping the supported model enums by provider."""

    class ColPali(Enum):
        """Supported ColPali models."""

        V1_3 = "colpali-v1.3"

    class Cohere(Enum):
        """Supported Cohere models."""

        EMBED_V4_0 = "embed-v4.0"
@@ -0,0 +1,7 @@
1
+ # ./src/embedkit/providers/__init__.py
2
+ """Embedding providers for EmbedKit."""
3
+
4
+ from .colpali import ColPaliProvider
5
+ from .cohere import CohereProvider
6
+
7
# Provider classes re-exported by this package.
__all__ = [
    "ColPaliProvider",
    "CohereProvider",
]
@@ -0,0 +1,141 @@
1
+ # ./src/embedkit/providers/cohere.py
2
+ """Cohere embedding provider."""
3
+
4
+ from typing import Union, List
5
+ from pathlib import Path
6
+ import numpy as np
7
+ from enum import Enum
8
+
9
+ from ..utils import pdf_to_images
10
+ from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResult
11
+
12
+
13
class CohereInputType(Enum):
    """Input types accepted for Cohere text embedding requests.

    Each member's value is the string sent as ``input_type`` to the API.
    """

    SEARCH_DOCUMENT = "search_document"
    SEARCH_QUERY = "search_query"
18
+
19
+
20
class CohereProvider(EmbeddingProvider):
    """Cohere embedding provider for text, image and PDF embeddings.

    The Cohere client is created lazily on first use, so constructing the
    provider neither imports ``cohere`` nor contacts the API.
    """

    # Recognised file-name endings (lower-case) and the MIME type used for
    # each when building the base64 data URI sent to the API.
    _IMAGE_CONTENT_TYPES = (
        ((".png",), "image/png"),
        ((".jpg", ".jpeg"), "image/jpeg"),
        ((".gif",), "image/gif"),
    )

    def __init__(
        self,
        api_key: str,
        model_name: str,
        text_input_type: CohereInputType = CohereInputType.SEARCH_DOCUMENT,
    ):
        """
        Args:
            api_key: Cohere API key.
            model_name: Model identifier passed to the Cohere API.
            text_input_type: Input type used for text embedding requests.
        """
        self.api_key = api_key
        self.model_name = model_name
        self.input_type = text_input_type
        self._client = None
        self.provider_name = "Cohere"

    def _get_client(self):
        """Lazy load the Cohere client.

        Raises:
            EmbeddingError: If ``cohere`` is not installed or the client
                cannot be created.
        """
        if self._client is None:
            try:
                import cohere

                self._client = cohere.ClientV2(api_key=self.api_key)
            except ImportError as e:
                raise EmbeddingError(
                    "Cohere not installed. Run: pip install cohere"
                ) from e
            except Exception as e:
                raise EmbeddingError(f"Failed to initialize Cohere client: {e}") from e
        return self._client

    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
        """Generate text embeddings using the Cohere API.

        Args:
            texts: Text or list of texts to embed.
            **kwargs: Accepted for interface compatibility; currently not
                forwarded to the API.

        Returns:
            EmbeddingResult whose ``input_type`` is the configured text
            input type.

        Raises:
            EmbeddingError: If the request or response handling fails.
        """
        client = self._get_client()

        if isinstance(texts, str):
            texts = [texts]

        try:
            response = client.embed(
                texts=texts,
                model=self.model_name,
                input_type=self.input_type.value,
                embedding_types=["float"],
            )

            return EmbeddingResult(
                embeddings=np.array(response.embeddings.float_),
                model_name=self.model_name,
                model_provider=self.provider_name,
                input_type=self.input_type.value,
            )

        except Exception as e:
            raise EmbeddingError(f"Failed to embed text with Cohere: {e}") from e

    @classmethod
    def _to_data_uri(cls, image: Union[Path, str]) -> str:
        """Read one image file and return it as a base64 ``data:`` URI."""
        import base64

        if not isinstance(image, (Path, str)):
            raise EmbeddingError(f"Unsupported image type: {type(image)}")

        try:
            encoded = base64.b64encode(Path(image).read_bytes()).decode("utf-8")
        except Exception as e:
            raise EmbeddingError(f"Failed to read image {image}: {e}") from e

        # The content type is inferred from the file name only.
        lowered = str(image).lower()
        for endings, content_type in cls._IMAGE_CONTENT_TYPES:
            if lowered.endswith(endings):
                return f"data:{content_type};base64,{encoded}"

        raise EmbeddingError(
            f"Unsupported image format for {image}; expected .png, .jpg, .jpeg, or .gif"
        )

    def embed_image(
        self,
        images: Union[Path, str, List[Union[Path, str]]],
    ) -> EmbeddingResult:
        """Generate embeddings for images using Cohere API.

        Args:
            images: Path to an image file, or a list of such paths. Only
                .png, .jpg, .jpeg and .gif files are accepted.

        Returns:
            EmbeddingResult with ``input_type`` set to ``"image"``.

        Raises:
            EmbeddingError: If an image cannot be read, has an unsupported
                type or format, or the API request fails.
        """
        client = self._get_client()
        input_type = "image"

        if isinstance(images, (str, Path)):
            images = [images]

        try:
            b64_images = [self._to_data_uri(image) for image in images]

            response = client.embed(
                model=self.model_name,
                input_type=input_type,
                images=b64_images,
                embedding_types=["float"],
            )

            return EmbeddingResult(
                embeddings=np.array(response.embeddings.float_),
                model_name=self.model_name,
                model_provider=self.provider_name,
                input_type=input_type,
            )

        except EmbeddingError:
            # Already a specific, user-facing error; re-wrapping it would
            # only prepend a second, generic message.
            raise
        except Exception as e:
            raise EmbeddingError(f"Failed to embed image with Cohere: {e}") from e

    def embed_pdf(self, pdf_path: Union[Path, str]) -> EmbeddingResult:
        """Generate embeddings for a PDF file using Cohere API.

        The PDF is first converted to images, which are then embedded with
        ``embed_image``.

        Args:
            pdf_path: Path to a single PDF file.
        """
        # Coerce so that plain string paths work with the Path-based helper.
        image_paths = pdf_to_images(Path(pdf_path))
        return self.embed_image(image_paths)
@@ -0,0 +1,121 @@
1
+ # ./src/embedkit/providers/colpali.py
2
+ """ColPali embedding provider."""
3
+
4
+ from typing import Union, List, Optional
5
+ from pathlib import Path
6
+ import logging
7
+ import numpy as np
8
+ import torch
9
+ from PIL import Image
10
+
11
+ from ..utils import pdf_to_images
12
+ from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResult
13
+
14
logger = logging.getLogger(__name__)


class ColPaliProvider(EmbeddingProvider):
    """ColPali embedding provider for document understanding.

    The model and processor are loaded lazily on the first embedding call.
    """

    def __init__(self, model_name: str, device: Optional[str] = None):
        """
        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            device: Device to run on; when None, picks 'cuda', then 'mps',
                then 'cpu' depending on availability.
        """
        self.model_name = model_name
        self.provider_name = "ColPali"

        # Auto-detect device
        if device is None:
            if torch.cuda.is_available():
                device = "cuda"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"

        self.device = device
        self._model = None
        self._processor = None

    def _load_model(self):
        """Lazy load the model and its processor.

        Raises:
            EmbeddingError: If ``colpali-engine`` is not installed or loading
                fails.
        """
        if self._model is not None and self._processor is not None:
            return

        try:
            from colpali_engine.models import ColPali, ColPaliProcessor

            model = ColPali.from_pretrained(
                self.model_name,
                torch_dtype=torch.bfloat16,
                device_map=self.device,
            ).eval()
            processor = ColPaliProcessor.from_pretrained(self.model_name)

        except ImportError as e:
            raise EmbeddingError(
                "ColPali not installed. Run: pip install colpali-engine"
            ) from e
        except Exception as e:
            raise EmbeddingError(f"Failed to load model: {e}") from e

        # Publish both only after both loaded, so a processor failure cannot
        # leave a model without a processor behind for later calls.
        self._model = model
        self._processor = processor
        logger.info("Loaded ColPali model on %s", self.device)

    def _forward(self, processed, input_type: str) -> EmbeddingResult:
        """Run the model on processed inputs and wrap the output."""
        with torch.no_grad():
            embeddings = self._model(**processed)

        return EmbeddingResult(
            # The model is loaded in bfloat16, which NumPy cannot represent,
            # hence the cast via .float() before .numpy().
            embeddings=embeddings.cpu().float().numpy(),
            model_name=self.model_name,
            model_provider=self.provider_name,
            input_type=input_type,
        )

    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
        """Generate embeddings for text inputs.

        Texts are prepared with the processor's ``process_queries``.

        Args:
            texts: Text or list of texts to embed.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            EmbeddingResult with ``input_type`` set to ``"text"``.

        Raises:
            EmbeddingError: If loading the model or embedding fails.
        """
        self._load_model()

        if isinstance(texts, str):
            texts = [texts]

        try:
            processed = self._processor.process_queries(texts).to(self.device)
            return self._forward(processed, "text")

        except Exception as e:
            raise EmbeddingError(f"Failed to embed text: {e}") from e

    def embed_image(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> EmbeddingResult:
        """Generate embeddings for images.

        Args:
            images: Path to an image file, or a list of such paths.

        Returns:
            EmbeddingResult with ``input_type`` set to ``"image"``.

        Raises:
            EmbeddingError: If an image does not exist, or loading the model
                or embedding fails.
        """
        self._load_model()

        if isinstance(images, (str, Path)):
            images = [Path(images)]
        else:
            images = [Path(img) for img in images]

        try:
            pil_images = []
            for img_path in images:
                if not img_path.exists():
                    raise EmbeddingError(f"Image not found: {img_path}")

                with Image.open(img_path) as img:
                    pil_images.append(img.convert("RGB"))

            processed = self._processor.process_images(pil_images).to(self.device)
            return self._forward(processed, "image")

        except EmbeddingError:
            # Already specific (e.g. "Image not found"); do not re-wrap.
            raise
        except Exception as e:
            raise EmbeddingError(f"Failed to embed images: {e}") from e

    def embed_pdf(self, pdf_path: Union[Path, str]) -> EmbeddingResult:
        """Generate embeddings for a PDF file using ColPali.

        The PDF is first converted to images, which are then embedded with
        ``embed_image``.

        Args:
            pdf_path: Path to a single PDF file.
        """
        # Coerce so that plain string paths work with the Path-based helper.
        images = pdf_to_images(Path(pdf_path))
        return self.embed_image(images)
embedkit/utils.py ADDED
@@ -0,0 +1,21 @@
1
+ from pdf2image import convert_from_path
2
+ from pathlib import Path
3
+ from .config import get_temp_dir
4
+
5
+
6
def pdf_to_images(pdf_path: Path | str) -> list[Path]:
    """Convert a PDF file to a list of PNG images.

    Images are written to ``<temp dir>/images`` as ``<pdf stem>_<index>.png``,
    with the index following the order returned by ``convert_from_path``.

    Args:
        pdf_path: Path to the PDF file; a plain string is accepted as well.

    Returns:
        Paths of the PNG files that were written, in order.
    """
    # Callers may pass a string path; normalise so that .stem is available.
    pdf_path = Path(pdf_path)

    img_temp_dir = get_temp_dir() / "images"
    img_temp_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): output_folder presumably makes pdf2image write its own
    # intermediate files into this directory too, and nothing here removes
    # them - confirm whether they should be cleaned up.
    images = convert_from_path(pdf_path=str(pdf_path), output_folder=str(img_temp_dir))

    image_paths = []
    for i, image in enumerate(images):
        output_path = img_temp_dir / f"{pdf_path.stem}_{i}.png"
        # Drop any stale file left by a previous run before saving.
        output_path.unlink(missing_ok=True)

        image.save(output_path)
        image_paths.append(output_path)
    return image_paths
@@ -0,0 +1,59 @@
1
+ Metadata-Version: 2.4
2
+ Name: embedkit
3
+ Version: 0.1.0
4
+ Summary: A simple toolkit for generating vector embeddings across multiple providers and models
5
+ Author-email: JP Hwang <me@jphwang.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: ai,cohere,colpali,embeddings,machine-learning,vector
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: accelerate>=1.7.0
20
+ Requires-Dist: cohere>=5.15.0
21
+ Requires-Dist: colpali-engine<0.4.0,>=0.3.0
22
+ Requires-Dist: pdf2image>=1.17.0
23
+ Requires-Dist: pillow>=11.2.1
24
+ Requires-Dist: torch<=2.5
25
+ Requires-Dist: transformers
26
+ Description-Content-Type: text/markdown
27
+
28
+ # EmbedKit
29
+
30
+ A Python library for generating embeddings from text, images, and PDFs using various models (e.g. from Cohere, ColPali).
31
+
32
+ ## Usage
33
+
34
+ See [main.py](main.py) for examples.
35
+
36
+ ```python
37
+ from embedkit import EmbedKit
38
+ from embedkit.models import Model
+ from embedkit.providers.cohere import CohereInputType
39
+
40
+ # Instantiate a kit
41
+ # Using ColPali
42
+ kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
43
+
44
+ # Using Cohere
45
+ kit = EmbedKit.cohere(
46
+ model=Model.Cohere.EMBED_V4_0,
47
+ api_key="your_api_key",
48
+ text_input_type=CohereInputType.SEARCH_DOCUMENT,
49
+ )
50
+
51
+ # Then - the embedding API is consistent
52
+ embeddings = kit.embed_text("Hello world")  # or pass a list: kit.embed_text(["Hello world", "Hello world"])
53
+ embeddings = kit.embed_image("path/to/image.png")  # or pass a list: kit.embed_image(["path/to/image1.png", "path/to/image2.png"])
54
+ embeddings = kit.embed_pdf("path/to/pdf.pdf") # Single PDF only
55
+ ```
56
+
57
+ ## License
58
+
59
+ MIT
@@ -0,0 +1,12 @@
1
+ embedkit/__init__.py,sha256=vm_dF7i_EGQsNEgBn7WPq-Vbo1xTnqV2devUvY18Z5E,3862
2
+ embedkit/base.py,sha256=ZwCeDnJXVsVVT5l7ybpP5wG2ZU9e19XgV3c9OJp9z2o,1233
3
+ embedkit/config.py,sha256=EVGODSKxQAr46bU8dyORFunsfRuj6dnvtSqa4MxUZCo,138
4
+ embedkit/models.py,sha256=EBIYkyZeIhGaOPL-9bslHHdLaZ7qzOYLd0qxVZ7VX7w,226
5
+ embedkit/utils.py,sha256=TyFyDk6tMx-PaVotixSdJDx8U3JgrPi9nV2j-rW-clw,705
6
+ embedkit/providers/__init__.py,sha256=HaS-HNQabvhn9xLNZCq3VUqPCb7rGG4pvgvpKP4AXcw,201
7
+ embedkit/providers/cohere.py,sha256=u6zoAjXKkjaVfTZk1VgjwRqtQ7Bea1odlVBKWomB_1A,4737
8
+ embedkit/providers/colpali.py,sha256=20YAEeTvkNoexax-KhU7lWjJBdWRHPzE4Zf-6XpP3v0,3896
9
+ embedkit-0.1.0.dist-info/METADATA,sha256=18DAz2h--FOgMSO3VNgm9ZXENSXK9IsVkEYm-xb2a3c,1893
10
+ embedkit-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ embedkit-0.1.0.dist-info/licenses/LICENSE,sha256=-g2Rad7b3rb2oVwOTwfMOIpscHT1zuaJoguamLRCBJs,1072
12
+ embedkit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright © 2025 JP Hwang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.