embedkit 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: embedkit
3
+ Version: 0.1.4
4
+ Summary: A simple toolkit for generating vector embeddings across multiple providers and models
5
+ Author-email: JP Hwang <me@jphwang.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: ai,cohere,colpali,embeddings,machine-learning,vector
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: accelerate>=1.7.0
20
+ Requires-Dist: cohere>=5.15.0
21
+ Requires-Dist: colpali-engine<0.4.0,>=0.3.0
22
+ Requires-Dist: pdf2image>=1.17.0
23
+ Requires-Dist: pillow>=11.2.1
24
+ Requires-Dist: torch<=2.5
25
+ Requires-Dist: transformers
26
+ Description-Content-Type: text/markdown
27
+
28
+ # EmbedKit
29
+
30
+ A unified interface for text and image embeddings, supporting multiple providers.
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install embedkit
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ ### Text Embeddings
41
+
42
+ ```python
43
+ from embedkit import EmbedKit
44
+ from embedkit.classes import Model, CohereInputType
45
+
46
+ # Initialize with ColPali
47
+ kit = EmbedKit.colpali(
48
+ model=Model.ColPali.V1_3,
49
+ text_batch_size=16, # Optional: process text in batches of 16
50
+ image_batch_size=8, # Optional: process images in batches of 8
51
+ )
52
+
53
+ # Get embeddings
54
+ result = kit.embed_text("Hello world")
55
+ print(result.model_provider)
56
+ print(result.input_type)
57
+ print(result.objects[0].embedding.shape)
58
+ print(result.objects[0].source_b64)
59
+
60
+ # Initialize with Cohere
61
+ kit = EmbedKit.cohere(
62
+ model=Model.Cohere.EMBED_V4_0,
63
+ api_key="your-api-key",
64
+ text_input_type=CohereInputType.SEARCH_QUERY, # or SEARCH_DOCUMENT
65
+ text_batch_size=64, # Optional: process text in batches of 64
66
+ image_batch_size=8, # Optional: process images in batches of 8
67
+ )
68
+
69
+ # Get embeddings
70
+ result = kit.embed_text("Hello world")
71
+ print(result.model_provider)
72
+ print(result.input_type)
73
+ print(result.objects[0].embedding.shape)
74
+ print(result.objects[0].source_b64)
75
+ ```
76
+
77
+ ### Image Embeddings
78
+
79
+ ```python
80
+ from pathlib import Path
81
+
82
+ # Get embeddings for an image
83
+ image_path = Path("path/to/image.png")
84
+ result = kit.embed_image(image_path)
85
+
86
+ print(result.model_provider)
87
+ print(result.input_type)
88
+ print(result.objects[0].embedding.shape)
89
+ print(result.objects[0].source_b64)
90
+ ```
91
+
92
+ ### PDF Embeddings
93
+
94
+ ```python
95
+ from pathlib import Path
96
+
97
+ # Get embeddings for a PDF
98
+ pdf_path = Path("path/to/document.pdf")
99
+ result = kit.embed_pdf(pdf_path)
100
+
101
+ print(result.model_provider)
102
+ print(result.input_type)
103
+ print(result.objects[0].embedding.shape)
104
+ print(result.objects[0].source_b64)
105
+ ```
106
+
107
+ ## Response Format
108
+
109
+ The embedding methods return an `EmbeddingResponse` object with the following structure:
110
+
111
+ ```python
112
+ class EmbeddingResponse:
113
+ model_name: str
114
+ model_provider: str
115
+ input_type: str
116
+ objects: List[EmbeddingObject]
117
+
118
+ class EmbeddingObject:
119
+ embedding: np.ndarray
120
+ source_b64: Optional[str]
121
+ ```
122
+
123
+ ## Supported Models
124
+
125
+ ### ColPali
126
+ - `Model.ColPali.V1_3`
127
+
128
+ ### Cohere
129
+ - `Model.Cohere.EMBED_V4_0`
130
+
131
+ ## Requirements
132
+
133
+ - Python 3.10+
134
+
135
+ ## License
136
+
137
+ MIT
@@ -0,0 +1,110 @@
1
+ # EmbedKit
2
+
3
+ A unified interface for text and image embeddings, supporting multiple providers.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install embedkit
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### Text Embeddings
14
+
15
+ ```python
16
+ from embedkit import EmbedKit
17
+ from embedkit.classes import Model, CohereInputType
18
+
19
+ # Initialize with ColPali
20
+ kit = EmbedKit.colpali(
21
+ model=Model.ColPali.V1_3,
22
+ text_batch_size=16, # Optional: process text in batches of 16
23
+ image_batch_size=8, # Optional: process images in batches of 8
24
+ )
25
+
26
+ # Get embeddings
27
+ result = kit.embed_text("Hello world")
28
+ print(result.model_provider)
29
+ print(result.input_type)
30
+ print(result.objects[0].embedding.shape)
31
+ print(result.objects[0].source_b64)
32
+
33
+ # Initialize with Cohere
34
+ kit = EmbedKit.cohere(
35
+ model=Model.Cohere.EMBED_V4_0,
36
+ api_key="your-api-key",
37
+ text_input_type=CohereInputType.SEARCH_QUERY, # or SEARCH_DOCUMENT
38
+ text_batch_size=64, # Optional: process text in batches of 64
39
+ image_batch_size=8, # Optional: process images in batches of 8
40
+ )
41
+
42
+ # Get embeddings
43
+ result = kit.embed_text("Hello world")
44
+ print(result.model_provider)
45
+ print(result.input_type)
46
+ print(result.objects[0].embedding.shape)
47
+ print(result.objects[0].source_b64)
48
+ ```
49
+
50
+ ### Image Embeddings
51
+
52
+ ```python
53
+ from pathlib import Path
54
+
55
+ # Get embeddings for an image
56
+ image_path = Path("path/to/image.png")
57
+ result = kit.embed_image(image_path)
58
+
59
+ print(result.model_provider)
60
+ print(result.input_type)
61
+ print(result.objects[0].embedding.shape)
62
+ print(result.objects[0].source_b64)
63
+ ```
64
+
65
+ ### PDF Embeddings
66
+
67
+ ```python
68
+ from pathlib import Path
69
+
70
+ # Get embeddings for a PDF
71
+ pdf_path = Path("path/to/document.pdf")
72
+ result = kit.embed_pdf(pdf_path)
73
+
74
+ print(result.model_provider)
75
+ print(result.input_type)
76
+ print(result.objects[0].embedding.shape)
77
+ print(result.objects[0].source_b64)
78
+ ```
79
+
80
+ ## Response Format
81
+
82
+ The embedding methods return an `EmbeddingResponse` object with the following structure:
83
+
84
+ ```python
85
+ class EmbeddingResponse:
86
+ model_name: str
87
+ model_provider: str
88
+ input_type: str
89
+ objects: List[EmbeddingObject]
90
+
91
+ class EmbeddingObject:
92
+ embedding: np.ndarray
93
+ source_b64: Optional[str]
94
+ ```
95
+
96
+ ## Supported Models
97
+
98
+ ### ColPali
99
+ - `Model.ColPali.V1_3`
100
+
101
+ ### Cohere
102
+ - `Model.Cohere.EMBED_V4_0`
103
+
104
+ ## Requirements
105
+
106
+ - Python 3.10+
107
+
108
+ ## License
109
+
110
+ MIT
@@ -37,23 +37,24 @@ long_pdf = Path("tmp/2407.01449v6.pdf")
37
37
  kit = EmbedKit.colpali(model=Model.ColPali.V1_3, text_batch_size=16, image_batch_size=8)
38
38
 
39
39
  results = kit.embed_text("Hello world")
40
- assert results.shape[0] == 1
41
- assert len(results.shape) == 3
40
+ assert len(results.objects) == 1
41
+ assert len(results.objects[0].embedding.shape) == 2
42
+ assert results.objects[0].source_b64 == None
42
43
 
43
44
  results = kit.embed_image(sample_image)
44
- assert results.shape[0] == 1
45
- assert len(results.shape) == 3
46
- assert len(results.source_images_b64) > 0
45
+ assert len(results.objects) == 1
46
+ assert len(results.objects[0].embedding.shape) == 2
47
+ assert type(results.objects[0].source_b64) == str
47
48
 
48
49
  results = kit.embed_pdf(sample_pdf)
49
- assert results.shape[0] == 1
50
- assert len(results.shape) == 3
51
- assert len(results.source_images_b64) > 0
50
+ assert len(results.objects) == 1
51
+ assert len(results.objects[0].embedding.shape) == 2
52
+ assert type(results.objects[0].source_b64) == str
52
53
 
53
- results = kit.embed_pdf(long_pdf)
54
- assert results.shape[0] == 26
55
- assert len(results.shape) == 3
56
- assert len(results.source_images_b64) > 0
54
+ # results = kit.embed_pdf(long_pdf)
55
+ # assert len(results.objects) == 26
56
+ # assert len(results.objects[0].embedding.shape) == 2
57
+ # assert type(results.objects[0].source_b64) == str
57
58
 
58
59
 
59
60
  kit = EmbedKit.cohere(
@@ -65,8 +66,9 @@ kit = EmbedKit.cohere(
65
66
  )
66
67
 
67
68
  results = kit.embed_text("Hello world")
68
- assert results.shape[0] == 1
69
- assert len(results.shape) == 2
69
+ assert len(results.objects) == 1
70
+ assert len(results.objects[0].embedding.shape) == 1
71
+ assert results.objects[0].source_b64 == None
70
72
 
71
73
  kit = EmbedKit.cohere(
72
74
  model=Model.Cohere.EMBED_V4_0,
@@ -77,20 +79,21 @@ kit = EmbedKit.cohere(
77
79
  )
78
80
 
79
81
  results = kit.embed_text("Hello world")
80
- assert results.shape[0] == 1
81
- assert len(results.shape) == 2
82
+ assert len(results.objects) == 1
83
+ assert len(results.objects[0].embedding.shape) == 1
84
+ assert results.objects[0].source_b64 == None
82
85
 
83
86
  results = kit.embed_image(sample_image)
84
- assert results.shape[0] == 1
85
- assert len(results.shape) == 2
86
- assert len(results.source_images_b64) > 0
87
+ assert len(results.objects) == 1
88
+ assert len(results.objects[0].embedding.shape) == 1
89
+ assert type(results.objects[0].source_b64) == str
87
90
 
88
91
  results = kit.embed_pdf(sample_pdf)
89
- assert results.shape[0] == 1
90
- assert len(results.shape) == 2
91
- assert len(results.source_images_b64) > 0
92
-
93
- results = kit.embed_pdf(long_pdf)
94
- assert results.shape[0] == 26
95
- assert len(results.shape) == 2
96
- assert len(results.source_images_b64) > 0
92
+ assert len(results.objects) == 1
93
+ assert len(results.objects[0].embedding.shape) == 1
94
+ assert type(results.objects[0].source_b64) == str
95
+
96
+ # results = kit.embed_pdf(long_pdf)
97
+ # assert len(results.objects) == 1
98
+ # assert len(results.objects[0].embedding.shape) == 1
99
+ # assert type(results.objects[0].source_b64) == str
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "embedkit"
3
- version = "0.1.3"
3
+ version = "0.1.4"
4
4
  description = "A simple toolkit for generating vector embeddings across multiple providers and models"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
  import numpy as np
9
9
 
10
10
  from .models import Model
11
- from .base import EmbeddingError, EmbeddingResult
11
+ from .base import EmbeddingError, EmbeddingResponse
12
12
  from .providers import ColPaliProvider, CohereProvider
13
13
  from .providers.cohere import CohereInputType
14
14
 
@@ -84,8 +84,8 @@ class EmbedKit:
84
84
 
85
85
  provider = CohereProvider(
86
86
  api_key=api_key, model_name=model_name,
87
- text_batch_size=48,
88
- image_batch_size=8,
87
+ text_batch_size=text_batch_size,
88
+ image_batch_size=image_batch_size,
89
89
  text_input_type=text_input_type
90
90
  )
91
91
  return cls(provider)
@@ -103,7 +103,7 @@ class EmbedKit:
103
103
  # provider = HuggingFaceProvider(model_name=model_name, device=device)
104
104
  # return cls(provider)
105
105
 
106
- def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
106
+ def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
107
107
  """Generate document text embeddings using the configured provider.
108
108
 
109
109
  Args:
@@ -117,11 +117,11 @@ class EmbedKit:
117
117
 
118
118
  def embed_image(
119
119
  self, images: Union[Path, str, List[Union[Path, str]]]
120
- ) -> EmbeddingResult:
120
+ ) -> EmbeddingResponse:
121
121
  """Generate image embeddings using the configured provider."""
122
122
  return self._provider.embed_image(images)
123
123
 
124
- def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
124
+ def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResponse:
125
125
  """Generate image embeddings from PDFs using the configured provider. Takes a single PDF file."""
126
126
  return self._provider.embed_pdf(pdf)
127
127
 
@@ -9,35 +9,40 @@ from dataclasses import dataclass
9
9
 
10
10
 
11
11
  @dataclass
12
- class EmbeddingResult:
13
- embeddings: np.ndarray
12
+ class EmbeddingObject:
13
+ embedding: np.ndarray
14
+ source_b64: str = None
15
+
16
+
17
+ @dataclass
18
+ class EmbeddingResponse:
14
19
  model_name: str
15
20
  model_provider: str
16
21
  input_type: str
17
- source_images_b64: Optional[List[str]] = None
22
+ objects: List[EmbeddingObject]
18
23
 
19
24
  @property
20
25
  def shape(self) -> tuple:
21
- return self.embeddings.shape
26
+ return self.objects[0].embedding.shape
22
27
 
23
28
 
24
29
  class EmbeddingProvider(ABC):
25
30
  """Abstract base class for embedding providers."""
26
31
 
27
32
  @abstractmethod
28
- def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
33
+ def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
29
34
  """Generate document text embeddings using the configured provider."""
30
35
  pass
31
36
 
32
37
  @abstractmethod
33
38
  def embed_image(
34
39
  self, images: Union[Path, str, List[Union[Path, str]]]
35
- ) -> EmbeddingResult:
40
+ ) -> EmbeddingResponse:
36
41
  """Generate image embeddings using the configured provider."""
37
42
  pass
38
43
 
39
44
  @abstractmethod
40
- def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
45
+ def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResponse:
41
46
  """Generate image embeddings from PDFs using the configured provider. Takes a single PDF file."""
42
47
  pass
43
48
 
@@ -9,12 +9,12 @@ This module provides the main types and enums that users should interact with:
9
9
  - CohereInputType: Enum for Cohere's input types
10
10
  """
11
11
 
12
- from . import EmbeddingResult, EmbeddingError
12
+ from . import EmbeddingResponse, EmbeddingError
13
13
  from .models import Model
14
14
  from .providers.cohere import CohereInputType
15
15
 
16
16
  __all__ = [
17
- "EmbeddingResult",
17
+ "EmbeddingResponse",
18
18
  "EmbeddingError",
19
19
  "Model",
20
20
  "CohereInputType"
@@ -7,7 +7,7 @@ import numpy as np
7
7
  from enum import Enum
8
8
 
9
9
  from ..utils import pdf_to_images, image_to_base64
10
- from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResult
10
+ from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResponse, EmbeddingObject
11
11
 
12
12
 
13
13
  class CohereInputType(Enum):
@@ -51,7 +51,7 @@ class CohereProvider(EmbeddingProvider):
51
51
  raise EmbeddingError(f"Failed to initialize Cohere client: {e}") from e
52
52
  return self._client
53
53
 
54
- def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
54
+ def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
55
55
  """Generate text embeddings using the Cohere API."""
56
56
  client = self._get_client()
57
57
 
@@ -70,13 +70,17 @@ class CohereProvider(EmbeddingProvider):
70
70
  input_type=self.input_type.value,
71
71
  embedding_types=["float"],
72
72
  )
73
- all_embeddings.extend(response.embeddings.float_)
73
+ all_embeddings.extend(np.array(response.embeddings.float_))
74
74
 
75
- return EmbeddingResult(
76
- embeddings=np.array(all_embeddings),
75
+ return EmbeddingResponse(
77
76
  model_name=self.model_name,
78
77
  model_provider=self.provider_name,
79
78
  input_type=self.input_type.value,
79
+ objects=[
80
+ EmbeddingObject(
81
+ embedding=e,
82
+ ) for e in all_embeddings
83
+ ]
80
84
  )
81
85
 
82
86
  except Exception as e:
@@ -85,7 +89,7 @@ class CohereProvider(EmbeddingProvider):
85
89
  def embed_image(
86
90
  self,
87
91
  images: Union[Path, str, List[Union[Path, str]]],
88
- ) -> EmbeddingResult:
92
+ ) -> EmbeddingResponse:
89
93
  """Generate embeddings for images using Cohere API."""
90
94
  client = self._get_client()
91
95
  input_type = "image"
@@ -116,21 +120,25 @@ class CohereProvider(EmbeddingProvider):
116
120
  embedding_types=["float"],
117
121
  )
118
122
 
119
- all_embeddings.extend(response.embeddings.float_)
123
+ all_embeddings.extend(np.array(response.embeddings.float_))
120
124
  all_b64_images.extend(b64_images)
121
125
 
122
- return EmbeddingResult(
123
- embeddings=np.array(all_embeddings),
126
+ return EmbeddingResponse(
124
127
  model_name=self.model_name,
125
128
  model_provider=self.provider_name,
126
129
  input_type=input_type,
127
- source_images_b64=all_b64_images,
130
+ objects=[
131
+ EmbeddingObject(
132
+ embedding=all_embeddings[i],
133
+ source_b64=all_b64_images[i]
134
+ ) for i in range(len(all_embeddings))
135
+ ]
128
136
  )
129
137
 
130
138
  except Exception as e:
131
139
  raise EmbeddingError(f"Failed to embed image with Cohere: {e}") from e
132
140
 
133
- def embed_pdf(self, pdf_path: Path) -> EmbeddingResult:
141
+ def embed_pdf(self, pdf_path: Path) -> EmbeddingResponse:
134
142
  """Generate embeddings for a PDF file using Cohere API."""
135
143
  image_paths = pdf_to_images(pdf_path)
136
144
  return self.embed_image(image_paths)
@@ -9,7 +9,7 @@ import torch
9
9
  from PIL import Image
10
10
 
11
11
  from ..utils import pdf_to_images, image_to_base64
12
- from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResult
12
+ from ..base import EmbeddingProvider, EmbeddingError, EmbeddingResponse, EmbeddingObject
13
13
 
14
14
  logger = logging.getLogger(__name__)
15
15
 
@@ -64,7 +64,7 @@ class ColPaliProvider(EmbeddingProvider):
64
64
  except Exception as e:
65
65
  raise EmbeddingError(f"Failed to load model: {e}") from e
66
66
 
67
- def embed_text(self, texts: Union[str, List[str]]) -> np.ndarray:
67
+ def embed_text(self, texts: Union[str, List[str]]) -> EmbeddingResponse:
68
68
  """Generate embeddings for text inputs."""
69
69
  self._load_model()
70
70
 
@@ -86,11 +86,15 @@ class ColPaliProvider(EmbeddingProvider):
86
86
  # Concatenate all batch embeddings
87
87
  final_embeddings = np.concatenate(all_embeddings, axis=0)
88
88
 
89
- return EmbeddingResult(
90
- embeddings=final_embeddings,
89
+ return EmbeddingResponse(
91
90
  model_name=self.model_name,
92
91
  model_provider=self.provider_name,
93
92
  input_type="text",
93
+ objects=[
94
+ EmbeddingObject(
95
+ embedding=e,
96
+ ) for e in final_embeddings
97
+ ]
94
98
  )
95
99
 
96
100
  except Exception as e:
@@ -98,7 +102,7 @@ class ColPaliProvider(EmbeddingProvider):
98
102
 
99
103
  def embed_image(
100
104
  self, images: Union[Path, str, List[Union[Path, str]]]
101
- ) -> np.ndarray:
105
+ ) -> EmbeddingResponse:
102
106
  """Generate embeddings for images."""
103
107
  self._load_model()
104
108
 
@@ -135,18 +139,22 @@ class ColPaliProvider(EmbeddingProvider):
135
139
  # Concatenate all batch embeddings
136
140
  final_embeddings = np.concatenate(all_embeddings, axis=0)
137
141
 
138
- return EmbeddingResult(
139
- embeddings=final_embeddings,
142
+ return EmbeddingResponse(
140
143
  model_name=self.model_name,
141
144
  model_provider=self.provider_name,
142
145
  input_type="image",
143
- source_images_b64=all_b64_images,
146
+ objects=[
147
+ EmbeddingObject(
148
+ embedding=final_embeddings[i],
149
+ source_b64=all_b64_images[i]
150
+ ) for i in range(len(final_embeddings))
151
+ ]
144
152
  )
145
153
 
146
154
  except Exception as e:
147
155
  raise EmbeddingError(f"Failed to embed images: {e}") from e
148
156
 
149
- def embed_pdf(self, pdf_path: Path) -> EmbeddingResult:
157
+ def embed_pdf(self, pdf_path: Path) -> EmbeddingResponse:
150
158
  """Generate embeddings for a PDF file using ColPali API."""
151
159
  images = pdf_to_images(pdf_path)
152
160
  return self.embed_image(images)
@@ -1,6 +1,7 @@
1
1
  # tests/test_embedkit.py
2
2
  import os
3
3
  import pytest
4
+ import numpy as np
4
5
  from pathlib import Path
5
6
  from embedkit import EmbedKit
6
7
  from embedkit.models import Model
@@ -57,10 +58,13 @@ def cohere_kit_search_document():
57
58
  def test_cohere_text_embedding(request, cohere_kit_fixture):
58
59
  """Test text embedding with Cohere models."""
59
60
  kit = request.getfixturevalue(cohere_kit_fixture)
60
- embeddings = kit.embed_text("Hello world")
61
+ result = kit.embed_text("Hello world")
61
62
 
62
- assert embeddings.shape[0] == 1
63
- assert len(embeddings.shape) == 2
63
+ assert len(result.objects) == 1
64
+ assert len(result.objects[0].embedding.shape) == 1
65
+ assert result.objects[0].source_b64 is None
66
+ assert result.model_provider == "Cohere"
67
+ assert result.input_type in ["search_query", "search_document"]
64
68
 
65
69
 
66
70
  @pytest.mark.parametrize(
@@ -76,10 +80,14 @@ def test_cohere_search_document_file_embedding(
76
80
  """Test file embedding with Cohere search document model."""
77
81
  file_path = request.getfixturevalue(file_fixture)
78
82
  embed_func = getattr(cohere_kit_search_document, embed_method)
79
- embeddings = embed_func(file_path)
83
+ result = embed_func(file_path)
80
84
 
81
- assert embeddings.shape[0] == 1
82
- assert len(embeddings.shape) == 2
85
+ assert len(result.objects) == 1
86
+ assert len(result.objects[0].embedding.shape) == 1
87
+ assert result.model_provider == "Cohere"
88
+ assert result.input_type == "image"
89
+ if hasattr(result.objects[0], "source_b64"):
90
+ assert result.objects[0].source_b64 is not None
83
91
 
84
92
 
85
93
  def test_cohere_invalid_model():
@@ -107,28 +115,34 @@ def test_cohere_missing_api_key():
107
115
  def test_colpali_text_embedding():
108
116
  """Test text embedding with Colpali model."""
109
117
  kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
110
- embeddings = kit.embed_text("Hello world")
118
+ result = kit.embed_text("Hello world")
111
119
 
112
- assert embeddings.shape[0] == 1
113
- assert len(embeddings.shape) == 3
120
+ assert len(result.objects) == 1
121
+ assert len(result.objects[0].embedding.shape) == 2
122
+ assert result.objects[0].source_b64 is None
123
+ assert result.model_provider == "ColPali"
124
+ assert result.input_type == "text"
114
125
 
115
126
 
116
127
  @pytest.mark.parametrize(
117
- "embed_method,file_fixture,expected_dims",
128
+ "embed_method,file_fixture",
118
129
  [
119
- ("embed_image", "sample_image_path", 3),
120
- ("embed_pdf", "sample_pdf_path", 3),
130
+ ("embed_image", "sample_image_path"),
131
+ ("embed_pdf", "sample_pdf_path"),
121
132
  ],
122
133
  )
123
- def test_colpali_file_embedding(request, embed_method, file_fixture, expected_dims):
134
+ def test_colpali_file_embedding(request, embed_method, file_fixture):
124
135
  """Test file embedding with Colpali model."""
125
136
  kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
126
137
  file_path = request.getfixturevalue(file_fixture)
127
138
  embed_func = getattr(kit, embed_method)
128
- embeddings = embed_func(file_path)
139
+ result = embed_func(file_path)
129
140
 
130
- assert embeddings.shape[0] == 1
131
- assert len(embeddings.shape) == expected_dims
141
+ assert len(result.objects) == 1
142
+ assert len(result.objects[0].embedding.shape) == 2
143
+ assert isinstance(result.objects[0].source_b64, str)
144
+ assert result.model_provider == "ColPali"
145
+ assert result.input_type == "image"
132
146
 
133
147
 
134
148
  def test_colpali_invalid_model():
embedkit-0.1.3/PKG-INFO DELETED
@@ -1,59 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: embedkit
3
- Version: 0.1.3
4
- Summary: A simple toolkit for generating vector embeddings across multiple providers and models
5
- Author-email: JP Hwang <me@jphwang.com>
6
- License: MIT
7
- License-File: LICENSE
8
- Keywords: ai,cohere,colpali,embeddings,machine-learning,vector
9
- Classifier: Development Status :: 4 - Beta
10
- Classifier: Intended Audience :: Developers
11
- Classifier: License :: OSI Approved :: MIT License
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
- Requires-Python: >=3.10
19
- Requires-Dist: accelerate>=1.7.0
20
- Requires-Dist: cohere>=5.15.0
21
- Requires-Dist: colpali-engine<0.4.0,>=0.3.0
22
- Requires-Dist: pdf2image>=1.17.0
23
- Requires-Dist: pillow>=11.2.1
24
- Requires-Dist: torch<=2.5
25
- Requires-Dist: transformers
26
- Description-Content-Type: text/markdown
27
-
28
- # EmbedKit
29
-
30
- A Python library for generating embeddings from text, images, and PDFs using various models (e.g. from Cohere, ColPali).
31
-
32
- ## Usage
33
-
34
- See [main.py](main.py) for examples.
35
-
36
- ```python
37
- from embedkit import EmbedKit
38
- from embedkit.models import Model
39
-
40
- # Instantiate a kit
41
- # Using ColPali
42
- kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
43
-
44
- # Using Cohere
45
- kit = EmbedKit.cohere(
46
- model=Model.Cohere.EMBED_V4_0,
47
- api_key="your_api_key",
48
- text_input_type=CohereInputType.SEARCH_DOCUMENT,
49
- )
50
-
51
- # Then - the embedding API is consistent
52
- embeddings = kit.embed_text("Hello world") or kit.embed_text(["Hello world", "Hello world"])
53
- embeddings = kit.embed_image("path/to/image.png") or kit.embed_image(["path/to/image1.png", "path/to/image2.png"])
54
- embeddings = kit.embed_pdf("path/to/pdf.pdf") # Single PDF only
55
- ```
56
-
57
- ## License
58
-
59
- MIT
embedkit-0.1.3/README.md DELETED
@@ -1,32 +0,0 @@
1
- # EmbedKit
2
-
3
- A Python library for generating embeddings from text, images, and PDFs using various models (e.g. from Cohere, ColPali).
4
-
5
- ## Usage
6
-
7
- See [main.py](main.py) for examples.
8
-
9
- ```python
10
- from embedkit import EmbedKit
11
- from embedkit.models import Model
12
-
13
- # Instantiate a kit
14
- # Using ColPali
15
- kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
16
-
17
- # Using Cohere
18
- kit = EmbedKit.cohere(
19
- model=Model.Cohere.EMBED_V4_0,
20
- api_key="your_api_key",
21
- text_input_type=CohereInputType.SEARCH_DOCUMENT,
22
- )
23
-
24
- # Then - the embedding API is consistent
25
- embeddings = kit.embed_text("Hello world") or kit.embed_text(["Hello world", "Hello world"])
26
- embeddings = kit.embed_image("path/to/image.png") or kit.embed_image(["path/to/image1.png", "path/to/image2.png"])
27
- embeddings = kit.embed_pdf("path/to/pdf.pdf") # Single PDF only
28
- ```
29
-
30
- ## License
31
-
32
- MIT
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes