embedkit 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embedkit-0.1.5/PKG-INFO +143 -0
- embedkit-0.1.5/README.md +116 -0
- embedkit-0.1.5/main.py +107 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/pyproject.toml +2 -2
- {embedkit-0.1.3 → embedkit-0.1.5}/src/embedkit/__init__.py +16 -18
- embedkit-0.1.5/src/embedkit/base.py +122 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/src/embedkit/classes.py +2 -7
- embedkit-0.1.5/src/embedkit/models.py +18 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/src/embedkit/providers/cohere.py +26 -39
- embedkit-0.1.5/src/embedkit/providers/colpali.py +157 -0
- embedkit-0.1.5/src/embedkit/utils.py +141 -0
- embedkit-0.1.5/tests/fixtures/2407.01449v6_p1_p5.pdf +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/tests/test_embedkit.py +32 -18
- embedkit-0.1.5/tests/test_utils.py +52 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/uv.lock +2 -2
- embedkit-0.1.3/PKG-INFO +0 -59
- embedkit-0.1.3/README.md +0 -32
- embedkit-0.1.3/main.py +0 -96
- embedkit-0.1.3/src/embedkit/base.py +0 -48
- embedkit-0.1.3/src/embedkit/models.py +0 -12
- embedkit-0.1.3/src/embedkit/providers/colpali.py +0 -152
- embedkit-0.1.3/src/embedkit/utils.py +0 -48
- {embedkit-0.1.3 → embedkit-0.1.5}/.gitignore +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/.python-version +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/LICENSE +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/src/embedkit/config.py +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/src/embedkit/providers/__init__.py +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/tests/conftest.py +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/tests/fixtures/2407.01449v6_p1.pdf +0 -0
- {embedkit-0.1.3 → embedkit-0.1.5}/tests/fixtures/2407.01449v6_p1.png +0 -0
embedkit-0.1.5/PKG-INFO
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: embedkit
|
3
|
+
Version: 0.1.5
|
4
|
+
Summary: A simple toolkit for generating vector embeddings across multiple providers and models
|
5
|
+
Author-email: JP Hwang <me@jphwang.com>
|
6
|
+
License: MIT
|
7
|
+
License-File: LICENSE
|
8
|
+
Keywords: ai,cohere,colpali,embeddings,machine-learning,vector
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
18
|
+
Requires-Python: >=3.10
|
19
|
+
Requires-Dist: accelerate>=1.7.0
|
20
|
+
Requires-Dist: cohere>=5.15.0
|
21
|
+
Requires-Dist: colpali-engine<0.4.0,>=0.3.0
|
22
|
+
Requires-Dist: pdf2image>=1.17.0
|
23
|
+
Requires-Dist: pillow>=11.2.1
|
24
|
+
Requires-Dist: torch<=2.5
|
25
|
+
Requires-Dist: transformers>=4.46.2
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
|
28
|
+
# EmbedKit
|
29
|
+
|
30
|
+
A unified interface for text and image embeddings, supporting multiple providers.
|
31
|
+
|
32
|
+
## Installation
|
33
|
+
|
34
|
+
```bash
|
35
|
+
pip install embedkit
|
36
|
+
```
|
37
|
+
|
38
|
+
## Usage
|
39
|
+
|
40
|
+
### Text Embeddings
|
41
|
+
|
42
|
+
```python
|
43
|
+
from embedkit import EmbedKit
|
44
|
+
from embedkit.classes import Model, CohereInputType
|
45
|
+
|
46
|
+
# Initialize with ColPali
|
47
|
+
kit = EmbedKit.colpali(
|
48
|
+
model=Model.ColPali.COLPALI_V1_3, # or COLSMOL_256M, COLSMOL_500M
|
49
|
+
text_batch_size=16, # Optional: process text in batches of 16
|
50
|
+
image_batch_size=8, # Optional: process images in batches of 8
|
51
|
+
)
|
52
|
+
|
53
|
+
# Get embeddings
|
54
|
+
result = kit.embed_text("Hello world")
|
55
|
+
print(result.model_provider)
|
56
|
+
print(result.input_type)
|
57
|
+
print(result.objects[0].embedding.shape) # Returns 2D array for ColPali
|
58
|
+
print(result.objects[0].source_b64)
|
59
|
+
|
60
|
+
# Initialize with Cohere
|
61
|
+
kit = EmbedKit.cohere(
|
62
|
+
model=Model.Cohere.EMBED_V4_0,
|
63
|
+
api_key="your-api-key",
|
64
|
+
text_input_type=CohereInputType.SEARCH_QUERY, # or SEARCH_DOCUMENT
|
65
|
+
text_batch_size=64, # Optional: process text in batches of 64
|
66
|
+
image_batch_size=8, # Optional: process images in batches of 8
|
67
|
+
)
|
68
|
+
|
69
|
+
# Get embeddings
|
70
|
+
result = kit.embed_text("Hello world")
|
71
|
+
print(result.model_provider)
|
72
|
+
print(result.input_type)
|
73
|
+
print(result.objects[0].embedding.shape) # Returns 1D array for Cohere
|
74
|
+
print(result.objects[0].source_b64)
|
75
|
+
```
|
76
|
+
|
77
|
+
### Image Embeddings
|
78
|
+
|
79
|
+
```python
|
80
|
+
from pathlib import Path
|
81
|
+
|
82
|
+
# Get embeddings for an image
|
83
|
+
image_path = Path("path/to/image.png")
|
84
|
+
result = kit.embed_image(image_path)
|
85
|
+
|
86
|
+
print(result.model_provider)
|
87
|
+
print(result.input_type)
|
88
|
+
print(result.objects[0].embedding.shape) # 2D for ColPali, 1D for Cohere
|
89
|
+
print(result.objects[0].source_b64) # Base64 encoded image
|
90
|
+
```
|
91
|
+
|
92
|
+
### PDF Embeddings
|
93
|
+
|
94
|
+
```python
|
95
|
+
from pathlib import Path
|
96
|
+
|
97
|
+
# Get embeddings for a PDF
|
98
|
+
pdf_path = Path("path/to/document.pdf")
|
99
|
+
result = kit.embed_pdf(pdf_path)
|
100
|
+
|
101
|
+
print(result.model_provider)
|
102
|
+
print(result.input_type)
|
103
|
+
print(result.objects[0].embedding.shape) # 2D for ColPali, 1D for Cohere
|
104
|
+
print(result.objects[0].source_b64) # Base64 encoded PDF page
|
105
|
+
```
|
106
|
+
|
107
|
+
## Response Format
|
108
|
+
|
109
|
+
The embedding methods return an `EmbeddingResponse` object with the following structure:
|
110
|
+
|
111
|
+
```python
|
112
|
+
class EmbeddingResponse:
|
113
|
+
model_name: str
|
114
|
+
model_provider: str
|
115
|
+
input_type: str
|
116
|
+
objects: List[EmbeddingObject]
|
117
|
+
|
118
|
+
class EmbeddingObject:
|
119
|
+
embedding: np.ndarray # 1D array for Cohere, 2D array for ColPali
|
120
|
+
source_b64: Optional[str] # Base64 encoded source for images and PDFs
|
121
|
+
```
|
122
|
+
|
123
|
+
## Supported Models
|
124
|
+
|
125
|
+
### ColPali
|
126
|
+
- `Model.ColPali.COLPALI_V1_3`
|
127
|
+
- `Model.ColPali.COLSMOL_256M`
|
128
|
+
- `Model.ColPali.COLSMOL_500M`
|
129
|
+
|
130
|
+
### Cohere
|
131
|
+
- `Model.Cohere.EMBED_V4_0`
|
132
|
+
- `Model.Cohere.EMBED_ENGLISH_V3_0`
|
133
|
+
- `Model.Cohere.EMBED_ENGLISH_LIGHT_V3_0`
|
134
|
+
- `Model.Cohere.EMBED_MULTILINGUAL_V3_0`
|
135
|
+
- `Model.Cohere.EMBED_MULTILINGUAL_LIGHT_V3_0`
|
136
|
+
|
137
|
+
## Requirements
|
138
|
+
|
139
|
+
- Python 3.10+
|
140
|
+
|
141
|
+
## License
|
142
|
+
|
143
|
+
MIT
|
embedkit-0.1.5/README.md
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# EmbedKit
|
2
|
+
|
3
|
+
A unified interface for text and image embeddings, supporting multiple providers.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
```bash
|
8
|
+
pip install embedkit
|
9
|
+
```
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
### Text Embeddings
|
14
|
+
|
15
|
+
```python
|
16
|
+
from embedkit import EmbedKit
|
17
|
+
from embedkit.classes import Model, CohereInputType
|
18
|
+
|
19
|
+
# Initialize with ColPali
|
20
|
+
kit = EmbedKit.colpali(
|
21
|
+
model=Model.ColPali.COLPALI_V1_3, # or COLSMOL_256M, COLSMOL_500M
|
22
|
+
text_batch_size=16, # Optional: process text in batches of 16
|
23
|
+
image_batch_size=8, # Optional: process images in batches of 8
|
24
|
+
)
|
25
|
+
|
26
|
+
# Get embeddings
|
27
|
+
result = kit.embed_text("Hello world")
|
28
|
+
print(result.model_provider)
|
29
|
+
print(result.input_type)
|
30
|
+
print(result.objects[0].embedding.shape) # Returns 2D array for ColPali
|
31
|
+
print(result.objects[0].source_b64)
|
32
|
+
|
33
|
+
# Initialize with Cohere
|
34
|
+
kit = EmbedKit.cohere(
|
35
|
+
model=Model.Cohere.EMBED_V4_0,
|
36
|
+
api_key="your-api-key",
|
37
|
+
text_input_type=CohereInputType.SEARCH_QUERY, # or SEARCH_DOCUMENT
|
38
|
+
text_batch_size=64, # Optional: process text in batches of 64
|
39
|
+
image_batch_size=8, # Optional: process images in batches of 8
|
40
|
+
)
|
41
|
+
|
42
|
+
# Get embeddings
|
43
|
+
result = kit.embed_text("Hello world")
|
44
|
+
print(result.model_provider)
|
45
|
+
print(result.input_type)
|
46
|
+
print(result.objects[0].embedding.shape) # Returns 1D array for Cohere
|
47
|
+
print(result.objects[0].source_b64)
|
48
|
+
```
|
49
|
+
|
50
|
+
### Image Embeddings
|
51
|
+
|
52
|
+
```python
|
53
|
+
from pathlib import Path
|
54
|
+
|
55
|
+
# Get embeddings for an image
|
56
|
+
image_path = Path("path/to/image.png")
|
57
|
+
result = kit.embed_image(image_path)
|
58
|
+
|
59
|
+
print(result.model_provider)
|
60
|
+
print(result.input_type)
|
61
|
+
print(result.objects[0].embedding.shape) # 2D for ColPali, 1D for Cohere
|
62
|
+
print(result.objects[0].source_b64) # Base64 encoded image
|
63
|
+
```
|
64
|
+
|
65
|
+
### PDF Embeddings
|
66
|
+
|
67
|
+
```python
|
68
|
+
from pathlib import Path
|
69
|
+
|
70
|
+
# Get embeddings for a PDF
|
71
|
+
pdf_path = Path("path/to/document.pdf")
|
72
|
+
result = kit.embed_pdf(pdf_path)
|
73
|
+
|
74
|
+
print(result.model_provider)
|
75
|
+
print(result.input_type)
|
76
|
+
print(result.objects[0].embedding.shape) # 2D for ColPali, 1D for Cohere
|
77
|
+
print(result.objects[0].source_b64) # Base64 encoded PDF page
|
78
|
+
```
|
79
|
+
|
80
|
+
## Response Format
|
81
|
+
|
82
|
+
The embedding methods return an `EmbeddingResponse` object with the following structure:
|
83
|
+
|
84
|
+
```python
|
85
|
+
class EmbeddingResponse:
|
86
|
+
model_name: str
|
87
|
+
model_provider: str
|
88
|
+
input_type: str
|
89
|
+
objects: List[EmbeddingObject]
|
90
|
+
|
91
|
+
class EmbeddingObject:
|
92
|
+
embedding: np.ndarray # 1D array for Cohere, 2D array for ColPali
|
93
|
+
source_b64: Optional[str] # Base64 encoded source for images and PDFs
|
94
|
+
```
|
95
|
+
|
96
|
+
## Supported Models
|
97
|
+
|
98
|
+
### ColPali
|
99
|
+
- `Model.ColPali.COLPALI_V1_3`
|
100
|
+
- `Model.ColPali.COLSMOL_256M`
|
101
|
+
- `Model.ColPali.COLSMOL_500M`
|
102
|
+
|
103
|
+
### Cohere
|
104
|
+
- `Model.Cohere.EMBED_V4_0`
|
105
|
+
- `Model.Cohere.EMBED_ENGLISH_V3_0`
|
106
|
+
- `Model.Cohere.EMBED_ENGLISH_LIGHT_V3_0`
|
107
|
+
- `Model.Cohere.EMBED_MULTILINGUAL_V3_0`
|
108
|
+
- `Model.Cohere.EMBED_MULTILINGUAL_LIGHT_V3_0`
|
109
|
+
|
110
|
+
## Requirements
|
111
|
+
|
112
|
+
- Python 3.10+
|
113
|
+
|
114
|
+
## License
|
115
|
+
|
116
|
+
MIT
|
embedkit-0.1.5/main.py
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# ./main.py
|
2
|
+
from embedkit import EmbedKit
|
3
|
+
from embedkit.classes import Model, CohereInputType
|
4
|
+
from pathlib import Path
|
5
|
+
import os
|
6
|
+
|
7
|
+
|
8
|
+
def get_online_image(url: str) -> Path:
    """Download an image from a URL and return its local path.

    The response body is written to a temporary file created with
    ``delete=False`` and a ``.png`` suffix, so the file outlives this call;
    removing it is left to the caller.

    Args:
        url: Address of the image to download.

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    import requests
    from tempfile import NamedTemporaryFile

    # Add User-Agent header to comply with Wikipedia's policy
    headers = {"User-Agent": "EmbedKit-Example/1.0"}

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # The context manager closes the handle even if the write fails.
    with NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
        temp_file.write(response.content)

    return Path(temp_file.name)
|
24
|
+
|
25
|
+
|
26
|
+
def get_sample_image() -> Path:
    """Download the sample image used by this script and return its local path."""
    return get_online_image(
        "https://upload.wikimedia.org/wikipedia/commons/b/b8/English_Wikipedia_HomePage_2001-12-20.png"
    )
|
30
|
+
|
31
|
+
|
32
|
+
# Smoke-test script: exercises every provider against text, image and PDF
# inputs and checks the number of returned objects, the rank of each
# embedding, and whether a base64 source is attached.
sample_image = get_sample_image()

sample_pdf = Path("tests/fixtures/2407.01449v6_p1.pdf")
longer_pdf = Path("tests/fixtures/2407.01449v6_p1_p5.pdf")

kit = EmbedKit.cohere(
    model=Model.Cohere.EMBED_V4_0,
    api_key=os.getenv("COHERE_API_KEY"),
    text_batch_size=64,
    image_batch_size=8,
    text_input_type=CohereInputType.SEARCH_QUERY,
)

print("Trying out Cohere")
results = kit.embed_text("Hello world")
assert len(results.objects) == 1
assert len(results.objects[0].embedding.shape) == 1
# Identity check: None is a singleton, so compare with `is`, not `==`.
assert results.objects[0].source_b64 is None

kit = EmbedKit.cohere(
    model=Model.Cohere.EMBED_V4_0,
    api_key=os.getenv("COHERE_API_KEY"),
    text_batch_size=64,
    image_batch_size=8,
    text_input_type=CohereInputType.SEARCH_DOCUMENT,
)

results = kit.embed_text("Hello world")
assert len(results.objects) == 1
assert len(results.objects[0].embedding.shape) == 1
assert results.objects[0].source_b64 is None

results = kit.embed_image(sample_image)
assert len(results.objects) == 1
assert len(results.objects[0].embedding.shape) == 1
assert isinstance(results.objects[0].source_b64, str)

results = kit.embed_pdf(sample_pdf)
assert len(results.objects) == 1
assert len(results.objects[0].embedding.shape) == 1
assert isinstance(results.objects[0].source_b64, str)

# The longer fixture is expected to yield five objects.
results = kit.embed_pdf(longer_pdf)
assert len(results.objects) == 5
assert len(results.objects[0].embedding.shape) == 1
assert isinstance(results.objects[0].source_b64, str)

for colpali_model in [
    Model.ColPali.COLSMOL_256M,
    Model.ColPali.COLSMOL_500M,
    Model.ColPali.COLPALI_V1_3,
]:
    print(f"Trying out {colpali_model}")
    kit = EmbedKit.colpali(
        model=colpali_model, text_batch_size=16, image_batch_size=8
    )

    # ColPali embeddings are expected to be 2D, unlike the 1D Cohere ones above.
    results = kit.embed_text("Hello world")
    assert len(results.objects) == 1
    assert len(results.objects[0].embedding.shape) == 2
    assert results.objects[0].source_b64 is None

    results = kit.embed_image(sample_image)
    assert len(results.objects) == 1
    assert len(results.objects[0].embedding.shape) == 2
    assert isinstance(results.objects[0].source_b64, str)

    results = kit.embed_pdf(sample_pdf)
    assert len(results.objects) == 1
    assert len(results.objects[0].embedding.shape) == 2
    assert isinstance(results.objects[0].source_b64, str)

    results = kit.embed_pdf(longer_pdf)
    assert len(results.objects) == 5
    assert len(results.objects[0].embedding.shape) == 2
    assert isinstance(results.objects[0].source_b64, str)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "embedkit"
|
3
|
-
version = "0.1.
|
3
|
+
version = "0.1.5"
|
4
4
|
description = "A simple toolkit for generating vector embeddings across multiple providers and models"
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.10"
|
@@ -11,7 +11,7 @@ dependencies = [
|
|
11
11
|
"pdf2image>=1.17.0",
|
12
12
|
"pillow>=11.2.1",
|
13
13
|
"torch<=2.5",
|
14
|
-
"transformers",
|
14
|
+
"transformers>=4.46.2",
|
15
15
|
]
|
16
16
|
authors = [
|
17
17
|
{name = "JP Hwang", email = "me@jphwang.com"},
|
@@ -5,10 +5,9 @@ EmbedKit: A unified toolkit for generating vector embeddings.
|
|
5
5
|
|
6
6
|
from typing import Union, List, Optional
|
7
7
|
from pathlib import Path
|
8
|
-
import numpy as np
|
9
8
|
|
10
9
|
from .models import Model
|
11
|
-
from .base import EmbeddingError,
|
10
|
+
from .base import EmbeddingError, EmbeddingResponse
|
12
11
|
from .providers import ColPaliProvider, CohereProvider
|
13
12
|
from .providers.cohere import CohereInputType
|
14
13
|
|
@@ -28,7 +27,7 @@ class EmbedKit:
|
|
28
27
|
@classmethod
|
29
28
|
def colpali(
|
30
29
|
cls,
|
31
|
-
model: Model = Model.ColPali.
|
30
|
+
model: Model = Model.ColPali.COLPALI_V1_3,
|
32
31
|
device: Optional[str] = None,
|
33
32
|
text_batch_size: int = 32,
|
34
33
|
image_batch_size: int = 8,
|
@@ -42,13 +41,13 @@ class EmbedKit:
|
|
42
41
|
text_batch_size: Batch size for text embedding generation
|
43
42
|
image_batch_size: Batch size for image embedding generation
|
44
43
|
"""
|
45
|
-
if model
|
46
|
-
|
47
|
-
|
48
|
-
|
44
|
+
if not isinstance(model, Model.ColPali):
|
45
|
+
raise ValueError(
|
46
|
+
f"Unsupported model: {model}. Must be a Model.ColPali enum value."
|
47
|
+
)
|
49
48
|
|
50
49
|
provider = ColPaliProvider(
|
51
|
-
|
50
|
+
model=model,
|
52
51
|
device=device,
|
53
52
|
text_batch_size=text_batch_size,
|
54
53
|
image_batch_size=image_batch_size,
|
@@ -77,16 +76,15 @@ class EmbedKit:
|
|
77
76
|
if not api_key:
|
78
77
|
raise ValueError("API key is required")
|
79
78
|
|
80
|
-
if model
|
81
|
-
model_name = "embed-v4.0"
|
82
|
-
else:
|
79
|
+
if not isinstance(model, Model.Cohere):
|
83
80
|
raise ValueError(f"Unsupported model: {model}")
|
84
81
|
|
85
82
|
provider = CohereProvider(
|
86
|
-
api_key=api_key,
|
87
|
-
|
88
|
-
|
89
|
-
|
83
|
+
api_key=api_key,
|
84
|
+
model=model,
|
85
|
+
text_batch_size=text_batch_size,
|
86
|
+
image_batch_size=image_batch_size,
|
87
|
+
text_input_type=text_input_type,
|
90
88
|
)
|
91
89
|
return cls(provider)
|
92
90
|
|
@@ -103,7 +101,7 @@ class EmbedKit:
|
|
103
101
|
# provider = HuggingFaceProvider(model_name=model_name, device=device)
|
104
102
|
# return cls(provider)
|
105
103
|
|
106
|
-
def embed_text(self, texts: Union[str, List[str]], **kwargs) ->
|
104
|
+
def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
|
107
105
|
"""Generate document text embeddings using the configured provider.
|
108
106
|
|
109
107
|
Args:
|
@@ -117,11 +115,11 @@ class EmbedKit:
|
|
117
115
|
|
118
116
|
def embed_image(
|
119
117
|
self, images: Union[Path, str, List[Union[Path, str]]]
|
120
|
-
) ->
|
118
|
+
) -> EmbeddingResponse:
|
121
119
|
"""Generate image embeddings using the configured provider."""
|
122
120
|
return self._provider.embed_image(images)
|
123
121
|
|
124
|
-
def embed_pdf(self, pdf: Union[Path, str]) ->
|
122
|
+
def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResponse:
|
125
123
|
"""Generate image embeddings from PDFs using the configured provider. Takes a single PDF file."""
|
126
124
|
return self._provider.embed_pdf(pdf)
|
127
125
|
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# ./src/embedkit/base.py
|
2
|
+
"""Base classes for EmbedKit."""
|
3
|
+
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from typing import Union, List, Optional
|
6
|
+
from pathlib import Path
|
7
|
+
import numpy as np
|
8
|
+
from dataclasses import dataclass
|
9
|
+
|
10
|
+
from .models import Model
|
11
|
+
from .utils import with_pdf_cleanup
|
12
|
+
|
13
|
+
|
14
|
+
@dataclass
class EmbeddingObject:
    """A single embedding plus optional information about its source."""

    # The embedding produced for one input.
    embedding: np.ndarray
    # Base64-encoded source; None when no source is attached.
    source_b64: Optional[str] = None
    source_content_type: Optional[str] = None  # e.g., "image/png", "image/jpeg"
|
19
|
+
|
20
|
+
|
21
|
+
@dataclass
class EmbeddingResponse:
    """Result of one embedding call: model metadata plus the embedded objects."""

    model_name: str
    model_provider: str
    input_type: str
    objects: List[EmbeddingObject]

    @property
    def shape(self) -> tuple:
        """Return the shape of the first object's embedding.

        Raises:
            IndexError: If the response holds no objects.
        """
        if not self.objects:
            # Same exception type as indexing an empty list, but with a
            # message that names the actual problem.
            raise IndexError(
                "EmbeddingResponse contains no objects; shape is undefined"
            )
        return self.objects[0].embedding.shape
|
31
|
+
|
32
|
+
|
33
|
+
class EmbeddingProvider(ABC):
    """Common interface and shared helpers for embedding providers.

    Subclasses implement ``embed_text`` and ``embed_image``. PDF embedding is
    provided here and delegates to ``embed_image``.
    """

    def __init__(
        self,
        model_name: str,
        text_batch_size: int,
        image_batch_size: int,
        provider_name: str,
    ):
        # Model and provider names are echoed back in every response.
        self.model_name = model_name
        self.provider_name = provider_name
        # Batch sizes are stored for the concrete providers to use.
        self.text_batch_size = text_batch_size
        self.image_batch_size = image_batch_size

    def _normalize_text_input(self, texts: Union[str, List[str]]) -> List[str]:
        """Wrap a lone string in a list; return any other input unchanged."""
        return [texts] if isinstance(texts, str) else texts

    def _normalize_image_input(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> List[Path]:
        """Return the input as a list of ``Path`` objects."""
        # A single str/Path is treated as a one-element collection.
        candidates = [images] if isinstance(images, (str, Path)) else images
        return [Path(candidate) for candidate in candidates]

    def _create_text_response(
        self, embeddings: List[np.ndarray], input_type: str = "text"
    ) -> EmbeddingResponse:
        """Build an ``EmbeddingResponse`` whose objects carry only embeddings."""
        wrapped = [EmbeddingObject(embedding=vector) for vector in embeddings]
        return EmbeddingResponse(
            model_name=self.model_name,
            model_provider=self.provider_name,
            input_type=input_type,
            objects=wrapped,
        )

    def _create_image_response(
        self,
        embeddings: List[np.ndarray],
        b64_data: List[str],
        content_types: List[str],
        input_type: str = "image",
    ) -> EmbeddingResponse:
        """Build an ``EmbeddingResponse`` pairing each embedding with its source.

        The three lists are combined position by position with ``zip``, so
        the result has as many objects as the shortest list.
        """
        wrapped = [
            EmbeddingObject(
                embedding=vector,
                source_b64=encoded,
                source_content_type=mime,
            )
            for vector, encoded, mime in zip(embeddings, b64_data, content_types)
        ]
        return EmbeddingResponse(
            model_name=self.model_name,
            model_provider=self.provider_name,
            input_type=input_type,
            objects=wrapped,
        )

    @abstractmethod
    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResponse:
        """Generate document text embeddings using the configured provider."""

    @abstractmethod
    def embed_image(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> EmbeddingResponse:
        """Generate image embeddings using the configured provider."""

    def embed_pdf(self, pdf_path: Path) -> EmbeddingResponse:
        """Generate embeddings for a PDF file."""
        return self._embed_pdf_impl(pdf_path)

    @with_pdf_cleanup
    def _embed_pdf_impl(self, pdf_path: List[Path]) -> EmbeddingResponse:
        """Embed whatever the ``with_pdf_cleanup`` decorator passes in via ``embed_image``.

        NOTE(review): ``with_pdf_cleanup`` is defined in ``utils`` and not
        visible here. The ``List[Path]`` annotation suggests it replaces the
        PDF path with a list of image paths and cleans them up afterwards;
        confirm against the decorator.
        """
        return self.embed_image(pdf_path)
|
117
|
+
|
118
|
+
|
119
|
+
class EmbeddingError(Exception):
    """Base class for errors raised by embedding operations."""
|
@@ -9,13 +9,8 @@ This module provides the main types and enums that users should interact with:
|
|
9
9
|
- CohereInputType: Enum for Cohere's input types
|
10
10
|
"""
|
11
11
|
|
12
|
-
from . import
|
12
|
+
from . import EmbeddingResponse, EmbeddingError
|
13
13
|
from .models import Model
|
14
14
|
from .providers.cohere import CohereInputType
|
15
15
|
|
16
|
-
__all__ = [
|
17
|
-
"EmbeddingResult",
|
18
|
-
"EmbeddingError",
|
19
|
-
"Model",
|
20
|
-
"CohereInputType"
|
21
|
-
]
|
16
|
+
__all__ = ["EmbeddingResponse", "EmbeddingError", "Model", "CohereInputType"]
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# ./src/embedkit/models.py
|
2
|
+
"""Model definitions and enum for EmbedKit."""
|
3
|
+
|
4
|
+
from enum import Enum
|
5
|
+
|
6
|
+
|
7
|
+
class Model:
    """Namespace grouping the supported model identifiers by provider."""

    class ColPali(Enum):
        """ColPali provider models; each value is the model identifier string."""

        COLPALI_V1_3 = "vidore/colpali-v1.3"
        COLSMOL_500M = "vidore/colSmol-500M"
        COLSMOL_256M = "vidore/colSmol-256M"

    class Cohere(Enum):
        """Cohere provider models; each value is the model identifier string."""

        EMBED_V4_0 = "embed-v4.0"
        EMBED_ENGLISH_V3_0 = "embed-english-v3.0"
        EMBED_ENGLISH_LIGHT_V3_0 = "embed-english-light-v3.0"
        EMBED_MULTILINGUAL_V3_0 = "embed-multilingual-v3.0"
        EMBED_MULTILINGUAL_LIGHT_V3_0 = "embed-multilingual-light-v3.0"
|