embedkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embedkit-0.1.0/.gitignore +223 -0
- embedkit-0.1.0/.python-version +1 -0
- embedkit-0.1.0/LICENSE +9 -0
- embedkit-0.1.0/PKG-INFO +59 -0
- embedkit-0.1.0/README.md +32 -0
- embedkit-0.1.0/main.py +78 -0
- embedkit-0.1.0/pyproject.toml +55 -0
- embedkit-0.1.0/src/embedkit/__init__.py +117 -0
- embedkit-0.1.0/src/embedkit/base.py +49 -0
- embedkit-0.1.0/src/embedkit/config.py +8 -0
- embedkit-0.1.0/src/embedkit/models.py +12 -0
- embedkit-0.1.0/src/embedkit/providers/__init__.py +7 -0
- embedkit-0.1.0/src/embedkit/providers/cohere.py +141 -0
- embedkit-0.1.0/src/embedkit/providers/colpali.py +121 -0
- embedkit-0.1.0/src/embedkit/utils.py +21 -0
- embedkit-0.1.0/tests/conftest.py +22 -0
- embedkit-0.1.0/tests/fixtures/2407.01449v6_p1.pdf +0 -0
- embedkit-0.1.0/tests/fixtures/2407.01449v6_p1.png +0 -0
- embedkit-0.1.0/tests/test_embedkit.py +137 -0
- embedkit-0.1.0/uv.lock +1811 -0
@@ -0,0 +1,223 @@
|
|
1
|
+
# Development files
|
2
|
+
tmp/
|
3
|
+
|
4
|
+
|
5
|
+
# Byte-compiled / optimized / DLL files
|
6
|
+
__pycache__/
|
7
|
+
*.py[cod]
|
8
|
+
*$py.class
|
9
|
+
|
10
|
+
# C extensions
|
11
|
+
*.so
|
12
|
+
|
13
|
+
# Distribution / packaging
|
14
|
+
.Python
|
15
|
+
build/
|
16
|
+
develop-eggs/
|
17
|
+
dist/
|
18
|
+
downloads/
|
19
|
+
eggs/
|
20
|
+
.eggs/
|
21
|
+
lib/
|
22
|
+
lib64/
|
23
|
+
parts/
|
24
|
+
sdist/
|
25
|
+
var/
|
26
|
+
wheels/
|
27
|
+
share/python-wheels/
|
28
|
+
*.egg-info/
|
29
|
+
.installed.cfg
|
30
|
+
*.egg
|
31
|
+
MANIFEST
|
32
|
+
|
33
|
+
# PyInstaller
|
34
|
+
# Usually these files are written by a python script from a template
|
35
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
36
|
+
*.manifest
|
37
|
+
*.spec
|
38
|
+
|
39
|
+
# Installer logs
|
40
|
+
pip-log.txt
|
41
|
+
pip-delete-this-directory.txt
|
42
|
+
|
43
|
+
# Unit test / coverage reports
|
44
|
+
htmlcov/
|
45
|
+
.tox/
|
46
|
+
.nox/
|
47
|
+
.coverage
|
48
|
+
.coverage.*
|
49
|
+
.cache
|
50
|
+
nosetests.xml
|
51
|
+
coverage.xml
|
52
|
+
*.cover
|
53
|
+
*.py,cover
|
54
|
+
.hypothesis/
|
55
|
+
.pytest_cache/
|
56
|
+
cover/
|
57
|
+
|
58
|
+
# Translations
|
59
|
+
*.mo
|
60
|
+
*.pot
|
61
|
+
|
62
|
+
# Django stuff:
|
63
|
+
*.log
|
64
|
+
local_settings.py
|
65
|
+
db.sqlite3
|
66
|
+
db.sqlite3-journal
|
67
|
+
|
68
|
+
# Flask stuff:
|
69
|
+
instance/
|
70
|
+
.webassets-cache
|
71
|
+
|
72
|
+
# Scrapy stuff:
|
73
|
+
.scrapy
|
74
|
+
|
75
|
+
# Sphinx documentation
|
76
|
+
docs/_build/
|
77
|
+
|
78
|
+
# PyBuilder
|
79
|
+
.pybuilder/
|
80
|
+
target/
|
81
|
+
|
82
|
+
# Jupyter Notebook
|
83
|
+
.ipynb_checkpoints
|
84
|
+
|
85
|
+
# IPython
|
86
|
+
profile_default/
|
87
|
+
ipython_config.py
|
88
|
+
|
89
|
+
# pyenv
|
90
|
+
# For a library or package, you might want to ignore these files since the code is
|
91
|
+
# intended to run in multiple environments; otherwise, check them in:
|
92
|
+
# .python-version
|
93
|
+
|
94
|
+
# pipenv
|
95
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
96
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
97
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
98
|
+
# install all needed dependencies.
|
99
|
+
#Pipfile.lock
|
100
|
+
|
101
|
+
# UV
|
102
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
103
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
104
|
+
# commonly ignored for libraries.
|
105
|
+
#uv.lock
|
106
|
+
|
107
|
+
# poetry
|
108
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
109
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
110
|
+
# commonly ignored for libraries.
|
111
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
112
|
+
#poetry.lock
|
113
|
+
|
114
|
+
# pdm
|
115
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
116
|
+
#pdm.lock
|
117
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
118
|
+
# in version control.
|
119
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
120
|
+
.pdm.toml
|
121
|
+
.pdm-python
|
122
|
+
.pdm-build/
|
123
|
+
|
124
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
125
|
+
__pypackages__/
|
126
|
+
|
127
|
+
# Celery stuff
|
128
|
+
celerybeat-schedule
|
129
|
+
celerybeat.pid
|
130
|
+
|
131
|
+
# SageMath parsed files
|
132
|
+
*.sage.py
|
133
|
+
|
134
|
+
# Environments
|
135
|
+
.env
|
136
|
+
.venv
|
137
|
+
env/
|
138
|
+
venv/
|
139
|
+
ENV/
|
140
|
+
env.bak/
|
141
|
+
venv.bak/
|
142
|
+
|
143
|
+
# Spyder project settings
|
144
|
+
.spyderproject
|
145
|
+
.spyproject
|
146
|
+
|
147
|
+
# Rope project settings
|
148
|
+
.ropeproject
|
149
|
+
|
150
|
+
# mkdocs documentation
|
151
|
+
/site
|
152
|
+
|
153
|
+
# mypy
|
154
|
+
.mypy_cache/
|
155
|
+
.dmypy.json
|
156
|
+
dmypy.json
|
157
|
+
|
158
|
+
# Pyre type checker
|
159
|
+
.pyre/
|
160
|
+
|
161
|
+
# pytype static type analyzer
|
162
|
+
.pytype/
|
163
|
+
|
164
|
+
# Cython debug symbols
|
165
|
+
cython_debug/
|
166
|
+
|
167
|
+
# PyCharm
|
168
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
169
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
170
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
171
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
172
|
+
#.idea/
|
173
|
+
|
174
|
+
# Abstra
|
175
|
+
# Abstra is an AI-powered process automation framework.
|
176
|
+
# Ignore directories containing user credentials, local state, and settings.
|
177
|
+
# Learn more at https://abstra.io/docs
|
178
|
+
.abstra/
|
179
|
+
|
180
|
+
# Visual Studio Code
|
181
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
182
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
183
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
184
|
+
# you could uncomment the following to ignore the entire vscode folder
|
185
|
+
# .vscode/
|
186
|
+
|
187
|
+
# Ruff stuff:
|
188
|
+
.ruff_cache/
|
189
|
+
|
190
|
+
# PyPI configuration file
|
191
|
+
.pypirc
|
192
|
+
|
193
|
+
# Cursor
|
194
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
195
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
196
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
197
|
+
.cursorignore
|
198
|
+
.cursorindexingignore
|
199
|
+
# General
|
200
|
+
.DS_Store
|
201
|
+
.AppleDouble
|
202
|
+
.LSOverride
|
203
|
+
Icon[
|
204
|
+
]
|
205
|
+
|
206
|
+
# Thumbnails
|
207
|
+
._*
|
208
|
+
|
209
|
+
# Files that might appear in the root of a volume
|
210
|
+
.DocumentRevisions-V100
|
211
|
+
.fseventsd
|
212
|
+
.Spotlight-V100
|
213
|
+
.TemporaryItems
|
214
|
+
.Trashes
|
215
|
+
.VolumeIcon.icns
|
216
|
+
.com.apple.timemachine.donotpresent
|
217
|
+
|
218
|
+
# Directories potentially created on remote AFP share
|
219
|
+
.AppleDB
|
220
|
+
.AppleDesktop
|
221
|
+
Network Trash Folder
|
222
|
+
Temporary Items
|
223
|
+
.apdisk
|
@@ -0,0 +1 @@
|
|
1
|
+
3.10
|
embedkit-0.1.0/LICENSE
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright © 2025 JP Hwang
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6
|
+
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
8
|
+
|
9
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
embedkit-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: embedkit
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: A simple toolkit for generating vector embeddings across multiple providers and models
|
5
|
+
Author-email: JP Hwang <me@jphwang.com>
|
6
|
+
License: MIT
|
7
|
+
License-File: LICENSE
|
8
|
+
Keywords: ai,cohere,colpali,embeddings,machine-learning,vector
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
18
|
+
Requires-Python: >=3.10
|
19
|
+
Requires-Dist: accelerate>=1.7.0
|
20
|
+
Requires-Dist: cohere>=5.15.0
|
21
|
+
Requires-Dist: colpali-engine<0.4.0,>=0.3.0
|
22
|
+
Requires-Dist: pdf2image>=1.17.0
|
23
|
+
Requires-Dist: pillow>=11.2.1
|
24
|
+
Requires-Dist: torch<=2.5
|
25
|
+
Requires-Dist: transformers
|
26
|
+
Description-Content-Type: text/markdown
|
27
|
+
|
28
|
+
# EmbedKit
|
29
|
+
|
30
|
+
A Python library for generating embeddings from text, images, and PDFs using various models (e.g. from Cohere, ColPali).
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
See [main.py](main.py) for examples.
|
35
|
+
|
36
|
+
```python
|
37
|
+
from embedkit import EmbedKit
|
38
|
+
from embedkit.models import Model
from embedkit.providers.cohere import CohereInputType
|
39
|
+
|
40
|
+
# Instantiate a kit
|
41
|
+
# Using ColPali
|
42
|
+
kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
|
43
|
+
|
44
|
+
# Using Cohere
|
45
|
+
kit = EmbedKit.cohere(
|
46
|
+
model=Model.Cohere.EMBED_V4_0,
|
47
|
+
api_key="your_api_key",
|
48
|
+
text_input_type=CohereInputType.SEARCH_DOCUMENT,
|
49
|
+
)
|
50
|
+
|
51
|
+
# Then - the embedding API is consistent
|
52
|
+
embeddings = kit.embed_text("Hello world")  # or a list: kit.embed_text(["Hello world", "Hello world"])
|
53
|
+
embeddings = kit.embed_image("path/to/image.png")  # or a list: kit.embed_image(["path/to/image1.png", "path/to/image2.png"])
|
54
|
+
embeddings = kit.embed_pdf("path/to/pdf.pdf") # Single PDF only
|
55
|
+
```
|
56
|
+
|
57
|
+
## License
|
58
|
+
|
59
|
+
MIT
|
embedkit-0.1.0/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# EmbedKit
|
2
|
+
|
3
|
+
A Python library for generating embeddings from text, images, and PDFs using various models (e.g. from Cohere, ColPali).
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
See [main.py](main.py) for examples.
|
8
|
+
|
9
|
+
```python
|
10
|
+
from embedkit import EmbedKit
|
11
|
+
from embedkit.models import Model
from embedkit.providers.cohere import CohereInputType
|
12
|
+
|
13
|
+
# Instantiate a kit
|
14
|
+
# Using ColPali
|
15
|
+
kit = EmbedKit.colpali(model=Model.ColPali.V1_3)
|
16
|
+
|
17
|
+
# Using Cohere
|
18
|
+
kit = EmbedKit.cohere(
|
19
|
+
model=Model.Cohere.EMBED_V4_0,
|
20
|
+
api_key="your_api_key",
|
21
|
+
text_input_type=CohereInputType.SEARCH_DOCUMENT,
|
22
|
+
)
|
23
|
+
|
24
|
+
# Then - the embedding API is consistent
|
25
|
+
embeddings = kit.embed_text("Hello world")  # or a list: kit.embed_text(["Hello world", "Hello world"])
|
26
|
+
embeddings = kit.embed_image("path/to/image.png")  # or a list: kit.embed_image(["path/to/image1.png", "path/to/image2.png"])
|
27
|
+
embeddings = kit.embed_pdf("path/to/pdf.pdf") # Single PDF only
|
28
|
+
```
|
29
|
+
|
30
|
+
## License
|
31
|
+
|
32
|
+
MIT
|
embedkit-0.1.0/main.py
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# ./main.py
|
2
|
+
from embedkit import EmbedKit
|
3
|
+
from embedkit.models import Model
|
4
|
+
from embedkit.providers.cohere import CohereInputType
|
5
|
+
from pathlib import Path
|
6
|
+
import os
|
7
|
+
|
8
|
+
|
9
|
+
def get_online_image(url: str) -> Path:
    """Download an image from a URL and return its local path.

    Args:
        url: Address of the image to fetch.

    Returns:
        Path of a temporary file (suffix ``.png``) holding the response body.
        The file is created with ``delete=False``, so it is not removed
        automatically; the caller owns it.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests`` exceptions,
            including timeouts, propagate unchanged.
    """
    import requests
    from tempfile import NamedTemporaryFile

    # Add User-Agent header to comply with Wikipedia's policy
    headers = {"User-Agent": "EmbedKit-Example/1.0"}

    # requests applies no timeout by default, so an unresponsive server would
    # block this call forever; bound the wait explicitly.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # The context manager closes the handle even if the write fails.
    with NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
        temp_file.write(response.content)

    return Path(temp_file.name)
|
25
|
+
|
26
|
+
|
27
|
+
def get_sample_image() -> Path:
    """Download the sample image used by this example and return its local path."""
    return get_online_image(
        "https://upload.wikimedia.org/wikipedia/commons/b/b8/English_Wikipedia_HomePage_2001-12-20.png"
    )
|
31
|
+
|
32
|
+
|
33
|
+
# Inputs shared by both provider demos below.
sample_image = get_sample_image()
sample_pdf = Path("tests/fixtures/2407.01449v6_p1.pdf")

# --- ColPali: every result is checked to be rank 3 with a leading size of 1 ---
kit = EmbedKit.colpali(model=Model.ColPali.V1_3)

for _embed, _sample in (
    (kit.embed_text, "Hello world"),
    (kit.embed_image, sample_image),
    (kit.embed_pdf, sample_pdf),
):
    embeddings = _embed(_sample)
    assert embeddings.shape[0] == 1
    assert len(embeddings.shape) == 3


# --- Cohere, query input type: the result is checked to be rank 2 ---
kit = EmbedKit.cohere(
    model=Model.Cohere.EMBED_V4_0,
    api_key=os.getenv("COHERE_API_KEY"),
    text_input_type=CohereInputType.SEARCH_QUERY,
)

embeddings = kit.embed_text("Hello world")
assert embeddings.shape[0] == 1
assert len(embeddings.shape) == 2

# --- Cohere, document input type: text, image and PDF are all checked ---
kit = EmbedKit.cohere(
    model=Model.Cohere.EMBED_V4_0,
    api_key=os.getenv("COHERE_API_KEY"),
    text_input_type=CohereInputType.SEARCH_DOCUMENT,
)

for _embed, _sample in (
    (kit.embed_text, "Hello world"),
    (kit.embed_image, sample_image),
    (kit.embed_pdf, sample_pdf),
):
    embeddings = _embed(_sample)
    assert embeddings.shape[0] == 1
    assert len(embeddings.shape) == 2
|
@@ -0,0 +1,55 @@
|
|
1
|
+
[project]
|
2
|
+
name = "embedkit"
|
3
|
+
version = "0.1.0"
|
4
|
+
description = "A simple toolkit for generating vector embeddings across multiple providers and models"
|
5
|
+
readme = "README.md"
|
6
|
+
requires-python = ">=3.10"
|
7
|
+
dependencies = [
|
8
|
+
"accelerate>=1.7.0",
|
9
|
+
"cohere>=5.15.0",
|
10
|
+
"colpali-engine>=0.3.0,<0.4.0",
|
11
|
+
"pdf2image>=1.17.0",
|
12
|
+
"pillow>=11.2.1",
|
13
|
+
"torch<=2.5",
|
14
|
+
"transformers",
|
15
|
+
]
|
16
|
+
authors = [
|
17
|
+
{name = "JP Hwang", email = "me@jphwang.com"},
|
18
|
+
]
|
19
|
+
license = {text = "MIT"}
|
20
|
+
classifiers = [
|
21
|
+
"Development Status :: 4 - Beta",
|
22
|
+
"Intended Audience :: Developers",
|
23
|
+
"License :: OSI Approved :: MIT License",
|
24
|
+
"Programming Language :: Python :: 3",
|
25
|
+
"Programming Language :: Python :: 3.10",
|
26
|
+
"Programming Language :: Python :: 3.11",
|
27
|
+
"Programming Language :: Python :: 3.12",
|
28
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
29
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
30
|
+
]
|
31
|
+
keywords = ["embeddings", "vector", "ai", "machine-learning", "cohere", "colpali"]
|
32
|
+
|
33
|
+
[build-system]
|
34
|
+
requires = ["hatchling"]
|
35
|
+
build-backend = "hatchling.build"
|
36
|
+
|
37
|
+
[tool.hatch.build.targets.wheel]
|
38
|
+
packages = ["src/embedkit"]
|
39
|
+
|
40
|
+
[tool.pytest.ini_options]
|
41
|
+
testpaths = ["tests"]
|
42
|
+
python_files = ["test_*.py"]
|
43
|
+
addopts = "-v --cov=embedkit --cov-report=term-missing"
|
44
|
+
filterwarnings = [
|
45
|
+
"ignore::DeprecationWarning",
|
46
|
+
"ignore::UserWarning",
|
47
|
+
]
|
48
|
+
|
49
|
+
[tool.uv]
|
50
|
+
dev-dependencies = [
|
51
|
+
"build>=1.2.2.post1",
|
52
|
+
"pytest>=8.0.0",
|
53
|
+
"pytest-cov>=4.1.0",
|
54
|
+
"twine>=6.1.0",
|
55
|
+
]
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# ./src/embedkit/__init__.py
|
2
|
+
"""
|
3
|
+
EmbedKit: A unified toolkit for generating vector embeddings.
|
4
|
+
"""
|
5
|
+
|
6
|
+
from typing import Union, List, Optional
|
7
|
+
from pathlib import Path
|
8
|
+
import numpy as np
|
9
|
+
|
10
|
+
from .models import Model
|
11
|
+
from .base import EmbeddingError, EmbeddingResult
|
12
|
+
from .providers import ColPaliProvider, CohereProvider
|
13
|
+
from .providers.cohere import CohereInputType
|
14
|
+
|
15
|
+
|
16
|
+
class EmbedKit:
    """Main interface for generating embeddings.

    Wraps one provider instance and forwards every ``embed_*`` call to it.
    Create instances through the ``colpali`` / ``cohere`` class methods.
    """

    def __init__(self, provider_instance):
        """
        Initialize EmbedKit with a provider instance.

        Args:
            provider_instance: An initialized provider (use class methods to create)
        """
        self._provider = provider_instance

    @classmethod
    def colpali(
        cls, model: Model = Model.ColPali.V1_3, device: Optional[str] = None
    ) -> "EmbedKit":
        """
        Create EmbedKit instance with ColPali provider.

        Args:
            model: ColPali model enum
            device: Device to run on ('cuda', 'mps', 'cpu', or None for auto-detect)

        Returns:
            An EmbedKit wrapping a ``ColPaliProvider``.

        Raises:
            ValueError: If ``model`` is not a supported ColPali model.
        """
        # Translate the enum member into the model name passed to the provider.
        if model == Model.ColPali.V1_3:
            model_name = "vidore/colpali-v1.3"
        else:
            raise ValueError(f"Unsupported model: {model}")

        provider = ColPaliProvider(model_name=model_name, device=device)
        return cls(provider)

    @classmethod
    def cohere(
        cls,
        api_key: str,
        model: Model = Model.Cohere.EMBED_V4_0,
        text_input_type: CohereInputType = CohereInputType.SEARCH_DOCUMENT,
    ) -> "EmbedKit":
        """
        Create EmbedKit instance with Cohere provider.

        Args:
            api_key: Cohere API key
            model: Cohere model enum
            text_input_type: Type of input for text embedding
                (search_document or search_query)

        Returns:
            An EmbedKit wrapping a ``CohereProvider``.

        Raises:
            ValueError: If ``api_key`` is empty/None or ``model`` is not a
                supported Cohere model.
        """
        if not api_key:
            raise ValueError("API key is required")

        # Translate the enum member into the model name passed to the provider.
        if model == Model.Cohere.EMBED_V4_0:
            model_name = "embed-v4.0"
        else:
            raise ValueError(f"Unsupported model: {model}")

        provider = CohereProvider(
            api_key=api_key, model_name=model_name, text_input_type=text_input_type
        )
        return cls(provider)

    # Future class methods:
    # @classmethod
    # def openai(cls, api_key: str, model_name: str = "text-embedding-3-large"):
    #     """Create EmbedKit instance with OpenAI provider."""
    #     provider = OpenAIProvider(api_key=api_key, model_name=model_name)
    #     return cls(provider)
    #
    # @classmethod
    # def huggingface(cls, model_name: str = "all-MiniLM-L6-v2", device: Optional[str] = None):
    #     """Create EmbedKit instance with HuggingFace provider."""
    #     provider = HuggingFaceProvider(model_name=model_name, device=device)
    #     return cls(provider)

    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
        """Generate document text embeddings using the configured provider.

        Args:
            texts: Text or list of texts to embed
            **kwargs: Additional provider-specific arguments

        Returns:
            EmbeddingResult containing the embeddings
        """
        return self._provider.embed_text(texts, **kwargs)

    def embed_image(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> EmbeddingResult:
        """Generate image embeddings using the configured provider.

        Args:
            images: Path to one image, or a list of image paths

        Returns:
            EmbeddingResult containing the embeddings
        """
        return self._provider.embed_image(images)

    def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
        """Generate image embeddings from a PDF using the configured provider.

        Args:
            pdf: Path to a single PDF file

        Returns:
            EmbeddingResult containing the embeddings
        """
        return self._provider.embed_pdf(pdf)

    @property
    def provider_info(self) -> str:
        """Get information about the current provider (its class name)."""
        return self._provider.__class__.__name__
|
113
|
+
|
114
|
+
|
115
|
+
# Main exports
|
116
|
+
__version__ = "0.1.0"
|
117
|
+
# EmbeddingResult is what every embed_* method returns and CohereInputType is
# needed to call EmbedKit.cohere, so both belong to the public surface.
__all__ = ["EmbedKit", "Model", "EmbeddingError", "EmbeddingResult", "CohereInputType"]
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# ./src/embedkit/base.py
|
2
|
+
"""Base classes for EmbedKit."""
|
3
|
+
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from typing import Union, List
|
6
|
+
from pathlib import Path
|
7
|
+
import numpy as np
|
8
|
+
from dataclasses import dataclass
|
9
|
+
|
10
|
+
|
11
|
+
@dataclass
class EmbeddingResult:
    """Embedding array bundled with metadata describing how it was produced."""

    # The embedding values themselves.
    embeddings: np.ndarray
    # Descriptive metadata; the concrete values are chosen by whichever
    # provider builds the result (not visible in this module).
    model_name: str
    model_provider: str
    input_type: str

    @property
    def shape(self) -> tuple:
        """Shape of the wrapped ``embeddings`` array."""
        array = self.embeddings
        return array.shape
|
21
|
+
|
22
|
+
|
23
|
+
class EmbeddingProvider(ABC):
    """Interface every embedding backend has to implement."""

    @abstractmethod
    def embed_text(self, texts: Union[str, List[str]], **kwargs) -> EmbeddingResult:
        """Embed one text or a list of texts.

        Args:
            texts: A single string or a list of strings.
            **kwargs: Provider-specific options.
        """

    @abstractmethod
    def embed_image(
        self, images: Union[Path, str, List[Union[Path, str]]]
    ) -> EmbeddingResult:
        """Embed one image path or a list of image paths."""

    @abstractmethod
    def embed_pdf(self, pdf: Union[Path, str]) -> EmbeddingResult:
        """Embed the contents of a single PDF file given by its path."""
|
44
|
+
|
45
|
+
|
46
|
+
class EmbeddingError(Exception):
    """Base class for errors raised while generating embeddings."""
|