cocoindex 0.2.15__cp311-abi3-manylinux_2_28_aarch64.whl → 0.2.17__cp311-abi3-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +1 -1
- cocoindex/cli.py +121 -41
- cocoindex/engine_object.py +272 -0
- cocoindex/{convert.py → engine_value.py} +64 -208
- cocoindex/flow.py +17 -10
- cocoindex/functions/__init__.py +45 -0
- cocoindex/functions/_engine_builtin_specs.py +62 -0
- cocoindex/functions/colpali.py +250 -0
- cocoindex/functions/sbert.py +63 -0
- cocoindex/lib.py +1 -1
- cocoindex/op.py +7 -3
- cocoindex/sources/__init__.py +5 -0
- cocoindex/{sources.py → sources/_engine_builtin_specs.py} +3 -3
- cocoindex/targets/_engine_builtin_specs.py +9 -0
- cocoindex/tests/test_engine_object.py +331 -0
- cocoindex/tests/{test_convert.py → test_engine_value.py} +150 -26
- cocoindex/typing.py +125 -3
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/METADATA +4 -1
- cocoindex-0.2.17.dist-info/RECORD +43 -0
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/WHEEL +1 -1
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/licenses/THIRD_PARTY_NOTICES.html +22 -19
- cocoindex/tests/test_load_convert.py +0 -118
- cocoindex-0.2.15.dist-info/RECORD +0 -37
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,250 @@
|
|
1
|
+
"""ColPali image and query embedding functions for multimodal document retrieval."""
|
2
|
+
|
3
|
+
import functools
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import Any, Optional, TYPE_CHECKING, Literal
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
from .. import op
|
9
|
+
from ..typing import Vector
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
import torch
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass
class ColPaliModelInfo:
    """Bundle of a loaded ColVision model and the objects needed to run it.

    Built by the cached loader in this module and stored on the image and
    query executors as ``_model_info``.
    """

    # Loaded model object; typed as Any because the concrete class comes
    # from the optional colpali_engine dependency.
    model: Any
    # Processor loaded from the same model name as ``model``.
    processor: Any
    # Device string the model was loaded onto ("cuda" or "cpu").
    device: Any
    # Size of the last axis of the embeddings the model emits.
    dimension: int
|
23
|
+
|
24
|
+
|
25
|
+
@functools.lru_cache(maxsize=None)
def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
    """Load a ColVision model and its processor, memoised per model name.

    The model family is chosen by a case-insensitive substring match on
    ``model_name``; names that match none of the known families are loaded
    with the ColPali classes. The model is loaded in bfloat16 onto CUDA when
    available, otherwise onto the CPU.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``ColPaliModelInfo`` holding the model, processor, device and the
        detected embedding dimension.

    Raises:
        ImportError: If ``colpali_engine`` or ``torch`` cannot be imported.
    """
    try:
        from colpali_engine import (  # type: ignore[import-untyped]
            ColPali,
            ColPaliProcessor,
            ColQwen2,
            ColQwen2Processor,
            ColSmol,
            ColSmolProcessor,
        )
        import torch
    except ImportError as e:
        raise ImportError(
            "ColPali support requires the optional 'colpali' dependency. "
            "Install it with: pip install 'cocoindex[colpali]'"
        ) from e

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # (substring of the lowercased name, model class, processor class).
    # Entries are tried in order and the first match wins.
    # NOTE(review): names containing "colqwen2.5" also match "colqwen" and are
    # therefore loaded with ColQwen2 - confirm that is the intended class.
    known_families = (
        ("colpali", ColPali, ColPaliProcessor),
        ("colqwen", ColQwen2, ColQwen2Processor),
        ("colsmol", ColSmol, ColSmolProcessor),
    )
    lowered_name = model_name.lower()
    model_cls, processor_cls = next(
        (
            (candidate_model, candidate_processor)
            for key, candidate_model, candidate_processor in known_families
            if key in lowered_name
        ),
        # Fallback to ColPali for backwards compatibility
        (ColPali, ColPaliProcessor),
    )

    model = model_cls.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, device_map=device
    )
    processor = processor_cls.from_pretrained(model_name)

    return ColPaliModelInfo(
        model=model,
        processor=processor,
        dimension=_detect_colpali_dimension(model, processor, device),
        device=device,
    )
|
78
|
+
|
79
|
+
|
80
|
+
def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
|
81
|
+
"""Detect ColPali embedding dimension from the actual model config."""
|
82
|
+
# Try to access embedding dimension
|
83
|
+
if hasattr(model.config, "embedding_dim"):
|
84
|
+
dim = model.config.embedding_dim
|
85
|
+
else:
|
86
|
+
# Fallback: infer from output shape with dummy data
|
87
|
+
from PIL import Image
|
88
|
+
import numpy as np
|
89
|
+
import torch
|
90
|
+
|
91
|
+
dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
|
92
|
+
# Use the processor to process the dummy image
|
93
|
+
processed = processor.process_images([dummy_img]).to(device)
|
94
|
+
with torch.no_grad():
|
95
|
+
output = model(**processed)
|
96
|
+
dim = int(output.shape[-1])
|
97
|
+
if isinstance(dim, int):
|
98
|
+
return dim
|
99
|
+
else:
|
100
|
+
raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
|
101
|
+
return dim
|
102
|
+
|
103
|
+
|
104
|
+
class ColPaliEmbedImage(op.FunctionSpec):
    """
    Spec for embedding an image with a ColVision multimodal model.

    Model families available through the colpali-engine library include:
    - ColPali (colpali-*): PaliGemma-based, best for general document retrieval
    - ColQwen2 (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
    - ColSmol (colsmol-*): lightweight, good for resource-constrained environments
    - Any future ColVision models supported by colpali-engine

    These models score relevance by late interaction between image patch
    embeddings and text token embeddings.

    Args:
        model: Name of a ColVision model supported by colpali-engine
            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0").
            See https://github.com/illuin-tech/colpali for the complete list of supported models.

    Note:
        Requires the optional colpali-engine dependency.
        Install it with: pip install 'cocoindex[colpali]'
    """

    model: str
|
128
|
+
|
129
|
+
|
130
|
+
@op.executor_class(
    gpu=True,
    cache=True,
    behavior_version=1,
)
class ColPaliEmbedImageExecutor:
    """Executor that turns image bytes into ColVision patch-level embeddings."""

    spec: ColPaliEmbedImage
    _model_info: ColPaliModelInfo

    def analyze(self) -> type:
        """Fetch the shared model info and declare the output type.

        Returns:
            A nested ``Vector`` type: an unsized outer vector (one entry per
            patch) of float32 vectors whose length is the model's dimension.
        """
        info = _get_colpali_model_and_processor(self.spec.model)
        self._model_info = info

        hidden_dim = info.dimension
        return Vector[Vector[np.float32, Literal[hidden_dim]]]  # type: ignore

    def __call__(self, img_bytes: bytes) -> Any:
        """Embed one encoded image.

        Args:
            img_bytes: Encoded image data readable by ``PIL.Image.open``.

        Returns:
            A float32 NumPy array taken from the first (only) batch entry of
            the model output.

        Raises:
            ImportError: If PIL or torch cannot be imported.
            ValueError: If the model output is not a 3D tensor.
        """
        try:
            from PIL import Image
            import torch
            import io
        except ImportError as e:
            raise ImportError(
                "Required dependencies (PIL, torch) are missing for ColVision image embedding."
            ) from e

        info = self._model_info
        model, processor, device = info.model, info.processor, info.device

        decoded = Image.open(io.BytesIO(img_bytes))
        rgb_image = decoded.convert("RGB")
        batch = processor.process_images([rgb_image]).to(device)
        with torch.no_grad():
            output = model(**batch)

        # The model is expected to yield [batch, patches, hidden_dim].
        if len(output.shape) != 3:
            raise ValueError(
                f"Expected 3D tensor [batch, patches, hidden_dim], got shape {output.shape}"
            )

        # Drop the batch axis, then move to CPU and upcast before converting.
        first_item = output[0]
        on_cpu = first_item.cpu()
        return on_cpu.to(torch.float32).numpy()
|
178
|
+
|
179
|
+
|
180
|
+
class ColPaliEmbedQuery(op.FunctionSpec):
    """
    Spec for embedding a text query with a ColVision multimodal model.

    Model families available through the colpali-engine library include:
    - ColPali (colpali-*): PaliGemma-based, best for general document retrieval
    - ColQwen2 (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
    - ColSmol (colsmol-*): lightweight, good for resource-constrained environments
    - Any future ColVision models supported by colpali-engine

    The resulting query embeddings are compatible with ColVision image
    embeddings for late interaction scoring (MaxSim).

    Args:
        model: Name of a ColVision model supported by colpali-engine
            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0").
            See https://github.com/illuin-tech/colpali for the complete list of supported models.

    Note:
        Requires the optional colpali-engine dependency.
        Install it with: pip install 'cocoindex[colpali]'
    """

    model: str
|
204
|
+
|
205
|
+
|
206
|
+
@op.executor_class(
    gpu=True,
    cache=True,
    behavior_version=1,
)
class ColPaliEmbedQueryExecutor:
    """Executor that turns a text query into ColVision token-level embeddings."""

    spec: ColPaliEmbedQuery
    _model_info: ColPaliModelInfo

    def analyze(self) -> type:
        """Fetch the shared model info and declare the output type.

        Returns:
            A nested ``Vector`` type: an unsized outer vector (one entry per
            token) of float32 vectors whose length is the model's dimension.
        """
        info = _get_colpali_model_and_processor(self.spec.model)
        self._model_info = info

        hidden_dim = info.dimension
        return Vector[Vector[np.float32, Literal[hidden_dim]]]  # type: ignore

    def __call__(self, query: str) -> Any:
        """Embed one query string.

        Args:
            query: Text passed to the processor's ``process_queries``.

        Returns:
            A float32 NumPy array taken from the first (only) batch entry of
            the model output.

        Raises:
            ImportError: If torch cannot be imported.
            ValueError: If the model output is not a 3D tensor.
        """
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Required dependencies (torch) are missing for ColVision query embedding."
            ) from e

        info = self._model_info
        model, processor, device = info.model, info.processor, info.device

        batch = processor.process_queries([query]).to(device)
        with torch.no_grad():
            output = model(**batch)

        # The model is expected to yield [batch, tokens, hidden_dim].
        if len(output.shape) != 3:
            raise ValueError(
                f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {output.shape}"
            )

        # Drop the batch axis, then move to CPU and upcast before converting.
        first_item = output[0]
        on_cpu = first_item.cpu()
        return on_cpu.to(torch.float32).numpy()
|
@@ -0,0 +1,63 @@
|
|
1
|
+
"""SentenceTransformer embedding functionality."""
|
2
|
+
|
3
|
+
import dataclasses
|
4
|
+
from typing import Any, Literal
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from numpy.typing import NDArray
|
8
|
+
|
9
|
+
from .. import op
|
10
|
+
from ..typing import Vector
|
11
|
+
|
12
|
+
|
13
|
+
class SentenceTransformerEmbed(op.FunctionSpec):
    """
    Spec for embedding text into a vector space with the
    [SentenceTransformer](https://huggingface.co/sentence-transformers) library.

    Args:
        model: The name of the SentenceTransformer model to use.
        args: Additional keyword arguments forwarded to the SentenceTransformer
            constructor, e.g. {"trust_remote_code": True}.

    Note:
        Requires the optional sentence-transformers dependency.
        Install it with: pip install 'cocoindex[embeddings]'
    """

    model: str
    args: dict[str, Any] | None = None
|
29
|
+
|
30
|
+
|
31
|
+
@op.executor_class(
    gpu=True,
    cache=True,
    behavior_version=1,
    arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
)
class SentenceTransformerEmbedExecutor:
    """Executor that loads a SentenceTransformer model and encodes text with it."""

    spec: SentenceTransformerEmbed
    _model: Any | None = None

    def analyze(self) -> type:
        """Instantiate the model and declare the output vector type.

        Returns:
            A float32 ``Vector`` type sized by the model's reported sentence
            embedding dimension.

        Raises:
            ImportError: If ``sentence_transformers`` is not installed.
        """
        try:
            # Imported lazily because importing sentence_transformers is very slow.
            import sentence_transformers  # pylint: disable=import-outside-toplevel
        except ImportError as e:
            raise ImportError(
                "sentence_transformers is required for SentenceTransformerEmbed function. "
                "Install it with one of these commands:\n"
                "  pip install 'cocoindex[embeddings]'\n"
                "  pip install sentence-transformers"
            ) from e

        extra_kwargs = self.spec.args or {}
        encoder = sentence_transformers.SentenceTransformer(
            self.spec.model, **extra_kwargs
        )
        self._model = encoder
        width = encoder.get_sentence_embedding_dimension()
        return Vector[np.float32, Literal[width]]  # type: ignore

    def __call__(self, text: str) -> NDArray[np.float32]:
        """Encode ``text`` and return the embedding as a NumPy array."""
        encoder = self._model
        # analyze() must have populated the model before any call.
        assert encoder is not None
        embedding: NDArray[np.float32] = encoder.encode(text, convert_to_numpy=True)
        return embedding
|
cocoindex/lib.py
CHANGED
@@ -7,7 +7,7 @@ import warnings
|
|
7
7
|
|
8
8
|
from . import _engine # type: ignore
|
9
9
|
from . import flow, setting
|
10
|
-
from .
|
10
|
+
from .engine_object import dump_engine_object
|
11
11
|
from .validation import validate_app_namespace_name
|
12
12
|
from typing import Any, Callable, overload
|
13
13
|
|
cocoindex/op.py
CHANGED
@@ -17,9 +17,8 @@ from typing import (
|
|
17
17
|
|
18
18
|
from . import _engine # type: ignore
|
19
19
|
from .subprocess_exec import executor_stub
|
20
|
-
from .
|
21
|
-
|
22
|
-
load_engine_object,
|
20
|
+
from .engine_object import dump_engine_object, load_engine_object
|
21
|
+
from .engine_value import (
|
23
22
|
make_engine_value_encoder,
|
24
23
|
make_engine_value_decoder,
|
25
24
|
make_engine_key_decoder,
|
@@ -47,6 +46,7 @@ class OpCategory(Enum):
|
|
47
46
|
SOURCE = "source"
|
48
47
|
TARGET = "target"
|
49
48
|
DECLARATION = "declaration"
|
49
|
+
TARGET_ATTACHMENT = "target_attachment"
|
50
50
|
|
51
51
|
|
52
52
|
@dataclass_transform()
|
@@ -82,6 +82,10 @@ class TargetSpec(metaclass=SpecMeta, category=OpCategory.TARGET): # pylint: dis
|
|
82
82
|
"""A target spec. All its subclass can be instantiated similar to a dataclass, i.e. ClassName(field1=value1, field2=value2, ...)"""
|
83
83
|
|
84
84
|
|
85
|
+
class TargetAttachmentSpec(metaclass=SpecMeta, category=OpCategory.TARGET_ATTACHMENT):  # pylint: disable=too-few-public-methods
    """Base class for target attachment specs.

    Every subclass can be instantiated in the same way as a dataclass,
    i.e. ClassName(field1=value1, field2=value2, ...).
    """
|
87
|
+
|
88
|
+
|
85
89
|
class DeclarationSpec(metaclass=SpecMeta, category=OpCategory.DECLARATION): # pylint: disable=too-few-public-methods
|
86
90
|
"""A declaration spec. All its subclass can be instantiated similar to a dataclass, i.e. ClassName(field1=value1, field2=value2, ...)"""
|
87
91
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
"""All builtin sources."""
|
2
2
|
|
3
|
-
from
|
4
|
-
from
|
5
|
-
from
|
3
|
+
from .. import op
|
4
|
+
from ..auth_registry import TransientAuthEntryReference
|
5
|
+
from ..setting import DatabaseConnectionSpec
|
6
6
|
from dataclasses import dataclass
|
7
7
|
import datetime
|
8
8
|
|
@@ -14,6 +14,15 @@ class Postgres(op.TargetSpec):
|
|
14
14
|
|
15
15
|
database: AuthEntryReference[DatabaseConnectionSpec] | None = None
|
16
16
|
table_name: str | None = None
|
17
|
+
schema: str | None = None
|
18
|
+
|
19
|
+
|
20
|
+
class PostgresSqlCommand(op.TargetAttachmentSpec):
    """Target attachment that executes the given SQL statements for a Postgres target."""

    # Name of this attachment.
    name: str
    # SQL for setup.
    # NOTE(review): when the engine runs it is not visible here - confirm.
    setup_sql: str
    # Optional SQL for teardown; None when no teardown statement is supplied.
    teardown_sql: str | None = None
|
17
26
|
|
18
27
|
|
19
28
|
@dataclass
|