cocoindex 0.2.18__cp311-abi3-manylinux_2_28_aarch64.whl → 0.2.20__cp311-abi3-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/llm.py +1 -0
- cocoindex/sources/_engine_builtin_specs.py +3 -0
- {cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/METADATA +15 -8
- {cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/RECORD +8 -9
- {cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/licenses/THIRD_PARTY_NOTICES.html +1 -1
- cocoindex/functions.py +0 -375
- {cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/WHEEL +0 -0
- {cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/entry_points.txt +0 -0
cocoindex/_engine.abi3.so
CHANGED
Binary file
|
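cocoindex/llm.py
CHANGED
(The +1 -0 hunk for this file is missing from this capture.)
cocoindex/sources/_engine_builtin_specs.py
CHANGED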
@@ -100,3 +100,6 @@ class Postgres(op.SourceSpec):
|
|
100
100
|
|
101
101
|
# Optional: when set, supports change capture from PostgreSQL notification.
|
102
102
|
notification: PostgresNotification | None = None
|
103
|
+
|
104
|
+
# Optional: SQL expression filter for rows (arbitrary SQL boolean expression)
|
105
|
+
filter: str | None = None
|
{cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cocoindex
|
3
|
-
Version: 0.2.18
|
3
|
+
Version: 0.2.20
|
4
4
|
Classifier: Development Status :: 3 - Alpha
|
5
5
|
Classifier: License :: OSI Approved :: Apache Software License
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -75,7 +75,6 @@ Project-URL: Homepage, https://cocoindex.io/
|
|
75
75
|
<a href="https://trendshift.io/repositories/13939" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13939" alt="cocoindex-io%2Fcocoindex | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
76
76
|
</div>
|
77
77
|
|
78
|
-
|
79
78
|
Ultra performant data transformation framework for AI, with core engine written in Rust. Support incremental processing and data lineage out-of-box. Exceptional developer velocity. Production-ready at day 0.
|
80
79
|
|
81
80
|
⭐ Drop a star to help us grow!
|
@@ -113,9 +112,8 @@ CocoIndex makes it effortless to transform data with AI, and keep source data an
|
|
113
112
|
|
114
113
|
</br>
|
115
114
|
|
116
|
-
|
117
|
-
|
118
115
|
## Exceptional velocity
|
116
|
+
|
119
117
|
Just declare transformation in dataflow with ~100 lines of python
|
120
118
|
|
121
119
|
```python
|
@@ -139,6 +137,7 @@ CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_
|
|
139
137
|
**Particularly**, developers don't explicitly mutate data by creating, updating and deleting. They just need to define transformation/formula for a set of source data.
|
140
138
|
|
141
139
|
## Plug-and-Play Building Blocks
|
140
|
+
|
142
141
|
Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components - as easy as assembling building blocks.
|
143
142
|
|
144
143
|
<p align="center">
|
@@ -146,6 +145,7 @@ Native builtins for different source, targets and transformations. Standardize i
|
|
146
145
|
</p>
|
147
146
|
|
148
147
|
## Data Freshness
|
148
|
+
|
149
149
|
CocoIndex keep source data and target in sync effortlessly.
|
150
150
|
|
151
151
|
<p align="center">
|
@@ -153,11 +153,14 @@ CocoIndex keep source data and target in sync effortlessly.
|
|
153
153
|
</p>
|
154
154
|
|
155
155
|
It has out-of-box support for incremental indexing:
|
156
|
+
|
156
157
|
- minimal recomputation on source or logic change.
|
157
158
|
- (re-)processing necessary portions; reuse cache when possible
|
158
159
|
|
159
|
-
## Quick Start
|
160
|
+
## Quick Start
|
161
|
+
|
160
162
|
If you're new to CocoIndex, we recommend checking out
|
163
|
+
|
161
164
|
- 📖 [Documentation](https://cocoindex.io/docs)
|
162
165
|
- ⚡ [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart)
|
163
166
|
- 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
|
@@ -172,7 +175,6 @@ pip install -U cocoindex
|
|
172
175
|
|
173
176
|
2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. CocoIndex uses it for incremental processing.
|
174
177
|
|
175
|
-
|
176
178
|
## Define data flow
|
177
179
|
|
178
180
|
Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow. An example flow looks like:
|
@@ -228,6 +230,7 @@ It defines an index flow like this:
|
|
228
230
|
| [Text Embedding](examples/text_embedding) | Index text documents with embeddings for semantic search |
|
229
231
|
| [Code Embedding](examples/code_embedding) | Index code embeddings for semantic search |
|
230
232
|
| [PDF Embedding](examples/pdf_embedding) | Parse PDF and index text embeddings for semantic search |
|
233
|
+
| [PDF Elements Embedding](examples/pdf_elements_embedding) | Extract text and images from PDFs; embed text with SentenceTransformers and images with CLIP; store in Qdrant for multimodal search |
|
231
234
|
| [Manuals LLM Extraction](examples/manuals_llm_extraction) | Extract structured information from a manual using LLM |
|
232
235
|
| [Amazon S3 Embedding](examples/amazon_s3_embedding) | Index text documents from Amazon S3 |
|
233
236
|
| [Azure Blob Storage Embedding](examples/azure_blob_embedding) | Index text documents from Azure Blob Storage |
|
@@ -244,16 +247,18 @@ It defines an index flow like this:
|
|
244
247
|
| [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
|
245
248
|
| [Patient intake form extraction](examples/patient_intake_extraction) | Use LLM to extract structured data from patient intake forms with different formats |
|
246
249
|
|
247
|
-
|
248
250
|
More coming and stay tuned 👀!
|
249
251
|
|
250
252
|
## 📖 Documentation
|
253
|
+
|
251
254
|
For detailed documentation, visit [CocoIndex Documentation](https://cocoindex.io/docs), including a [Quickstart guide](https://cocoindex.io/docs/getting_started/quickstart).
|
252
255
|
|
253
256
|
## 🤝 Contributing
|
257
|
+
|
254
258
|
We love contributions from our community ❤️. For details on contributing or running the project for development, check out our [contributing guide](https://cocoindex.io/docs/about/contributing).
|
255
259
|
|
256
260
|
## 👥 Community
|
261
|
+
|
257
262
|
Welcome with a huge coconut hug 🥥⋆。˚🤗. We are super excited for community contributions of all kinds - whether it's code improvements, documentation updates, issue reports, feature requests, and discussions in our Discord.
|
258
263
|
|
259
264
|
Join our community here:
|
@@ -263,9 +268,11 @@ Join our community here:
|
|
263
268
|
- ▶️ [Subscribe to our YouTube channel](https://www.youtube.com/@cocoindex-io)
|
264
269
|
- 📜 [Read our blog posts](https://cocoindex.io/blogs/)
|
265
270
|
|
266
|
-
## Support us
|
271
|
+
## Support us
|
272
|
+
|
267
273
|
We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ at GitHub repo [](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
|
268
274
|
|
269
275
|
## License
|
276
|
+
|
270
277
|
CocoIndex is Apache 2.0 licensed.
|
271
278
|
|
{cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/RECORD
CHANGED
@@ -1,22 +1,21 @@
|
|
1
|
-
cocoindex-0.2.
|
2
|
-
cocoindex-0.2.18.dist-info/WHEEL,sha256=T94Vf-8hBLuJYmQaKIvspCD375-5CHbUeNmaNVtwQwY,108
|
3
|
-
cocoindex-0.2.18.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
|
4
|
-
cocoindex-0.2.
|
1
|
+
cocoindex-0.2.20.dist-info/METADATA,sha256=PMLqa8rFhhAtRQCDWSvUSQbKy3vLYHdHatftA49W0e4,13644
|
2
|
+
cocoindex-0.2.20.dist-info/WHEEL,sha256=T94Vf-8hBLuJYmQaKIvspCD375-5CHbUeNmaNVtwQwY,108
|
3
|
+
cocoindex-0.2.20.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
|
4
|
+
cocoindex-0.2.20.dist-info/licenses/THIRD_PARTY_NOTICES.html,sha256=SJ-7q0eqT40cFyT1cXqQkxWocFEuLT6PrETn5dhxiX8,719620
|
5
5
|
cocoindex/__init__.py,sha256=6qZWVkK4WZ01BIAg3CPh_bRRdA6Clk4d4Q6OnZ2jFa4,2630
|
6
|
-
cocoindex/_engine.abi3.so,sha256=
|
6
|
+
cocoindex/_engine.abi3.so,sha256=YiQOMxjygiJrhOFTPvtHZ_mpvpfvDRy2KEhrRG_XwBQ,74720848
|
7
7
|
cocoindex/auth_registry.py,sha256=g-uLDWLYW5NMbYe7q4Y-sU5dSyrlJXBEciyWtAiP9KE,1340
|
8
8
|
cocoindex/cli.py,sha256=19IszBXOzqGn0xOV1SaS-oR9NupTmIm18uzFNET7NTQ,23978
|
9
9
|
cocoindex/engine_object.py,sha256=5YTuWoR3WILhyt3PW-d9es3MAas_xD6tZZqvipN-sjg,10050
|
10
10
|
cocoindex/engine_value.py,sha256=8M7MbwVG2bfd3kFptGGbQHBAp9pD3TVjrBiBDOAhD5M,23211
|
11
11
|
cocoindex/flow.py,sha256=JWPTR2G6TdPJkO5ZlrCcyDyQ8utUS4zZWNR8zsHTeW8,40074
|
12
|
-
cocoindex/functions.py,sha256=V4ljBnCprvA25XlCVvNLwK5ergXiEcKU76jkOGC-X3A,12882
|
13
12
|
cocoindex/functions/__init__.py,sha256=V2IF4h-Cqq4OD_GN3Oqdry-FArORyRCKmqJ7g5UlJr8,1021
|
14
13
|
cocoindex/functions/_engine_builtin_specs.py,sha256=WpCGrjUfJBa8xZP5JiEmA8kLu7fp9Rcs7ynpuJmvSGg,1786
|
15
14
|
cocoindex/functions/colpali.py,sha256=oACyG3qG2dquyCJ6bT7FkMkua5rXDLSxnOHcgoz9waU,8865
|
16
15
|
cocoindex/functions/sbert.py,sha256=1z5OJT-blXT6tVN5vEvEzvYAzOnzs1RCnu1UbCUP6wM,2162
|
17
16
|
cocoindex/index.py,sha256=tz5ilvmOp0BtroGehCQDqWK_pIX9m6ghkhcxsDVU8WE,982
|
18
17
|
cocoindex/lib.py,sha256=spfdU4IbzdffHyGdrQPIw_qGo9aX0OAAboqsjj8bTiQ,2290
|
19
|
-
cocoindex/llm.py,sha256=
|
18
|
+
cocoindex/llm.py,sha256=8ZdJhOmhdb2xEcCxk6rDpnj6hlhCyFBmJdhCNMqAOP4,875
|
20
19
|
cocoindex/op.py,sha256=Ycvr6lJf7hcCCjYUqHtXZqzSeDD-FQdP3_jcmZUV_zI,26896
|
21
20
|
cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
21
|
cocoindex/query_handler.py,sha256=X-SQT71LHiOOXn6-TJlQcGodJk-iT8p_1TcIMvRLBRI,1344
|
@@ -24,7 +23,7 @@ cocoindex/runtime.py,sha256=4NxcltaDZvA3RR3Pnt6gH_f99jcWSyMH_1Xi5BjbtwY,1342
|
|
24
23
|
cocoindex/setting.py,sha256=1Dx8ktjwf-8BiXrbsmfn5Mzudb2SQYqFdRnSNGVKaLk,4960
|
25
24
|
cocoindex/setup.py,sha256=7uIHKN4FOCuoidPXcKyGTrkqpkl9luL49-6UcnMxYzw,3068
|
26
25
|
cocoindex/sources/__init__.py,sha256=Yu9VHNaGlOEE3jpqfIseswsg25Le3HzwDr6XJAn22Ns,78
|
27
|
-
cocoindex/sources/_engine_builtin_specs.py,sha256=
|
26
|
+
cocoindex/sources/_engine_builtin_specs.py,sha256=s4AxMLi2j3ZHmzACVEGAdVe05gY8PRZ_mYMxWR7scDY,3304
|
28
27
|
cocoindex/subprocess_exec.py,sha256=r1xO84uek4VP4I6i87JMwsH5xFm3vKW0ABvgn0jskt4,10088
|
29
28
|
cocoindex/targets/__init__.py,sha256=HQG7I4U0xQhHiYctiUvwEBLxT2727oHP3xwrqotjmhk,78
|
30
29
|
cocoindex/targets/_engine_builtin_specs.py,sha256=glXUN5bj11Jxky1VPvmGnWnMHXTQWEh08INcbldo3F4,3375
|
@@ -40,4 +39,4 @@ cocoindex/typing.py,sha256=so_RusbhBmg_uLoZTY7W_pqU0aIJwFarkTF5NQufl4o,23944
|
|
40
39
|
cocoindex/user_app_loader.py,sha256=bc3Af-gYRxJ9GpObtpjegZY855oQBCv5FGkrkWV2yGY,1873
|
41
40
|
cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
|
42
41
|
cocoindex/validation.py,sha256=PZnJoby4sLbsmPv9fOjOQXuefjfZ7gmtsiTGU8SH-tc,3090
|
43
|
-
cocoindex-0.2.18.dist-info/RECORD,,
|
42
|
+
cocoindex-0.2.20.dist-info/RECORD,,
|
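{cocoindex-0.2.18.dist-info → cocoindex-0.2.20.dist-info}/licenses/THIRD_PARTY_NOTICES.html
CHANGED
@@ -2428,7 +2428,7 @@ Software.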
|
|
2428
2428
|
<h3 id="Apache-2.0">Apache License 2.0</h3>
|
2429
2429
|
<h4>Used by:</h4>
|
2430
2430
|
<ul class="license-used-by">
|
2431
|
-
<li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.18</a></li>
|
2431
|
+
<li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.20</a></li>
|
2432
2432
|
<li><a href=" https://github.com/awesomized/crc-fast-rust ">crc-fast 1.3.0</a></li>
|
2433
2433
|
<li><a href=" https://github.com/qdrant/rust-client ">qdrant-client 1.15.0</a></li>
|
2434
2434
|
</ul>
|
cocoindex/functions.py
DELETED
@@ -1,375 +0,0 @@
|
|
1
|
-
"""All builtin functions."""
|
2
|
-
|
3
|
-
import dataclasses
|
4
|
-
import functools
|
5
|
-
from typing import Any, Literal
|
6
|
-
|
7
|
-
import numpy as np
|
8
|
-
from numpy.typing import NDArray
|
9
|
-
|
10
|
-
from . import llm, op
|
11
|
-
from .typing import Vector
|
12
|
-
|
13
|
-
|
14
|
-
class ParseJson(op.FunctionSpec):
|
15
|
-
"""Parse a text into a JSON object."""
|
16
|
-
|
17
|
-
|
18
|
-
@dataclasses.dataclass
|
19
|
-
class CustomLanguageSpec:
|
20
|
-
"""Custom language specification."""
|
21
|
-
|
22
|
-
language_name: str
|
23
|
-
separators_regex: list[str]
|
24
|
-
aliases: list[str] = dataclasses.field(default_factory=list)
|
25
|
-
|
26
|
-
|
27
|
-
@dataclasses.dataclass
|
28
|
-
class ColPaliModelInfo:
|
29
|
-
"""Data structure for ColPali model and processor."""
|
30
|
-
|
31
|
-
model: Any
|
32
|
-
processor: Any
|
33
|
-
dimension: int
|
34
|
-
device: Any
|
35
|
-
|
36
|
-
|
37
|
-
class SplitRecursively(op.FunctionSpec):
|
38
|
-
"""Split a document (in string) recursively."""
|
39
|
-
|
40
|
-
custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
|
41
|
-
|
42
|
-
|
43
|
-
class SplitBySeparators(op.FunctionSpec):
|
44
|
-
"""
|
45
|
-
Split text by specified regex separators only.
|
46
|
-
Output schema matches SplitRecursively for drop-in compatibility:
|
47
|
-
KTable rows with fields: location (Range), text (Str), start, end.
|
48
|
-
Args:
|
49
|
-
separators_regex: list[str] # e.g., [r"\\n\\n+"]
|
50
|
-
keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
|
51
|
-
include_empty: bool = False
|
52
|
-
trim: bool = True
|
53
|
-
"""
|
54
|
-
|
55
|
-
separators_regex: list[str] = dataclasses.field(default_factory=list)
|
56
|
-
keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
|
57
|
-
include_empty: bool = False
|
58
|
-
trim: bool = True
|
59
|
-
|
60
|
-
|
61
|
-
class EmbedText(op.FunctionSpec):
|
62
|
-
"""Embed a text into a vector space."""
|
63
|
-
|
64
|
-
api_type: llm.LlmApiType
|
65
|
-
model: str
|
66
|
-
address: str | None = None
|
67
|
-
output_dimension: int | None = None
|
68
|
-
task_type: str | None = None
|
69
|
-
api_config: llm.VertexAiConfig | None = None
|
70
|
-
|
71
|
-
|
72
|
-
class ExtractByLlm(op.FunctionSpec):
|
73
|
-
"""Extract information from a text using a LLM."""
|
74
|
-
|
75
|
-
llm_spec: llm.LlmSpec
|
76
|
-
output_type: type
|
77
|
-
instruction: str | None = None
|
78
|
-
|
79
|
-
|
80
|
-
class SentenceTransformerEmbed(op.FunctionSpec):
|
81
|
-
"""
|
82
|
-
`SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
|
83
|
-
|
84
|
-
Args:
|
85
|
-
|
86
|
-
model: The name of the SentenceTransformer model to use.
|
87
|
-
args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True}
|
88
|
-
|
89
|
-
Note:
|
90
|
-
This function requires the optional sentence-transformers dependency.
|
91
|
-
Install it with: pip install 'cocoindex[embeddings]'
|
92
|
-
"""
|
93
|
-
|
94
|
-
model: str
|
95
|
-
args: dict[str, Any] | None = None
|
96
|
-
|
97
|
-
|
98
|
-
@op.executor_class(
|
99
|
-
gpu=True,
|
100
|
-
cache=True,
|
101
|
-
behavior_version=1,
|
102
|
-
arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
|
103
|
-
)
|
104
|
-
class SentenceTransformerEmbedExecutor:
|
105
|
-
"""Executor for SentenceTransformerEmbed."""
|
106
|
-
|
107
|
-
spec: SentenceTransformerEmbed
|
108
|
-
_model: Any | None = None
|
109
|
-
|
110
|
-
def analyze(self) -> type:
|
111
|
-
try:
|
112
|
-
# Only import sentence_transformers locally when it's needed, as its import is very slow.
|
113
|
-
import sentence_transformers # pylint: disable=import-outside-toplevel
|
114
|
-
except ImportError as e:
|
115
|
-
raise ImportError(
|
116
|
-
"sentence_transformers is required for SentenceTransformerEmbed function. "
|
117
|
-
"Install it with one of these commands:\n"
|
118
|
-
" pip install 'cocoindex[embeddings]'\n"
|
119
|
-
" pip install sentence-transformers"
|
120
|
-
) from e
|
121
|
-
|
122
|
-
args = self.spec.args or {}
|
123
|
-
self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
|
124
|
-
dim = self._model.get_sentence_embedding_dimension()
|
125
|
-
return Vector[np.float32, Literal[dim]] # type: ignore
|
126
|
-
|
127
|
-
def __call__(self, text: str) -> NDArray[np.float32]:
|
128
|
-
assert self._model is not None
|
129
|
-
result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
|
130
|
-
return result
|
131
|
-
|
132
|
-
|
133
|
-
@functools.cache
|
134
|
-
def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
|
135
|
-
"""Get or load ColPali model and processor, with caching."""
|
136
|
-
try:
|
137
|
-
from colpali_engine.models import ( # type: ignore[import-untyped]
|
138
|
-
ColPali,
|
139
|
-
ColPaliProcessor,
|
140
|
-
ColQwen2,
|
141
|
-
ColQwen2Processor,
|
142
|
-
ColQwen2_5,
|
143
|
-
ColQwen2_5_Processor,
|
144
|
-
ColIdefics3,
|
145
|
-
ColIdefics3Processor,
|
146
|
-
)
|
147
|
-
from colpali_engine.utils.torch_utils import get_torch_device # type: ignore[import-untyped]
|
148
|
-
import torch
|
149
|
-
except ImportError as e:
|
150
|
-
raise ImportError(
|
151
|
-
"ColVision models are not available. Make sure cocoindex is installed with ColPali support."
|
152
|
-
) from e
|
153
|
-
|
154
|
-
device = get_torch_device("auto")
|
155
|
-
|
156
|
-
# Manual model detection based on model name
|
157
|
-
model_name_lower = model_name.lower()
|
158
|
-
|
159
|
-
try:
|
160
|
-
if "qwen2.5" in model_name_lower:
|
161
|
-
model = ColQwen2_5.from_pretrained(
|
162
|
-
model_name,
|
163
|
-
torch_dtype=torch.bfloat16,
|
164
|
-
device_map=device,
|
165
|
-
).eval()
|
166
|
-
processor = ColQwen2_5_Processor.from_pretrained(model_name)
|
167
|
-
elif "qwen2" in model_name_lower:
|
168
|
-
model = ColQwen2.from_pretrained(
|
169
|
-
model_name,
|
170
|
-
torch_dtype=torch.bfloat16,
|
171
|
-
device_map=device,
|
172
|
-
).eval()
|
173
|
-
processor = ColQwen2Processor.from_pretrained(model_name)
|
174
|
-
elif "colsmol" in model_name_lower or "smol" in model_name_lower:
|
175
|
-
# ColSmol models use Idefics3 architecture
|
176
|
-
model = ColIdefics3.from_pretrained(
|
177
|
-
model_name,
|
178
|
-
torch_dtype=torch.bfloat16,
|
179
|
-
device_map=device,
|
180
|
-
).eval()
|
181
|
-
processor = ColIdefics3Processor.from_pretrained(model_name)
|
182
|
-
else:
|
183
|
-
# Default to ColPali
|
184
|
-
model = ColPali.from_pretrained(
|
185
|
-
model_name,
|
186
|
-
torch_dtype=torch.bfloat16,
|
187
|
-
device_map=device,
|
188
|
-
).eval()
|
189
|
-
processor = ColPaliProcessor.from_pretrained(model_name)
|
190
|
-
|
191
|
-
except Exception as e:
|
192
|
-
raise RuntimeError(f"Failed to load model {model_name}: {e}")
|
193
|
-
|
194
|
-
# Get dimension from the actual model
|
195
|
-
dimension = _detect_colpali_dimension(model, processor, device)
|
196
|
-
|
197
|
-
return ColPaliModelInfo(
|
198
|
-
model=model,
|
199
|
-
processor=processor,
|
200
|
-
dimension=dimension,
|
201
|
-
device=device,
|
202
|
-
)
|
203
|
-
|
204
|
-
|
205
|
-
def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
|
206
|
-
"""Detect ColPali embedding dimension from the actual model config."""
|
207
|
-
# Try to access embedding dimension
|
208
|
-
if hasattr(model.config, "embedding_dim"):
|
209
|
-
dim = model.config.embedding_dim
|
210
|
-
else:
|
211
|
-
# Fallback: infer from output shape with dummy data
|
212
|
-
from PIL import Image
|
213
|
-
import numpy as np
|
214
|
-
import torch
|
215
|
-
|
216
|
-
dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
|
217
|
-
# Use the processor to process the dummy image
|
218
|
-
processed = processor.process_images([dummy_img]).to(device)
|
219
|
-
with torch.no_grad():
|
220
|
-
output = model(**processed)
|
221
|
-
dim = int(output.shape[-1])
|
222
|
-
if isinstance(dim, int):
|
223
|
-
return dim
|
224
|
-
else:
|
225
|
-
raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
|
226
|
-
return dim
|
227
|
-
|
228
|
-
|
229
|
-
class ColPaliEmbedImage(op.FunctionSpec):
|
230
|
-
"""
|
231
|
-
`ColPaliEmbedImage` embeds images using ColVision multimodal models.
|
232
|
-
|
233
|
-
Supports ALL models available in the colpali-engine library, including:
|
234
|
-
- ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
|
235
|
-
- ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
|
236
|
-
- ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
|
237
|
-
- Any future ColVision models supported by colpali-engine
|
238
|
-
|
239
|
-
These models use late interaction between image patch embeddings and text token
|
240
|
-
embeddings for retrieval.
|
241
|
-
|
242
|
-
Args:
|
243
|
-
model: Any ColVision model name supported by colpali-engine
|
244
|
-
(e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
|
245
|
-
See https://github.com/illuin-tech/colpali for the complete list of supported models.
|
246
|
-
|
247
|
-
Note:
|
248
|
-
This function requires the optional colpali-engine dependency.
|
249
|
-
Install it with: pip install 'cocoindex[colpali]'
|
250
|
-
"""
|
251
|
-
|
252
|
-
model: str
|
253
|
-
|
254
|
-
|
255
|
-
@op.executor_class(
|
256
|
-
gpu=True,
|
257
|
-
cache=True,
|
258
|
-
behavior_version=1,
|
259
|
-
)
|
260
|
-
class ColPaliEmbedImageExecutor:
|
261
|
-
"""Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
|
262
|
-
|
263
|
-
spec: ColPaliEmbedImage
|
264
|
-
_model_info: ColPaliModelInfo
|
265
|
-
|
266
|
-
def analyze(self) -> type:
|
267
|
-
# Get shared model and dimension
|
268
|
-
self._model_info = _get_colpali_model_and_processor(self.spec.model)
|
269
|
-
|
270
|
-
# Return multi-vector type: Variable patches x Fixed hidden dimension
|
271
|
-
dimension = self._model_info.dimension
|
272
|
-
return Vector[Vector[np.float32, Literal[dimension]]] # type: ignore
|
273
|
-
|
274
|
-
def __call__(self, img_bytes: bytes) -> Any:
|
275
|
-
try:
|
276
|
-
from PIL import Image
|
277
|
-
import torch
|
278
|
-
import io
|
279
|
-
except ImportError as e:
|
280
|
-
raise ImportError(
|
281
|
-
"Required dependencies (PIL, torch) are missing for ColVision image embedding."
|
282
|
-
) from e
|
283
|
-
|
284
|
-
model = self._model_info.model
|
285
|
-
processor = self._model_info.processor
|
286
|
-
device = self._model_info.device
|
287
|
-
|
288
|
-
pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
|
289
|
-
inputs = processor.process_images([pil_image]).to(device)
|
290
|
-
with torch.no_grad():
|
291
|
-
embeddings = model(**inputs)
|
292
|
-
|
293
|
-
# Return multi-vector format: [patches, hidden_dim]
|
294
|
-
if len(embeddings.shape) != 3:
|
295
|
-
raise ValueError(
|
296
|
-
f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
|
297
|
-
)
|
298
|
-
|
299
|
-
# Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
|
300
|
-
patch_embeddings = embeddings[0] # Remove batch dimension
|
301
|
-
|
302
|
-
return patch_embeddings.cpu().to(torch.float32).numpy()
|
303
|
-
|
304
|
-
|
305
|
-
class ColPaliEmbedQuery(op.FunctionSpec):
|
306
|
-
"""
|
307
|
-
`ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
|
308
|
-
|
309
|
-
Supports ALL models available in the colpali-engine library, including:
|
310
|
-
- ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
|
311
|
-
- ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
|
312
|
-
- ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
|
313
|
-
- Any future ColVision models supported by colpali-engine
|
314
|
-
|
315
|
-
This produces query embeddings compatible with ColVision image embeddings
|
316
|
-
for late interaction scoring (MaxSim).
|
317
|
-
|
318
|
-
Args:
|
319
|
-
model: Any ColVision model name supported by colpali-engine
|
320
|
-
(e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
|
321
|
-
See https://github.com/illuin-tech/colpali for the complete list of supported models.
|
322
|
-
|
323
|
-
Note:
|
324
|
-
This function requires the optional colpali-engine dependency.
|
325
|
-
Install it with: pip install 'cocoindex[colpali]'
|
326
|
-
"""
|
327
|
-
|
328
|
-
model: str
|
329
|
-
|
330
|
-
|
331
|
-
@op.executor_class(
|
332
|
-
gpu=True,
|
333
|
-
cache=True,
|
334
|
-
behavior_version=1,
|
335
|
-
)
|
336
|
-
class ColPaliEmbedQueryExecutor:
|
337
|
-
"""Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
|
338
|
-
|
339
|
-
spec: ColPaliEmbedQuery
|
340
|
-
_model_info: ColPaliModelInfo
|
341
|
-
|
342
|
-
def analyze(self) -> type:
|
343
|
-
# Get shared model and dimension
|
344
|
-
self._model_info = _get_colpali_model_and_processor(self.spec.model)
|
345
|
-
|
346
|
-
# Return multi-vector type: Variable tokens x Fixed hidden dimension
|
347
|
-
dimension = self._model_info.dimension
|
348
|
-
return Vector[Vector[np.float32, Literal[dimension]]] # type: ignore
|
349
|
-
|
350
|
-
def __call__(self, query: str) -> Any:
|
351
|
-
try:
|
352
|
-
import torch
|
353
|
-
except ImportError as e:
|
354
|
-
raise ImportError(
|
355
|
-
"Required dependencies (torch) are missing for ColVision query embedding."
|
356
|
-
) from e
|
357
|
-
|
358
|
-
model = self._model_info.model
|
359
|
-
processor = self._model_info.processor
|
360
|
-
device = self._model_info.device
|
361
|
-
|
362
|
-
inputs = processor.process_queries([query]).to(device)
|
363
|
-
with torch.no_grad():
|
364
|
-
embeddings = model(**inputs)
|
365
|
-
|
366
|
-
# Return multi-vector format: [tokens, hidden_dim]
|
367
|
-
if len(embeddings.shape) != 3:
|
368
|
-
raise ValueError(
|
369
|
-
f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
|
370
|
-
)
|
371
|
-
|
372
|
-
# Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
|
373
|
-
token_embeddings = embeddings[0] # Remove batch dimension
|
374
|
-
|
375
|
-
return token_embeddings.cpu().to(torch.float32).numpy()
|
File without changes
|
File without changes
|