cocoindex-0.2.3-cp311-abi3-macosx_10_12_x86_64.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +92 -0
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +51 -0
- cocoindex/cli.py +697 -0
- cocoindex/convert.py +621 -0
- cocoindex/flow.py +1205 -0
- cocoindex/functions.py +357 -0
- cocoindex/index.py +29 -0
- cocoindex/lib.py +32 -0
- cocoindex/llm.py +46 -0
- cocoindex/op.py +628 -0
- cocoindex/py.typed +0 -0
- cocoindex/runtime.py +37 -0
- cocoindex/setting.py +181 -0
- cocoindex/setup.py +92 -0
- cocoindex/sources.py +102 -0
- cocoindex/subprocess_exec.py +279 -0
- cocoindex/targets.py +135 -0
- cocoindex/tests/__init__.py +0 -0
- cocoindex/tests/conftest.py +38 -0
- cocoindex/tests/test_convert.py +1543 -0
- cocoindex/tests/test_optional_database.py +249 -0
- cocoindex/tests/test_transform_flow.py +207 -0
- cocoindex/tests/test_typing.py +429 -0
- cocoindex/tests/test_validation.py +134 -0
- cocoindex/typing.py +473 -0
- cocoindex/user_app_loader.py +51 -0
- cocoindex/utils.py +20 -0
- cocoindex/validation.py +104 -0
- cocoindex-0.2.3.dist-info/METADATA +262 -0
- cocoindex-0.2.3.dist-info/RECORD +34 -0
- cocoindex-0.2.3.dist-info/WHEEL +4 -0
- cocoindex-0.2.3.dist-info/entry_points.txt +2 -0
- cocoindex-0.2.3.dist-info/licenses/LICENSE +201 -0
cocoindex/functions.py
ADDED
@@ -0,0 +1,357 @@
"""All builtin functions."""

import dataclasses
import functools
from typing import Annotated, Any, Literal

import numpy as np
from numpy.typing import NDArray

from . import llm, op
from .typing import TypeAttr, Vector


class ParseJson(op.FunctionSpec):
    """Parse a text into a JSON object."""


@dataclasses.dataclass
class CustomLanguageSpec:
    """Custom language specification."""

    language_name: str
    separators_regex: list[str]
    aliases: list[str] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class ColPaliModelInfo:
    """Data structure for ColPali model and processor."""

    model: Any
    processor: Any
    dimension: int
    device: Any


class SplitRecursively(op.FunctionSpec):
    """Split a document (in string) recursively."""

    custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)


class EmbedText(op.FunctionSpec):
    """Embed a text into a vector space."""

    api_type: llm.LlmApiType
    model: str
    address: str | None = None
    output_dimension: int | None = None
    task_type: str | None = None
    api_config: llm.VertexAiConfig | None = None


class ExtractByLlm(op.FunctionSpec):
    """Extract information from a text using an LLM."""

    llm_spec: llm.LlmSpec
    output_type: type
    instruction: str | None = None


class SentenceTransformerEmbed(op.FunctionSpec):
    """
    `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.

    Args:
        model: The name of the SentenceTransformer model to use.
        args: Additional arguments to pass to the SentenceTransformer constructor, e.g. {"trust_remote_code": True}

    Note:
        This function requires the optional sentence-transformers dependency.
        Install it with: pip install 'cocoindex[embeddings]'
    """

    model: str
    args: dict[str, Any] | None = None


@op.executor_class(
    gpu=True,
    cache=True,
    behavior_version=1,
    arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
)
class SentenceTransformerEmbedExecutor:
    """Executor for SentenceTransformerEmbed."""

    spec: SentenceTransformerEmbed
    _model: Any | None = None

    def analyze(self) -> type:
        try:
            # Only import sentence_transformers locally when it's needed, as its import is very slow.
            import sentence_transformers  # pylint: disable=import-outside-toplevel
        except ImportError as e:
            raise ImportError(
                "sentence_transformers is required for SentenceTransformerEmbed function. "
                "Install it with one of these commands:\n"
                "  pip install 'cocoindex[embeddings]'\n"
                "  pip install sentence-transformers"
            ) from e

        args = self.spec.args or {}
        self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
        dim = self._model.get_sentence_embedding_dimension()
        return Vector[np.float32, Literal[dim]]  # type: ignore

    def __call__(self, text: str) -> NDArray[np.float32]:
        assert self._model is not None
        result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
        return result


@functools.cache
def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
    """Get or load ColPali model and processor, with caching."""
    try:
        from colpali_engine.models import (  # type: ignore[import-untyped]
            ColPali,
            ColPaliProcessor,
            ColQwen2,
            ColQwen2Processor,
            ColQwen2_5,
            ColQwen2_5_Processor,
            ColIdefics3,
            ColIdefics3Processor,
        )
        from colpali_engine.utils.torch_utils import get_torch_device  # type: ignore[import-untyped]
        import torch
    except ImportError as e:
        raise ImportError(
            "ColVision models are not available. Make sure cocoindex is installed with ColPali support."
        ) from e

    device = get_torch_device("auto")

    # Manual model detection based on model name
    model_name_lower = model_name.lower()

    try:
        if "qwen2.5" in model_name_lower:
            model = ColQwen2_5.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map=device,
            ).eval()
            processor = ColQwen2_5_Processor.from_pretrained(model_name)
        elif "qwen2" in model_name_lower:
            model = ColQwen2.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map=device,
            ).eval()
            processor = ColQwen2Processor.from_pretrained(model_name)
        elif "colsmol" in model_name_lower or "smol" in model_name_lower:
            # ColSmol models use Idefics3 architecture
            model = ColIdefics3.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map=device,
            ).eval()
            processor = ColIdefics3Processor.from_pretrained(model_name)
        else:
            # Default to ColPali
            model = ColPali.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map=device,
            ).eval()
            processor = ColPaliProcessor.from_pretrained(model_name)

    except Exception as e:
        raise RuntimeError(f"Failed to load model {model_name}: {e}") from e

    # Get dimension from the actual model
    dimension = _detect_colpali_dimension(model, processor, device)

    return ColPaliModelInfo(
        model=model,
        processor=processor,
        dimension=dimension,
        device=device,
    )


def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
    """Detect ColPali embedding dimension from the actual model config."""
    # Try to access embedding dimension
    if hasattr(model.config, "embedding_dim"):
        dim = model.config.embedding_dim
    else:
        # Fallback: infer from output shape with dummy data
        from PIL import Image
        import numpy as np
        import torch

        dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
        # Use the processor to process the dummy image
        processed = processor.process_images([dummy_img]).to(device)
        with torch.no_grad():
            output = model(**processed)
        dim = int(output.shape[-1])
    if isinstance(dim, int):
        return dim
    raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")


class ColPaliEmbedImage(op.FunctionSpec):
    """
    `ColPaliEmbedImage` embeds images using ColVision multimodal models.

    Supports ALL models available in the colpali-engine library, including:
    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
    - Any future ColVision models supported by colpali-engine

    These models use late interaction between image patch embeddings and text token
    embeddings for retrieval.

    Args:
        model: Any ColVision model name supported by colpali-engine
            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0").
            See https://github.com/illuin-tech/colpali for the complete list of supported models.

    Note:
        This function requires the optional colpali-engine dependency.
        Install it with: pip install 'cocoindex[colpali]'
    """

    model: str


@op.executor_class(
    gpu=True,
    cache=True,
    behavior_version=1,
)
class ColPaliEmbedImageExecutor:
    """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""

    spec: ColPaliEmbedImage
    _model_info: ColPaliModelInfo

    def analyze(self) -> type:
        # Get shared model and dimension
        self._model_info = _get_colpali_model_and_processor(self.spec.model)

        # Return multi-vector type: variable patches x fixed hidden dimension
        dimension = self._model_info.dimension
        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore

    def __call__(self, img_bytes: bytes) -> Any:
        try:
            from PIL import Image
            import torch
            import io
        except ImportError as e:
            raise ImportError(
                "Required dependencies (PIL, torch) are missing for ColVision image embedding."
            ) from e

        model = self._model_info.model
        processor = self._model_info.processor
        device = self._model_info.device

        pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        inputs = processor.process_images([pil_image]).to(device)
        with torch.no_grad():
            embeddings = model(**inputs)

        # Expect multi-vector output: [batch, patches, hidden_dim]
        if len(embeddings.shape) != 3:
            raise ValueError(
                f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
            )

        # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
        patch_embeddings = embeddings[0]  # Remove batch dimension

        return patch_embeddings.cpu().to(torch.float32).numpy()


class ColPaliEmbedQuery(op.FunctionSpec):
    """
    `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.

    Supports ALL models available in the colpali-engine library, including:
    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
    - Any future ColVision models supported by colpali-engine

    This produces query embeddings compatible with ColVision image embeddings
    for late interaction scoring (MaxSim).

    Args:
        model: Any ColVision model name supported by colpali-engine
            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0").
            See https://github.com/illuin-tech/colpali for the complete list of supported models.

    Note:
        This function requires the optional colpali-engine dependency.
        Install it with: pip install 'cocoindex[colpali]'
    """

    model: str


@op.executor_class(
    gpu=True,
    cache=True,
    behavior_version=1,
)
class ColPaliEmbedQueryExecutor:
    """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""

    spec: ColPaliEmbedQuery
    _model_info: ColPaliModelInfo

    def analyze(self) -> type:
        # Get shared model and dimension
        self._model_info = _get_colpali_model_and_processor(self.spec.model)

        # Return multi-vector type: variable tokens x fixed hidden dimension
        dimension = self._model_info.dimension
        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore

    def __call__(self, query: str) -> Any:
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Required dependencies (torch) are missing for ColVision query embedding."
            ) from e

        model = self._model_info.model
        processor = self._model_info.processor
        device = self._model_info.device

        inputs = processor.process_queries([query]).to(device)
        with torch.no_grad():
            embeddings = model(**inputs)

        # Expect multi-vector output: [batch, tokens, hidden_dim]
        if len(embeddings.shape) != 3:
            raise ValueError(
                f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
            )

        # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
        token_embeddings = embeddings[0]  # Remove batch dimension

        return token_embeddings.cpu().to(torch.float32).numpy()
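The ColVision executors above return multi-vector embeddings, [tokens, hidden_dim] for queries and [patches, hidden_dim] for images, precisely so they can be compared via late interaction (MaxSim). A minimal sketch of that scoring in plain numpy; the helper name is illustrative and the scoring itself is not part of this module:

import numpy as np
from numpy.typing import NDArray


def maxsim_score(
    query_emb: NDArray[np.float32],  # [tokens, hidden_dim], e.g. ColPaliEmbedQuery output
    image_emb: NDArray[np.float32],  # [patches, hidden_dim], e.g. ColPaliEmbedImage output
) -> float:
    """For each query token, take its best-matching image patch, then sum over tokens."""
    sim = query_emb @ image_emb.T  # pairwise dot products: [tokens, patches]
    return float(sim.max(axis=1).sum())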
cocoindex/index.py
ADDED
@@ -0,0 +1,29 @@
from enum import Enum
from dataclasses import dataclass
from typing import Sequence


class VectorSimilarityMetric(Enum):
    COSINE_SIMILARITY = "CosineSimilarity"
    L2_DISTANCE = "L2Distance"
    INNER_PRODUCT = "InnerProduct"


@dataclass
class VectorIndexDef:
    """
    Define a vector index on a field.
    """

    field_name: str
    metric: VectorSimilarityMetric


@dataclass
class IndexOptions:
    """
    Options for an index.
    """

    primary_key_fields: Sequence[str]
    vector_indexes: Sequence[VectorIndexDef] = ()
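A sketch of how these definitions compose into an index configuration; the field names are illustrative, not from this package:

from cocoindex.index import IndexOptions, VectorIndexDef, VectorSimilarityMetric

index_options = IndexOptions(
    primary_key_fields=["filename", "chunk_no"],
    vector_indexes=[
        VectorIndexDef(field_name="embedding", metric=VectorSimilarityMetric.COSINE_SIMILARITY),
    ],
)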
cocoindex/lib.py
ADDED
@@ -0,0 +1,32 @@
"""
Library level functions and states.
"""

import warnings
from typing import Callable, Any

from . import _engine  # type: ignore
from . import flow, setting
from .convert import dump_engine_object


def init(settings: setting.Settings | None = None) -> None:
    """
    Initialize the cocoindex library.

    If the settings are not provided, they are loaded from the environment variables.
    """
    settings = settings or setting.Settings.from_env()
    _engine.init(dump_engine_object(settings))
    setting.set_app_namespace(settings.app_namespace)


def start_server(settings: setting.ServerSettings) -> None:
    """Start the cocoindex server."""
    flow.ensure_all_flows_built()
    _engine.start_server(settings.__dict__)


def stop() -> None:
    """Stop the cocoindex library."""
    _engine.stop()
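These three functions define the library lifecycle. A hedged usage sketch; ServerSettings fields live in cocoindex/setting.py (not shown in this diff), so the start_server call is left commented out:

from cocoindex import lib

# With no argument, Settings are loaded from environment variables.
lib.init()

# Flows would be defined here; serving them requires a ServerSettings instance:
# lib.start_server(setting.ServerSettings(...))

lib.stop()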
cocoindex/llm.py
ADDED
@@ -0,0 +1,46 @@
from dataclasses import dataclass
from enum import Enum


class LlmApiType(Enum):
    """The type of LLM API to use."""

    OPENAI = "OpenAi"
    OLLAMA = "Ollama"
    GEMINI = "Gemini"
    VERTEX_AI = "VertexAi"
    ANTHROPIC = "Anthropic"
    LITE_LLM = "LiteLlm"
    OPEN_ROUTER = "OpenRouter"
    VOYAGE = "Voyage"
    VLLM = "Vllm"


@dataclass
class VertexAiConfig:
    """A specification for a Vertex AI LLM."""

    kind = "VertexAi"

    project: str
    region: str | None = None


@dataclass
class OpenAiConfig:
    """A specification for an OpenAI LLM."""

    kind = "OpenAi"

    org_id: str | None = None
    project_id: str | None = None


@dataclass
class LlmSpec:
    """A specification for an LLM."""

    api_type: LlmApiType
    model: str
    address: str | None = None
    api_config: VertexAiConfig | OpenAiConfig | None = None
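These specs are consumed by functions such as ExtractByLlm in cocoindex/functions.py above. A minimal sketch, with an illustrative model name and output dataclass:

import dataclasses

from cocoindex.functions import ExtractByLlm
from cocoindex.llm import LlmApiType, LlmSpec


@dataclasses.dataclass
class PaperMetadata:
    title: str
    authors: list[str]


extract_fn = ExtractByLlm(
    llm_spec=LlmSpec(api_type=LlmApiType.OLLAMA, model="llama3.2"),
    output_type=PaperMetadata,
    instruction="Extract the paper title and author list.",
)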