cocoindex 0.2.3__cp311-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cocoindex/functions.py ADDED
@@ -0,0 +1,357 @@
+ """All builtin functions."""
+
+ import dataclasses
+ import functools
+ from typing import Annotated, Any, Literal
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from . import llm, op
+ from .typing import TypeAttr, Vector
+
+
+ class ParseJson(op.FunctionSpec):
+     """Parse a text into a JSON object."""
+
+
+ @dataclasses.dataclass
+ class CustomLanguageSpec:
+     """Custom language specification."""
+
+     language_name: str
+     separators_regex: list[str]
+     aliases: list[str] = dataclasses.field(default_factory=list)
+
+
+ @dataclasses.dataclass
+ class ColPaliModelInfo:
+     """Data structure for ColPali model and processor."""
+
+     model: Any
+     processor: Any
+     dimension: int
+     device: Any
+
+
+ class SplitRecursively(op.FunctionSpec):
+     """Split a document (in string) recursively."""
+
+     custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
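
A quick sketch (not part of the package diff above) of how these two specs fit together: a CustomLanguageSpec declares the separators for a format SplitRecursively does not know natively, and the splitter is then configured with it. CustomLanguageSpec is a plain dataclass; the keyword construction of SplitRecursively assumes op.FunctionSpec accepts its declared fields as keyword arguments.

    from cocoindex.functions import CustomLanguageSpec, SplitRecursively

    # Coarse-to-fine separators: section headers first, then blank lines, then lines, then words.
    toml_like = CustomLanguageSpec(
        language_name="toml-like",   # hypothetical language name
        separators_regex=[r"\n\[[^\]]+\]\n", r"\n\n+", r"\n", r" "],
        aliases=["tml"],
    )
    splitter = SplitRecursively(custom_languages=[toml_like])
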
+
+
+ class EmbedText(op.FunctionSpec):
+     """Embed a text into a vector space."""
+
+     api_type: llm.LlmApiType
+     model: str
+     address: str | None = None
+     output_dimension: int | None = None
+     task_type: str | None = None
+     api_config: llm.VertexAiConfig | None = None
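
For illustration only (not from the package): a minimal EmbedText configuration. It assumes FunctionSpec subclasses accept their declared fields as keyword arguments; the model name is a placeholder.

    from cocoindex import llm
    from cocoindex.functions import EmbedText

    embed_spec = EmbedText(
        api_type=llm.LlmApiType.OPENAI,
        model="text-embedding-3-small",   # placeholder model name
        output_dimension=1536,            # optional; omit to use the provider's default
    )
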
+
+
+ class ExtractByLlm(op.FunctionSpec):
+     """Extract information from a text using an LLM."""
+
+     llm_spec: llm.LlmSpec
+     output_type: type
+     instruction: str | None = None
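
A hedged sketch (not from the package) of pairing ExtractByLlm with a user-defined output type; the dataclass, model name, and instruction are illustrative.

    import dataclasses

    from cocoindex import llm
    from cocoindex.functions import ExtractByLlm

    @dataclasses.dataclass
    class PaperMetadata:
        """Hypothetical structure the LLM should fill in."""
        title: str
        authors: list[str]

    extract_spec = ExtractByLlm(
        llm_spec=llm.LlmSpec(api_type=llm.LlmApiType.OLLAMA, model="llama3.2"),
        output_type=PaperMetadata,
        instruction="Extract the paper title and the list of authors.",
    )
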
+
+
+ class SentenceTransformerEmbed(op.FunctionSpec):
+     """
+     `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
+
+     Args:
+
+         model: The name of the SentenceTransformer model to use.
+         args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True}
+
+     Note:
+         This function requires the optional sentence-transformers dependency.
+         Install it with: pip install 'cocoindex[embeddings]'
+     """
+
+     model: str
+     args: dict[str, Any] | None = None
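
A minimal usage sketch (an assumption, not from the package): constructing the spec with extra constructor arguments forwarded to the underlying SentenceTransformer; requires the optional dependency noted in the docstring.

    from cocoindex.functions import SentenceTransformerEmbed

    embed = SentenceTransformerEmbed(
        model="sentence-transformers/all-MiniLM-L6-v2",
        args={"trust_remote_code": True},   # passed through to SentenceTransformer(...)
    )
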
+
+
+ @op.executor_class(
+     gpu=True,
+     cache=True,
+     behavior_version=1,
+     arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
+ )
+ class SentenceTransformerEmbedExecutor:
+     """Executor for SentenceTransformerEmbed."""
+
+     spec: SentenceTransformerEmbed
+     _model: Any | None = None
+
+     def analyze(self) -> type:
+         try:
+             # Only import sentence_transformers locally when it's needed, as its import is very slow.
+             import sentence_transformers  # pylint: disable=import-outside-toplevel
+         except ImportError as e:
+             raise ImportError(
+                 "sentence_transformers is required for SentenceTransformerEmbed function. "
+                 "Install it with one of these commands:\n"
+                 " pip install 'cocoindex[embeddings]'\n"
+                 " pip install sentence-transformers"
+             ) from e
+
+         args = self.spec.args or {}
+         self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
+         dim = self._model.get_sentence_embedding_dimension()
+         return Vector[np.float32, Literal[dim]]  # type: ignore
+
+     def __call__(self, text: str) -> NDArray[np.float32]:
+         assert self._model is not None
+         result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
+         return result
+
+
+ @functools.cache
+ def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
+     """Get or load ColPali model and processor, with caching."""
+     try:
+         from colpali_engine.models import (  # type: ignore[import-untyped]
+             ColPali,
+             ColPaliProcessor,
+             ColQwen2,
+             ColQwen2Processor,
+             ColQwen2_5,
+             ColQwen2_5_Processor,
+             ColIdefics3,
+             ColIdefics3Processor,
+         )
+         from colpali_engine.utils.torch_utils import get_torch_device  # type: ignore[import-untyped]
+         import torch
+     except ImportError as e:
+         raise ImportError(
+             "ColVision models are not available. Make sure cocoindex is installed with ColPali support."
+         ) from e
+
+     device = get_torch_device("auto")
+
+     # Manual model detection based on model name
+     model_name_lower = model_name.lower()
+
+     try:
+         if "qwen2.5" in model_name_lower:
+             model = ColQwen2_5.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.bfloat16,
+                 device_map=device,
+             ).eval()
+             processor = ColQwen2_5_Processor.from_pretrained(model_name)
+         elif "qwen2" in model_name_lower:
+             model = ColQwen2.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.bfloat16,
+                 device_map=device,
+             ).eval()
+             processor = ColQwen2Processor.from_pretrained(model_name)
+         elif "colsmol" in model_name_lower or "smol" in model_name_lower:
+             # ColSmol models use Idefics3 architecture
+             model = ColIdefics3.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.bfloat16,
+                 device_map=device,
+             ).eval()
+             processor = ColIdefics3Processor.from_pretrained(model_name)
+         else:
+             # Default to ColPali
+             model = ColPali.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.bfloat16,
+                 device_map=device,
+             ).eval()
+             processor = ColPaliProcessor.from_pretrained(model_name)
+
+     except Exception as e:
+         raise RuntimeError(f"Failed to load model {model_name}: {e}")
+
+     # Get dimension from the actual model
+     dimension = _detect_colpali_dimension(model, processor, device)
+
+     return ColPaliModelInfo(
+         model=model,
+         processor=processor,
+         dimension=dimension,
+         device=device,
+     )
+
+
+ def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
+     """Detect ColPali embedding dimension from the actual model config."""
+     # Try to access embedding dimension
+     if hasattr(model.config, "embedding_dim"):
+         dim = model.config.embedding_dim
+     else:
+         # Fallback: infer from output shape with dummy data
+         from PIL import Image
+         import numpy as np
+         import torch
+
+         dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
+         # Use the processor to process the dummy image
+         processed = processor.process_images([dummy_img]).to(device)
+         with torch.no_grad():
+             output = model(**processed)
+         dim = int(output.shape[-1])
+         if isinstance(dim, int):
+             return dim
+         else:
+             raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
+     return dim
+
+
+ class ColPaliEmbedImage(op.FunctionSpec):
+     """
+     `ColPaliEmbedImage` embeds images using ColVision multimodal models.
+
+     Supports ALL models available in the colpali-engine library, including:
+     - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
+     - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+     - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
+     - Any future ColVision models supported by colpali-engine
+
+     These models use late interaction between image patch embeddings and text token
+     embeddings for retrieval.
+
+     Args:
+         model: Any ColVision model name supported by colpali-engine
+             (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
+             See https://github.com/illuin-tech/colpali for the complete list of supported models.
+
+     Note:
+         This function requires the optional colpali-engine dependency.
+         Install it with: pip install 'cocoindex[colpali]'
+     """
+
+     model: str
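
As a rough illustration (not from the package): the spec itself only needs a model name; the executor below resolves the model family from that name and emits one [patches, hidden_dim] multi-vector per image.

    from cocoindex.functions import ColPaliEmbedImage

    image_embedder = ColPaliEmbedImage(model="vidore/colpali-v1.2")   # example model from the docstring
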
+
+
+ @op.executor_class(
+     gpu=True,
+     cache=True,
+     behavior_version=1,
+ )
+ class ColPaliEmbedImageExecutor:
+     """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
+
+     spec: ColPaliEmbedImage
+     _model_info: ColPaliModelInfo
+
+     def analyze(self) -> type:
+         # Get shared model and dimension
+         self._model_info = _get_colpali_model_and_processor(self.spec.model)
+
+         # Return multi-vector type: Variable patches x Fixed hidden dimension
+         dimension = self._model_info.dimension
+         return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
+
+     def __call__(self, img_bytes: bytes) -> Any:
+         try:
+             from PIL import Image
+             import torch
+             import io
+         except ImportError as e:
+             raise ImportError(
+                 "Required dependencies (PIL, torch) are missing for ColVision image embedding."
+             ) from e
+
+         model = self._model_info.model
+         processor = self._model_info.processor
+         device = self._model_info.device
+
+         pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+         inputs = processor.process_images([pil_image]).to(device)
+         with torch.no_grad():
+             embeddings = model(**inputs)
+
+         # Return multi-vector format: [patches, hidden_dim]
+         if len(embeddings.shape) != 3:
+             raise ValueError(
+                 f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
+             )
+
+         # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
+         patch_embeddings = embeddings[0]  # Remove batch dimension
+
+         return patch_embeddings.cpu().to(torch.float32).numpy()
+
+
+ class ColPaliEmbedQuery(op.FunctionSpec):
+     """
+     `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
+
+     Supports ALL models available in the colpali-engine library, including:
+     - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
+     - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+     - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
+     - Any future ColVision models supported by colpali-engine
+
+     This produces query embeddings compatible with ColVision image embeddings
+     for late interaction scoring (MaxSim).
+
+     Args:
+         model: Any ColVision model name supported by colpali-engine
+             (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
+             See https://github.com/illuin-tech/colpali for the complete list of supported models.
+
+     Note:
+         This function requires the optional colpali-engine dependency.
+         Install it with: pip install 'cocoindex[colpali]'
+     """
+
+     model: str
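
Illustrative sketch (not from the package): query and image embedders should share the same model so both sides land in the same ColVision embedding space.

    from cocoindex.functions import ColPaliEmbedImage, ColPaliEmbedQuery

    model_name = "vidore/colqwen2.5-v0.2"            # example model from the docstring
    image_embedder = ColPaliEmbedImage(model=model_name)
    query_embedder = ColPaliEmbedQuery(model=model_name)
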
+
+
+ @op.executor_class(
+     gpu=True,
+     cache=True,
+     behavior_version=1,
+ )
+ class ColPaliEmbedQueryExecutor:
+     """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
+
+     spec: ColPaliEmbedQuery
+     _model_info: ColPaliModelInfo
+
+     def analyze(self) -> type:
+         # Get shared model and dimension
+         self._model_info = _get_colpali_model_and_processor(self.spec.model)
+
+         # Return multi-vector type: Variable tokens x Fixed hidden dimension
+         dimension = self._model_info.dimension
+         return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
+
+     def __call__(self, query: str) -> Any:
+         try:
+             import torch
+         except ImportError as e:
+             raise ImportError(
+                 "Required dependencies (torch) are missing for ColVision query embedding."
+             ) from e
+
+         model = self._model_info.model
+         processor = self._model_info.processor
+         device = self._model_info.device
+
+         inputs = processor.process_queries([query]).to(device)
+         with torch.no_grad():
+             embeddings = model(**inputs)
+
+         # Return multi-vector format: [tokens, hidden_dim]
+         if len(embeddings.shape) != 3:
+             raise ValueError(
+                 f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
+             )
+
+         # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
+         token_embeddings = embeddings[0]  # Remove batch dimension
+
+         return token_embeddings.cpu().to(torch.float32).numpy()
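
The two executors above return [patches, hidden_dim] and [tokens, hidden_dim] float32 arrays. A minimal NumPy sketch of the MaxSim late-interaction score mentioned in the docstrings (an illustration, not a function shipped in this package): each query token is matched against its best image patch and the per-token maxima are summed.

    import numpy as np
    from numpy.typing import NDArray

    def maxsim_score(
        query_emb: NDArray[np.float32],   # [tokens, hidden_dim]
        image_emb: NDArray[np.float32],   # [patches, hidden_dim]
    ) -> float:
        # Token-vs-patch similarity matrix: [tokens, patches].
        sim = query_emb @ image_emb.T
        # Best patch per query token, summed over tokens (MaxSim).
        return float(sim.max(axis=1).sum())
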
cocoindex/index.py ADDED
@@ -0,0 +1,29 @@
+ from enum import Enum
+ from dataclasses import dataclass
+ from typing import Sequence
+
+
+ class VectorSimilarityMetric(Enum):
+     COSINE_SIMILARITY = "CosineSimilarity"
+     L2_DISTANCE = "L2Distance"
+     INNER_PRODUCT = "InnerProduct"
+
+
+ @dataclass
+ class VectorIndexDef:
+     """
+     Define a vector index on a field.
+     """
+
+     field_name: str
+     metric: VectorSimilarityMetric
+
+
+ @dataclass
+ class IndexOptions:
+     """
+     Options for an index.
+     """
+
+     primary_key_fields: Sequence[str]
+     vector_indexes: Sequence[VectorIndexDef] = ()
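
Illustrative usage (not part of the diff): both classes are plain dataclasses, so an export target's index configuration can be built directly; the field names are placeholders.

    from cocoindex.index import IndexOptions, VectorIndexDef, VectorSimilarityMetric

    index_options = IndexOptions(
        primary_key_fields=["filename", "chunk_id"],   # hypothetical key fields
        vector_indexes=[
            VectorIndexDef(
                field_name="embedding",                # hypothetical vector field
                metric=VectorSimilarityMetric.COSINE_SIMILARITY,
            ),
        ],
    )
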
cocoindex/lib.py ADDED
@@ -0,0 +1,32 @@
+ """
+ Library level functions and states.
+ """
+
+ import warnings
+ from typing import Callable, Any
+
+ from . import _engine  # type: ignore
+ from . import flow, setting
+ from .convert import dump_engine_object
+
+
+ def init(settings: setting.Settings | None = None) -> None:
+     """
+     Initialize the cocoindex library.
+
+     If the settings are not provided, they are loaded from the environment variables.
+     """
+     settings = settings or setting.Settings.from_env()
+     _engine.init(dump_engine_object(settings))
+     setting.set_app_namespace(settings.app_namespace)
+
+
+ def start_server(settings: setting.ServerSettings) -> None:
+     """Start the cocoindex server."""
+     flow.ensure_all_flows_built()
+     _engine.start_server(settings.__dict__)
+
+
+ def stop() -> None:
+     """Stop the cocoindex library."""
+     _engine.stop()
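
A minimal lifecycle sketch based only on the functions above (not from the package): init() falls back to Settings.from_env() when no settings are passed, and stop() shuts the engine down.

    from cocoindex import lib, setting

    lib.init(setting.Settings.from_env())   # equivalent to lib.init() with no arguments
    # ... define flows and run updates here ...
    lib.stop()
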
cocoindex/llm.py ADDED
@@ -0,0 +1,46 @@
+ from dataclasses import dataclass
+ from enum import Enum
+
+
+ class LlmApiType(Enum):
+     """The type of LLM API to use."""
+
+     OPENAI = "OpenAi"
+     OLLAMA = "Ollama"
+     GEMINI = "Gemini"
+     VERTEX_AI = "VertexAi"
+     ANTHROPIC = "Anthropic"
+     LITE_LLM = "LiteLlm"
+     OPEN_ROUTER = "OpenRouter"
+     VOYAGE = "Voyage"
+     VLLM = "Vllm"
+
+
+ @dataclass
+ class VertexAiConfig:
+     """A specification for a Vertex AI LLM."""
+
+     kind = "VertexAi"
+
+     project: str
+     region: str | None = None
+
+
+ @dataclass
+ class OpenAiConfig:
+     """A specification for an OpenAI LLM."""
+
+     kind = "OpenAi"
+
+     org_id: str | None = None
+     project_id: str | None = None
+
+
+ @dataclass
+ class LlmSpec:
+     """A specification for an LLM."""
+
+     api_type: LlmApiType
+     model: str
+     address: str | None = None
+     api_config: VertexAiConfig | OpenAiConfig | None = None
+ api_config: VertexAiConfig | OpenAiConfig | None = None