cocoindex 0.2.19__cp311-abi3-macosx_10_12_x86_64.whl → 0.2.21__cp311-abi3-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cocoindex might be problematic. Click here for more details.

cocoindex/_engine.abi3.so CHANGED
Binary file
cocoindex/cli.py CHANGED
@@ -84,9 +84,7 @@ def _load_user_app(app_target: str) -> None:
84
84
  try:
85
85
  load_user_app(app_target)
86
86
  except UserAppLoaderError as e:
87
- raise click.ClickException(
88
- f"Failed to load APP_TARGET '{app_target}': {e}"
89
- ) from e
87
+ raise ValueError(f"Failed to load APP_TARGET '{app_target}'") from e
90
88
 
91
89
  add_user_app(app_target)
92
90
 
cocoindex/engine_value.py CHANGED
@@ -70,6 +70,17 @@ def _is_type_kind_convertible_to(src_type_kind: str, dst_type_kind: str) -> bool
70
70
  ANY_TYPE_INFO = analyze_type_info(inspect.Parameter.empty)
71
71
 
72
72
 
73
+ def make_engine_key_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
74
+ """
75
+ Create an encoder closure for a key type.
76
+ """
77
+ value_encoder = make_engine_value_encoder(type_info)
78
+ if isinstance(type_info.variant, AnalyzedBasicType):
79
+ return lambda value: [value_encoder(value)]
80
+ else:
81
+ return value_encoder
82
+
83
+
73
84
  def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
74
85
  """
75
86
  Create an encoder closure for a specific type.
@@ -94,6 +105,9 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], An
94
105
  # Otherwise it's a vector, falling into basic type in the engine.
95
106
 
96
107
  if isinstance(variant, AnalyzedDictType):
108
+ key_type_info = analyze_type_info(variant.key_type)
109
+ key_encoder = make_engine_key_encoder(key_type_info)
110
+
97
111
  value_type_info = analyze_type_info(variant.value_type)
98
112
  if not isinstance(value_type_info.variant, AnalyzedStructType):
99
113
  raise ValueError(
@@ -102,22 +116,10 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], An
102
116
  )
103
117
  value_encoder = make_engine_value_encoder(value_type_info)
104
118
 
105
- key_type_info = analyze_type_info(variant.key_type)
106
- key_encoder = make_engine_value_encoder(key_type_info)
107
- if isinstance(key_type_info.variant, AnalyzedBasicType):
108
-
109
- def encode_row(k: Any, v: Any) -> Any:
110
- return [key_encoder(k)] + value_encoder(v)
111
-
112
- else:
113
-
114
- def encode_row(k: Any, v: Any) -> Any:
115
- return key_encoder(k) + value_encoder(v)
116
-
117
119
  def encode_struct_dict(value: Any) -> Any:
118
120
  if not value:
119
121
  return []
120
- return [encode_row(k, v) for k, v in value.items()]
122
+ return [key_encoder(k) + value_encoder(v) for k, v in value.items()]
121
123
 
122
124
  return encode_struct_dict
123
125
 
cocoindex/flow.py CHANGED
@@ -459,7 +459,9 @@ class _FlowBuilderState:
459
459
  field_name_builder: _NameBuilder
460
460
 
461
461
  def __init__(self, full_name: str):
462
- self.engine_flow_builder = _engine.FlowBuilder(full_name)
462
+ self.engine_flow_builder = _engine.FlowBuilder(
463
+ full_name, execution_context.event_loop
464
+ )
463
465
  self.field_name_builder = _NameBuilder()
464
466
 
465
467
  def get_data_slice(self, v: Any) -> _engine.DataSlice:
@@ -931,9 +933,7 @@ def _create_lazy_flow(
931
933
  flow_builder_state, flow_builder_state.engine_flow_builder.root_scope()
932
934
  )
933
935
  fl_def(FlowBuilder(flow_builder_state), root_scope)
934
- return flow_builder_state.engine_flow_builder.build_flow(
935
- execution_context.event_loop
936
- )
936
+ return flow_builder_state.engine_flow_builder.build_flow()
937
937
 
938
938
  return Flow(flow_name, _create_engine_flow)
939
939
 
cocoindex/llm.py CHANGED
@@ -14,6 +14,7 @@ class LlmApiType(Enum):
14
14
  OPEN_ROUTER = "OpenRouter"
15
15
  VOYAGE = "Voyage"
16
16
  VLLM = "Vllm"
17
+ BEDROCK = "Bedrock"
17
18
 
18
19
 
19
20
  @dataclass
cocoindex/op.py CHANGED
@@ -9,22 +9,33 @@ from typing import (
9
9
  Any,
10
10
  Awaitable,
11
11
  Callable,
12
+ Iterator,
12
13
  Protocol,
13
14
  dataclass_transform,
14
15
  Annotated,
16
+ TypeVar,
17
+ Generic,
18
+ Literal,
15
19
  get_args,
16
20
  )
21
+ from collections.abc import AsyncIterator
17
22
 
18
23
  from . import _engine # type: ignore
19
24
  from .subprocess_exec import executor_stub
20
25
  from .engine_object import dump_engine_object, load_engine_object
21
26
  from .engine_value import (
27
+ make_engine_key_encoder,
22
28
  make_engine_value_encoder,
23
29
  make_engine_value_decoder,
24
30
  make_engine_key_decoder,
25
31
  make_engine_struct_decoder,
26
32
  )
27
33
  from .typing import (
34
+ KEY_FIELD_NAME,
35
+ AnalyzedTypeInfo,
36
+ StructSchema,
37
+ StructType,
38
+ TableType,
28
39
  TypeAttr,
29
40
  encode_enriched_type_info,
30
41
  resolve_forward_ref,
@@ -96,12 +107,12 @@ class Executor(Protocol):
96
107
  op_category: OpCategory
97
108
 
98
109
 
99
- def _get_required_method(cls: type, name: str) -> Callable[..., Any]:
100
- method = getattr(cls, name, None)
110
+ def _get_required_method(obj: type, name: str) -> Callable[..., Any]:
111
+ method = getattr(obj, name, None)
101
112
  if method is None:
102
- raise ValueError(f"Method {name}() is required for {cls.__name__}")
103
- if not inspect.isfunction(method):
104
- raise ValueError(f"Method {cls.__name__}.{name}() is not a function")
113
+ raise ValueError(f"Method {name}() is required for {obj}")
114
+ if not inspect.isfunction(method) and not inspect.ismethod(method):
115
+ raise ValueError(f"{obj}.{name}() is not a function; {method}")
105
116
  return method
106
117
 
107
118
 
@@ -421,6 +432,252 @@ def function(**args: Any) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
421
432
  return _inner
422
433
 
423
434
 
435
+ ########################################################
436
+ # Custom source connector
437
+ ########################################################
438
+
439
+
440
+ @dataclasses.dataclass
441
+ class SourceReadOptions:
442
+ """
443
+ The options for reading a source row.
444
+ This is the argument for both `list()` and `get_value()` methods.
445
+ Note that in most cases (unless spelled out otherwise below) it's not a mandatory requirement, but more like a hint to say it's useful under the current context.
446
+
447
+ - include_ordinal: Whether to include the ordinal of the source row.
448
+ When provides_ordinal() returns True, you must provide `ordinal` in `list()` when `include_ordinal` is True.
449
+ It's optional for other cases. It's helpful to skip unnecessary reprocessing early, and to avoid output from an older version of the input overwriting the latest one when there's concurrency (especially multiple processes) and the source updates frequently.
450
+
451
+ - include_content_version_fp: Whether to include the content version fingerprint of the source row.
452
+ It's always optional even if this is True.
453
+ It's helpful to skip unnecessary reprocessing early.
454
+ You should only consider providing it if you can directly get it without computing the hash on the content.
455
+
456
+ - include_value: Whether to include the value of the source row.
457
+ You must provide it in `get_value()` when `include_value` is True.
458
+ It's optional for `list()`.
459
+ Consider providing it when it's significantly cheaper than calling another `get_value()` for each row.
460
+ It will save costs of individual `get_value()` calls.
461
+ """
462
+
463
+ include_ordinal: bool = False
464
+ include_content_version_fp: bool = False
465
+ include_value: bool = False
466
+
467
+
468
+ K = TypeVar("K")
469
+ V = TypeVar("V")
470
+
471
+ NON_EXISTENCE: Literal["NON_EXISTENCE"] = "NON_EXISTENCE"
472
+ NO_ORDINAL: Literal["NO_ORDINAL"] = "NO_ORDINAL"
473
+
474
+
475
+ @dataclasses.dataclass
476
+ class PartialSourceRowData(Generic[V]):
477
+ """
478
+ The data of a source row.
479
+
480
+ - value: The value of the source row. NON_EXISTENCE means the row does not exist.
481
+ - ordinal: The ordinal of the source row. NO_ORDINAL means ordinal is not available for the source.
482
+ - content_version_fp: The content version fingerprint of the source row.
483
+ """
484
+
485
+ value: V | Literal["NON_EXISTENCE"] | None = None
486
+ ordinal: int | Literal["NO_ORDINAL"] | None = None
487
+ content_version_fp: bytes | None = None
488
+
489
+
490
+ @dataclasses.dataclass
491
+ class PartialSourceRow(Generic[K, V]):
492
+ key: K
493
+ data: PartialSourceRowData[V]
494
+
495
+
496
+ class _SourceExecutorContext:
497
+ _executor: Any
498
+
499
+ _key_encoder: Callable[[Any], Any]
500
+ _key_decoder: Callable[[Any], Any]
501
+
502
+ _value_encoder: Callable[[Any], Any]
503
+
504
+ _list_fn: Callable[
505
+ [SourceReadOptions],
506
+ AsyncIterator[PartialSourceRow[Any, Any]]
507
+ | Iterator[PartialSourceRow[Any, Any]],
508
+ ]
509
+ _orig_get_value_fn: Callable[..., Any]
510
+ _get_value_fn: Callable[..., Awaitable[PartialSourceRowData[Any]]]
511
+ _provides_ordinal_fn: Callable[[], bool] | None
512
+
513
+ def __init__(
514
+ self,
515
+ executor: Any,
516
+ key_type_info: AnalyzedTypeInfo,
517
+ key_decoder: Callable[[Any], Any],
518
+ value_type_info: AnalyzedTypeInfo,
519
+ ):
520
+ self._executor = executor
521
+
522
+ self._key_encoder = make_engine_key_encoder(key_type_info)
523
+ self._key_decoder = key_decoder
524
+ self._value_encoder = make_engine_value_encoder(value_type_info)
525
+
526
+ self._list_fn = _get_required_method(executor, "list")
527
+ self._orig_get_value_fn = _get_required_method(executor, "get_value")
528
+ self._get_value_fn = to_async_call(self._orig_get_value_fn)
529
+ self._provides_ordinal_fn = getattr(executor, "provides_ordinal", None)
530
+
531
+ def provides_ordinal(self) -> bool:
532
+ if self._provides_ordinal_fn is not None:
533
+ result = self._provides_ordinal_fn()
534
+ return bool(result)
535
+ else:
536
+ return False
537
+
538
+ async def list_async(
539
+ self, options: dict[str, Any]
540
+ ) -> AsyncIterator[tuple[Any, dict[str, Any]]]:
541
+ """
542
+ Return an async iterator that yields individual rows one by one.
543
+ Each yielded item is a tuple of (key, data).
544
+ """
545
+ read_options = load_engine_object(SourceReadOptions, options)
546
+ args = _build_args(self._list_fn, 0, options=read_options)
547
+ list_result = self._list_fn(*args)
548
+
549
+ # Handle both sync and async iterators
550
+ if hasattr(list_result, "__aiter__"):
551
+ async for partial_row in list_result:
552
+ yield (
553
+ self._key_encoder(partial_row.key),
554
+ self._encode_source_row_data(partial_row.data),
555
+ )
556
+ else:
557
+ for partial_row in list_result:
558
+ yield (
559
+ self._key_encoder(partial_row.key),
560
+ self._encode_source_row_data(partial_row.data),
561
+ )
562
+
563
+ async def get_value_async(
564
+ self,
565
+ raw_key: Any,
566
+ options: dict[str, Any],
567
+ ) -> dict[str, Any]:
568
+ key = self._key_decoder(raw_key)
569
+ read_options = load_engine_object(SourceReadOptions, options)
570
+ args = _build_args(self._orig_get_value_fn, 1, key=key, options=read_options)
571
+ row_data = await self._get_value_fn(*args)
572
+ return self._encode_source_row_data(row_data)
573
+
574
+ def _encode_source_row_data(
575
+ self, row_data: PartialSourceRowData[Any]
576
+ ) -> dict[str, Any]:
577
+ """Convert Python PartialSourceRowData to the format expected by Rust."""
578
+ return {
579
+ "ordinal": row_data.ordinal,
580
+ "content_version_fp": row_data.content_version_fp,
581
+ "value": (
582
+ NON_EXISTENCE
583
+ if row_data.value == NON_EXISTENCE
584
+ else self._value_encoder(row_data.value)
585
+ ),
586
+ }
587
+
588
+
589
+ class _SourceConnector:
590
+ """
591
+ The connector class passed to the engine.
592
+ """
593
+
594
+ _spec_cls: type[Any]
595
+ _key_type_info: AnalyzedTypeInfo
596
+ _key_decoder: Callable[[Any], Any]
597
+ _value_type_info: AnalyzedTypeInfo
598
+ _table_type: EnrichedValueType
599
+ _connector_cls: type[Any]
600
+
601
+ _create_fn: Callable[[Any], Awaitable[Any]]
602
+
603
+ def __init__(
604
+ self,
605
+ spec_cls: type[Any],
606
+ key_type: Any,
607
+ value_type: Any,
608
+ connector_cls: type[Any],
609
+ ):
610
+ self._spec_cls = spec_cls
611
+ self._key_type_info = analyze_type_info(key_type)
612
+ self._value_type_info = analyze_type_info(value_type)
613
+ self._connector_cls = connector_cls
614
+
615
+ # TODO: We can save the intermediate step after #1083 is fixed.
616
+ encoded_engine_key_type = encode_enriched_type_info(self._key_type_info)
617
+ engine_key_type = EnrichedValueType.decode(encoded_engine_key_type)
618
+
619
+ # TODO: We can save the intermediate step after #1083 is fixed.
620
+ encoded_engine_value_type = encode_enriched_type_info(self._value_type_info)
621
+ engine_value_type = EnrichedValueType.decode(encoded_engine_value_type)
622
+
623
+ if not isinstance(engine_value_type.type, StructType):
624
+ raise ValueError(f"Expected a StructType, got {engine_value_type.type}")
625
+
626
+ if isinstance(engine_key_type.type, StructType):
627
+ key_fields_schema = engine_key_type.type.fields
628
+ else:
629
+ key_fields_schema = [
630
+ FieldSchema(name=KEY_FIELD_NAME, value_type=engine_key_type)
631
+ ]
632
+ self._key_decoder = make_engine_key_decoder(
633
+ [], key_fields_schema, self._key_type_info
634
+ )
635
+ self._table_type = EnrichedValueType(
636
+ type=TableType(
637
+ kind="KTable",
638
+ row=StructSchema(
639
+ fields=key_fields_schema + engine_value_type.type.fields
640
+ ),
641
+ num_key_parts=len(key_fields_schema),
642
+ ),
643
+ )
644
+
645
+ self._create_fn = to_async_call(_get_required_method(connector_cls, "create"))
646
+
647
+ async def create_executor(self, raw_spec: dict[str, Any]) -> _SourceExecutorContext:
648
+ spec = load_engine_object(self._spec_cls, raw_spec)
649
+ executor = await self._create_fn(spec)
650
+ return _SourceExecutorContext(
651
+ executor, self._key_type_info, self._key_decoder, self._value_type_info
652
+ )
653
+
654
+ def get_table_type(self) -> Any:
655
+ return dump_engine_object(self._table_type)
656
+
657
+
658
+ def source_connector(
659
+ *,
660
+ spec_cls: type[Any],
661
+ key_type: Any = Any,
662
+ value_type: Any = Any,
663
+ ) -> Callable[[type], type]:
664
+ """
665
+ Decorate a class to provide a source connector for an op.
666
+ """
667
+
668
+ # Validate the spec_cls is a SourceSpec.
669
+ if not issubclass(spec_cls, SourceSpec):
670
+ raise ValueError(f"Expect a SourceSpec, got {spec_cls}")
671
+
672
+ # Register the source connector.
673
+ def _inner(connector_cls: type) -> type:
674
+ connector = _SourceConnector(spec_cls, key_type, value_type, connector_cls)
675
+ _engine.register_source_connector(spec_cls.__name__, connector)
676
+ return connector_cls
677
+
678
+ return _inner
679
+
680
+
424
681
  ########################################################
425
682
  # Custom target connector
426
683
  ########################################################
cocoindex/typing.py CHANGED
@@ -475,16 +475,16 @@ def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
475
475
  }
476
476
 
477
477
 
478
- def encode_enriched_type_info(enriched_type_info: AnalyzedTypeInfo) -> dict[str, Any]:
478
+ def encode_enriched_type_info(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
479
479
  """
480
- Encode an enriched type info to a CocoIndex engine's type representation
480
+ Encode an `AnalyzedTypeInfo` to a CocoIndex engine's `EnrichedValueType` representation
481
481
  """
482
- encoded: dict[str, Any] = {"type": _encode_type(enriched_type_info)}
482
+ encoded: dict[str, Any] = {"type": _encode_type(type_info)}
483
483
 
484
- if enriched_type_info.attrs is not None:
485
- encoded["attrs"] = enriched_type_info.attrs
484
+ if type_info.attrs is not None:
485
+ encoded["attrs"] = type_info.attrs
486
486
 
487
- if enriched_type_info.nullable:
487
+ if type_info.nullable:
488
488
  encoded["nullable"] = True
489
489
 
490
490
  return encoded
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cocoindex
3
- Version: 0.2.19
3
+ Version: 0.2.21
4
4
  Classifier: Development Status :: 3 - Alpha
5
5
  Classifier: License :: OSI Approved :: Apache Software License
6
6
  Classifier: Operating System :: OS Independent
@@ -75,7 +75,6 @@ Project-URL: Homepage, https://cocoindex.io/
75
75
  <a href="https://trendshift.io/repositories/13939" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13939" alt="cocoindex-io%2Fcocoindex | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
76
76
  </div>
77
77
 
78
-
79
78
  Ultra performant data transformation framework for AI, with core engine written in Rust. Support incremental processing and data lineage out-of-box. Exceptional developer velocity. Production-ready at day 0.
80
79
 
81
80
  ⭐ Drop a star to help us grow!
@@ -113,9 +112,8 @@ CocoIndex makes it effortless to transform data with AI, and keep source data an
113
112
 
114
113
  </br>
115
114
 
116
-
117
-
118
115
  ## Exceptional velocity
116
+
119
117
  Just declare transformation in dataflow with ~100 lines of python
120
118
 
121
119
  ```python
@@ -139,6 +137,7 @@ CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_
139
137
  **Particularly**, developers don't explicitly mutate data by creating, updating and deleting. They just need to define transformation/formula for a set of source data.
140
138
 
141
139
  ## Plug-and-Play Building Blocks
140
+
142
141
  Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components - as easy as assembling building blocks.
143
142
 
144
143
  <p align="center">
@@ -146,6 +145,7 @@ Native builtins for different source, targets and transformations. Standardize i
146
145
  </p>
147
146
 
148
147
  ## Data Freshness
148
+
149
149
  CocoIndex keep source data and target in sync effortlessly.
150
150
 
151
151
  <p align="center">
@@ -153,11 +153,14 @@ CocoIndex keep source data and target in sync effortlessly.
153
153
  </p>
154
154
 
155
155
  It has out-of-box support for incremental indexing:
156
+
156
157
  - minimal recomputation on source or logic change.
157
158
  - (re-)processing necessary portions; reuse cache when possible
158
159
 
159
- ## Quick Start:
160
+ ## Quick Start
161
+
160
162
  If you're new to CocoIndex, we recommend checking out
163
+
161
164
  - 📖 [Documentation](https://cocoindex.io/docs)
162
165
  - ⚡ [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart)
163
166
  - 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
@@ -172,7 +175,6 @@ pip install -U cocoindex
172
175
 
173
176
  2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. CocoIndex uses it for incremental processing.
174
177
 
175
-
176
178
  ## Define data flow
177
179
 
178
180
  Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow. An example flow looks like:
@@ -228,6 +230,7 @@ It defines an index flow like this:
228
230
  | [Text Embedding](examples/text_embedding) | Index text documents with embeddings for semantic search |
229
231
  | [Code Embedding](examples/code_embedding) | Index code embeddings for semantic search |
230
232
  | [PDF Embedding](examples/pdf_embedding) | Parse PDF and index text embeddings for semantic search |
233
+ | [PDF Elements Embedding](examples/pdf_elements_embedding) | Extract text and images from PDFs; embed text with SentenceTransformers and images with CLIP; store in Qdrant for multimodal search |
231
234
  | [Manuals LLM Extraction](examples/manuals_llm_extraction) | Extract structured information from a manual using LLM |
232
235
  | [Amazon S3 Embedding](examples/amazon_s3_embedding) | Index text documents from Amazon S3 |
233
236
  | [Azure Blob Storage Embedding](examples/azure_blob_embedding) | Index text documents from Azure Blob Storage |
@@ -244,16 +247,18 @@ It defines an index flow like this:
244
247
  | [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
245
248
  | [Patient intake form extraction](examples/patient_intake_extraction) | Use LLM to extract structured data from patient intake forms with different formats |
246
249
 
247
-
248
250
  More coming and stay tuned 👀!
249
251
 
250
252
  ## 📖 Documentation
253
+
251
254
  For detailed documentation, visit [CocoIndex Documentation](https://cocoindex.io/docs), including a [Quickstart guide](https://cocoindex.io/docs/getting_started/quickstart).
252
255
 
253
256
  ## 🤝 Contributing
257
+
254
258
  We love contributions from our community ❤️. For details on contributing or running the project for development, check out our [contributing guide](https://cocoindex.io/docs/about/contributing).
255
259
 
256
260
  ## 👥 Community
261
+
257
262
  Welcome with a huge coconut hug 🥥⋆。˚🤗. We are super excited for community contributions of all kinds - whether it's code improvements, documentation updates, issue reports, feature requests, and discussions in our Discord.
258
263
 
259
264
  Join our community here:
@@ -263,9 +268,11 @@ Join our community here:
263
268
  - ▶️ [Subscribe to our YouTube channel](https://www.youtube.com/@cocoindex-io)
264
269
  - 📜 [Read our blog posts](https://cocoindex.io/blogs/)
265
270
 
266
- ## Support us:
271
+ ## Support us
272
+
267
273
  We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ at GitHub repo [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
268
274
 
269
275
  ## License
276
+
270
277
  CocoIndex is Apache 2.0 licensed.
271
278
 
@@ -1,23 +1,22 @@
1
- cocoindex-0.2.19.dist-info/METADATA,sha256=z-JkcBlnCXXhcn7yDzAi2OFeeyUKmozPm-p12261vU8,13444
2
- cocoindex-0.2.19.dist-info/WHEEL,sha256=eDlp9unULyyDxD2Zd14qZwSC_Y-kO5nuRBdlMsoCXEY,105
3
- cocoindex-0.2.19.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
4
- cocoindex-0.2.19.dist-info/licenses/THIRD_PARTY_NOTICES.html,sha256=FpJNEsnvjyAB1qtTee_XMRM_2Thi2mbI0mfm-AVM9Ag,719620
1
+ cocoindex-0.2.21.dist-info/METADATA,sha256=dmcpI0cptqzSRSMvU4SdifiOithkSaYcgIPZCEBp2Mk,13644
2
+ cocoindex-0.2.21.dist-info/WHEEL,sha256=eDlp9unULyyDxD2Zd14qZwSC_Y-kO5nuRBdlMsoCXEY,105
3
+ cocoindex-0.2.21.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
4
+ cocoindex-0.2.21.dist-info/licenses/THIRD_PARTY_NOTICES.html,sha256=_rafrUNfyvjjCzp4oqD9ti424cq-OkzpGbOW9RFmwls,719655
5
5
  cocoindex/__init__.py,sha256=6qZWVkK4WZ01BIAg3CPh_bRRdA6Clk4d4Q6OnZ2jFa4,2630
6
- cocoindex/_engine.abi3.so,sha256=g0xBUi6nEZZsrskMfyHPHfLPkEefxznFgBk94CWt6P8,73872996
6
+ cocoindex/_engine.abi3.so,sha256=QE0mlwKln9iI462G41R-VFfRHlyocabZipLD5ji6Mb8,69615752
7
7
  cocoindex/auth_registry.py,sha256=g-uLDWLYW5NMbYe7q4Y-sU5dSyrlJXBEciyWtAiP9KE,1340
8
- cocoindex/cli.py,sha256=19IszBXOzqGn0xOV1SaS-oR9NupTmIm18uzFNET7NTQ,23978
8
+ cocoindex/cli.py,sha256=vk_YtGMPXTuu1U4J_VxzjWfTFv8Fu3tdyaVocpoxb5g,23941
9
9
  cocoindex/engine_object.py,sha256=5YTuWoR3WILhyt3PW-d9es3MAas_xD6tZZqvipN-sjg,10050
10
- cocoindex/engine_value.py,sha256=8M7MbwVG2bfd3kFptGGbQHBAp9pD3TVjrBiBDOAhD5M,23211
11
- cocoindex/flow.py,sha256=JWPTR2G6TdPJkO5ZlrCcyDyQ8utUS4zZWNR8zsHTeW8,40074
12
- cocoindex/functions.py,sha256=V4ljBnCprvA25XlCVvNLwK5ergXiEcKU76jkOGC-X3A,12882
10
+ cocoindex/engine_value.py,sha256=WJw8ymYAqF2CCyg9SBiQzx8z9bl7XNVuD6ffgYvRRWQ,23277
11
+ cocoindex/flow.py,sha256=xDz3rOo4RhbboknvC-KnbWq8RBykEO0YsjGSBfXqIEg,40076
13
12
  cocoindex/functions/__init__.py,sha256=V2IF4h-Cqq4OD_GN3Oqdry-FArORyRCKmqJ7g5UlJr8,1021
14
13
  cocoindex/functions/_engine_builtin_specs.py,sha256=WpCGrjUfJBa8xZP5JiEmA8kLu7fp9Rcs7ynpuJmvSGg,1786
15
14
  cocoindex/functions/colpali.py,sha256=oACyG3qG2dquyCJ6bT7FkMkua5rXDLSxnOHcgoz9waU,8865
16
15
  cocoindex/functions/sbert.py,sha256=1z5OJT-blXT6tVN5vEvEzvYAzOnzs1RCnu1UbCUP6wM,2162
17
16
  cocoindex/index.py,sha256=tz5ilvmOp0BtroGehCQDqWK_pIX9m6ghkhcxsDVU8WE,982
18
17
  cocoindex/lib.py,sha256=spfdU4IbzdffHyGdrQPIw_qGo9aX0OAAboqsjj8bTiQ,2290
19
- cocoindex/llm.py,sha256=Pv_cdnRngTLtuLU9AUmS8izIHhcKVnuBNolC33f9BDI,851
20
- cocoindex/op.py,sha256=Ycvr6lJf7hcCCjYUqHtXZqzSeDD-FQdP3_jcmZUV_zI,26896
18
+ cocoindex/llm.py,sha256=8ZdJhOmhdb2xEcCxk6rDpnj6hlhCyFBmJdhCNMqAOP4,875
19
+ cocoindex/op.py,sha256=TO-ETk3qXgnNS51NlWuLrOw_TfQ2mw83-_iswqULcQI,36095
21
20
  cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
21
  cocoindex/query_handler.py,sha256=X-SQT71LHiOOXn6-TJlQcGodJk-iT8p_1TcIMvRLBRI,1344
23
22
  cocoindex/runtime.py,sha256=4NxcltaDZvA3RR3Pnt6gH_f99jcWSyMH_1Xi5BjbtwY,1342
@@ -36,8 +35,8 @@ cocoindex/tests/test_optional_database.py,sha256=snAmkNa6wtOSaxoZE1HgjvL5v_ylitt
36
35
  cocoindex/tests/test_transform_flow.py,sha256=G69w-n-vnCTo3r9hVIk2lJNAQEkGUA7PZfHsXna3oS0,6030
37
36
  cocoindex/tests/test_typing.py,sha256=JoR-oMK-ZWjOGQi0pH5Etg5jp4oL_JSIreGBH247GCg,16291
38
37
  cocoindex/tests/test_validation.py,sha256=X6AQzVs-hVKIXcrHMEMQnhfUE8at7iXQnPq8nHNhZ2Q,4543
39
- cocoindex/typing.py,sha256=so_RusbhBmg_uLoZTY7W_pqU0aIJwFarkTF5NQufl4o,23944
38
+ cocoindex/typing.py,sha256=qQj5uM6XAKHzRJ2BIEs7X-xeOXVcM9p_xz5SVqPVvS8,23914
40
39
  cocoindex/user_app_loader.py,sha256=bc3Af-gYRxJ9GpObtpjegZY855oQBCv5FGkrkWV2yGY,1873
41
40
  cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
42
41
  cocoindex/validation.py,sha256=PZnJoby4sLbsmPv9fOjOQXuefjfZ7gmtsiTGU8SH-tc,3090
43
- cocoindex-0.2.19.dist-info/RECORD,,
42
+ cocoindex-0.2.21.dist-info/RECORD,,
@@ -2428,7 +2428,7 @@ Software.
2428
2428
  <h3 id="Apache-2.0">Apache License 2.0</h3>
2429
2429
  <h4>Used by:</h4>
2430
2430
  <ul class="license-used-by">
2431
- <li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.19</a></li>
2431
+ <li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.21</a></li>
2432
2432
  <li><a href=" https://github.com/awesomized/crc-fast-rust ">crc-fast 1.3.0</a></li>
2433
2433
  <li><a href=" https://github.com/qdrant/rust-client ">qdrant-client 1.15.0</a></li>
2434
2434
  </ul>
@@ -10677,6 +10677,38 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
10677
10677
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
10678
10678
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
10679
10679
  THE SOFTWARE.</pre>
10680
+ </li>
10681
+ <li class="license">
10682
+ <h3 id="MIT">MIT License</h3>
10683
+ <h4>Used by:</h4>
10684
+ <ul class="license-used-by">
10685
+ <li><a href=" https://github.com/tree-sitter/tree-sitter-scala ">tree-sitter-scala 0.24.0</a></li>
10686
+ </ul>
10687
+ <pre class="license-text">(The MIT License)
10688
+
10689
+ Copyright (c) 2014 Nathan Rajlich &lt;nathan@tootallnate.net&gt;
10690
+
10691
+ Permission is hereby granted, free of charge, to any person
10692
+ obtaining a copy of this software and associated documentation
10693
+ files (the &quot;Software&quot;), to deal in the Software without
10694
+ restriction, including without limitation the rights to use,
10695
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
10696
+ copies of the Software, and to permit persons to whom the
10697
+ Software is furnished to do so, subject to the following
10698
+ conditions:
10699
+
10700
+ The above copyright notice and this permission notice shall be
10701
+ included in all copies or substantial portions of the Software.
10702
+
10703
+ THE SOFTWARE IS PROVIDED &quot;AS IS&quot;, WITHOUT WARRANTY OF ANY KIND,
10704
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
10705
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
10706
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
10707
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
10708
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
10709
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
10710
+ OTHER DEALINGS IN THE SOFTWARE.
10711
+ </pre>
10680
10712
  </li>
10681
10713
  <li class="license">
10682
10714
  <h3 id="MIT">MIT License</h3>
@@ -12300,32 +12332,6 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
12300
12332
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
12301
12333
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
12302
12334
  THE SOFTWARE.
12303
- </pre>
12304
- </li>
12305
- <li class="license">
12306
- <h3 id="MIT">MIT License</h3>
12307
- <h4>Used by:</h4>
12308
- <ul class="license-used-by">
12309
- <li><a href=" https://github.com/tree-sitter/tree-sitter-scala ">tree-sitter-scala 0.24.0</a></li>
12310
- </ul>
12311
- <pre class="license-text">This software is released under the MIT license:
12312
-
12313
- Permission is hereby granted, free of charge, to any person obtaining a copy of
12314
- this software and associated documentation files (the &quot;Software&quot;), to deal in
12315
- the Software without restriction, including without limitation the rights to
12316
- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
12317
- the Software, and to permit persons to whom the Software is furnished to do so,
12318
- subject to the following conditions:
12319
-
12320
- The above copyright notice and this permission notice shall be included in all
12321
- copies or substantial portions of the Software.
12322
-
12323
- THE SOFTWARE IS PROVIDED &quot;AS IS&quot;, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12324
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
12325
- FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12326
- COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
12327
- IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
12328
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12329
12335
  </pre>
12330
12336
  </li>
12331
12337
  <li class="license">
cocoindex/functions.py DELETED
@@ -1,375 +0,0 @@
1
- """All builtin functions."""
2
-
3
- import dataclasses
4
- import functools
5
- from typing import Any, Literal
6
-
7
- import numpy as np
8
- from numpy.typing import NDArray
9
-
10
- from . import llm, op
11
- from .typing import Vector
12
-
13
-
14
- class ParseJson(op.FunctionSpec):
15
- """Parse a text into a JSON object."""
16
-
17
-
18
- @dataclasses.dataclass
19
- class CustomLanguageSpec:
20
- """Custom language specification."""
21
-
22
- language_name: str
23
- separators_regex: list[str]
24
- aliases: list[str] = dataclasses.field(default_factory=list)
25
-
26
-
27
- @dataclasses.dataclass
28
- class ColPaliModelInfo:
29
- """Data structure for ColPali model and processor."""
30
-
31
- model: Any
32
- processor: Any
33
- dimension: int
34
- device: Any
35
-
36
-
37
- class SplitRecursively(op.FunctionSpec):
38
- """Split a document (in string) recursively."""
39
-
40
- custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
41
-
42
-
43
- class SplitBySeparators(op.FunctionSpec):
44
- """
45
- Split text by specified regex separators only.
46
- Output schema matches SplitRecursively for drop-in compatibility:
47
- KTable rows with fields: location (Range), text (Str), start, end.
48
- Args:
49
- separators_regex: list[str] # e.g., [r"\\n\\n+"]
50
- keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
51
- include_empty: bool = False
52
- trim: bool = True
53
- """
54
-
55
- separators_regex: list[str] = dataclasses.field(default_factory=list)
56
- keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
57
- include_empty: bool = False
58
- trim: bool = True
59
-
60
-
61
- class EmbedText(op.FunctionSpec):
62
- """Embed a text into a vector space."""
63
-
64
- api_type: llm.LlmApiType
65
- model: str
66
- address: str | None = None
67
- output_dimension: int | None = None
68
- task_type: str | None = None
69
- api_config: llm.VertexAiConfig | None = None
70
-
71
-
72
- class ExtractByLlm(op.FunctionSpec):
73
- """Extract information from a text using a LLM."""
74
-
75
- llm_spec: llm.LlmSpec
76
- output_type: type
77
- instruction: str | None = None
78
-
79
-
80
- class SentenceTransformerEmbed(op.FunctionSpec):
81
- """
82
- `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
83
-
84
- Args:
85
-
86
- model: The name of the SentenceTransformer model to use.
87
- args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True}
88
-
89
- Note:
90
- This function requires the optional sentence-transformers dependency.
91
- Install it with: pip install 'cocoindex[embeddings]'
92
- """
93
-
94
- model: str
95
- args: dict[str, Any] | None = None
96
-
97
-
98
- @op.executor_class(
99
- gpu=True,
100
- cache=True,
101
- behavior_version=1,
102
- arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
103
- )
104
- class SentenceTransformerEmbedExecutor:
105
- """Executor for SentenceTransformerEmbed."""
106
-
107
- spec: SentenceTransformerEmbed
108
- _model: Any | None = None
109
-
110
- def analyze(self) -> type:
111
- try:
112
- # Only import sentence_transformers locally when it's needed, as its import is very slow.
113
- import sentence_transformers # pylint: disable=import-outside-toplevel
114
- except ImportError as e:
115
- raise ImportError(
116
- "sentence_transformers is required for SentenceTransformerEmbed function. "
117
- "Install it with one of these commands:\n"
118
- " pip install 'cocoindex[embeddings]'\n"
119
- " pip install sentence-transformers"
120
- ) from e
121
-
122
- args = self.spec.args or {}
123
- self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
124
- dim = self._model.get_sentence_embedding_dimension()
125
- return Vector[np.float32, Literal[dim]] # type: ignore
126
-
127
- def __call__(self, text: str) -> NDArray[np.float32]:
128
- assert self._model is not None
129
- result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
130
- return result
131
-
132
-
133
- @functools.cache
134
- def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
135
- """Get or load ColPali model and processor, with caching."""
136
- try:
137
- from colpali_engine.models import ( # type: ignore[import-untyped]
138
- ColPali,
139
- ColPaliProcessor,
140
- ColQwen2,
141
- ColQwen2Processor,
142
- ColQwen2_5,
143
- ColQwen2_5_Processor,
144
- ColIdefics3,
145
- ColIdefics3Processor,
146
- )
147
- from colpali_engine.utils.torch_utils import get_torch_device # type: ignore[import-untyped]
148
- import torch
149
- except ImportError as e:
150
- raise ImportError(
151
- "ColVision models are not available. Make sure cocoindex is installed with ColPali support."
152
- ) from e
153
-
154
- device = get_torch_device("auto")
155
-
156
- # Manual model detection based on model name
157
- model_name_lower = model_name.lower()
158
-
159
- try:
160
- if "qwen2.5" in model_name_lower:
161
- model = ColQwen2_5.from_pretrained(
162
- model_name,
163
- torch_dtype=torch.bfloat16,
164
- device_map=device,
165
- ).eval()
166
- processor = ColQwen2_5_Processor.from_pretrained(model_name)
167
- elif "qwen2" in model_name_lower:
168
- model = ColQwen2.from_pretrained(
169
- model_name,
170
- torch_dtype=torch.bfloat16,
171
- device_map=device,
172
- ).eval()
173
- processor = ColQwen2Processor.from_pretrained(model_name)
174
- elif "colsmol" in model_name_lower or "smol" in model_name_lower:
175
- # ColSmol models use Idefics3 architecture
176
- model = ColIdefics3.from_pretrained(
177
- model_name,
178
- torch_dtype=torch.bfloat16,
179
- device_map=device,
180
- ).eval()
181
- processor = ColIdefics3Processor.from_pretrained(model_name)
182
- else:
183
- # Default to ColPali
184
- model = ColPali.from_pretrained(
185
- model_name,
186
- torch_dtype=torch.bfloat16,
187
- device_map=device,
188
- ).eval()
189
- processor = ColPaliProcessor.from_pretrained(model_name)
190
-
191
- except Exception as e:
192
- raise RuntimeError(f"Failed to load model {model_name}: {e}")
193
-
194
- # Get dimension from the actual model
195
- dimension = _detect_colpali_dimension(model, processor, device)
196
-
197
- return ColPaliModelInfo(
198
- model=model,
199
- processor=processor,
200
- dimension=dimension,
201
- device=device,
202
- )
203
-
204
-
205
- def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
206
- """Detect ColPali embedding dimension from the actual model config."""
207
- # Try to access embedding dimension
208
- if hasattr(model.config, "embedding_dim"):
209
- dim = model.config.embedding_dim
210
- else:
211
- # Fallback: infer from output shape with dummy data
212
- from PIL import Image
213
- import numpy as np
214
- import torch
215
-
216
- dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
217
- # Use the processor to process the dummy image
218
- processed = processor.process_images([dummy_img]).to(device)
219
- with torch.no_grad():
220
- output = model(**processed)
221
- dim = int(output.shape[-1])
222
- if isinstance(dim, int):
223
- return dim
224
- else:
225
- raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
226
- return dim
227
-
228
-
229
- class ColPaliEmbedImage(op.FunctionSpec):
230
- """
231
- `ColPaliEmbedImage` embeds images using ColVision multimodal models.
232
-
233
- Supports ALL models available in the colpali-engine library, including:
234
- - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
235
- - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
236
- - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
237
- - Any future ColVision models supported by colpali-engine
238
-
239
- These models use late interaction between image patch embeddings and text token
240
- embeddings for retrieval.
241
-
242
- Args:
243
- model: Any ColVision model name supported by colpali-engine
244
- (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
245
- See https://github.com/illuin-tech/colpali for the complete list of supported models.
246
-
247
- Note:
248
- This function requires the optional colpali-engine dependency.
249
- Install it with: pip install 'cocoindex[colpali]'
250
- """
251
-
252
- model: str
253
-
254
-
255
- @op.executor_class(
256
- gpu=True,
257
- cache=True,
258
- behavior_version=1,
259
- )
260
- class ColPaliEmbedImageExecutor:
261
- """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
262
-
263
- spec: ColPaliEmbedImage
264
- _model_info: ColPaliModelInfo
265
-
266
- def analyze(self) -> type:
267
- # Get shared model and dimension
268
- self._model_info = _get_colpali_model_and_processor(self.spec.model)
269
-
270
- # Return multi-vector type: Variable patches x Fixed hidden dimension
271
- dimension = self._model_info.dimension
272
- return Vector[Vector[np.float32, Literal[dimension]]] # type: ignore
273
-
274
- def __call__(self, img_bytes: bytes) -> Any:
275
- try:
276
- from PIL import Image
277
- import torch
278
- import io
279
- except ImportError as e:
280
- raise ImportError(
281
- "Required dependencies (PIL, torch) are missing for ColVision image embedding."
282
- ) from e
283
-
284
- model = self._model_info.model
285
- processor = self._model_info.processor
286
- device = self._model_info.device
287
-
288
- pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
289
- inputs = processor.process_images([pil_image]).to(device)
290
- with torch.no_grad():
291
- embeddings = model(**inputs)
292
-
293
- # Return multi-vector format: [patches, hidden_dim]
294
- if len(embeddings.shape) != 3:
295
- raise ValueError(
296
- f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
297
- )
298
-
299
- # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
300
- patch_embeddings = embeddings[0] # Remove batch dimension
301
-
302
- return patch_embeddings.cpu().to(torch.float32).numpy()
303
-
304
-
305
- class ColPaliEmbedQuery(op.FunctionSpec):
306
- """
307
- `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
308
-
309
- Supports ALL models available in the colpali-engine library, including:
310
- - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
311
- - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
312
- - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
313
- - Any future ColVision models supported by colpali-engine
314
-
315
- This produces query embeddings compatible with ColVision image embeddings
316
- for late interaction scoring (MaxSim).
317
-
318
- Args:
319
- model: Any ColVision model name supported by colpali-engine
320
- (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
321
- See https://github.com/illuin-tech/colpali for the complete list of supported models.
322
-
323
- Note:
324
- This function requires the optional colpali-engine dependency.
325
- Install it with: pip install 'cocoindex[colpali]'
326
- """
327
-
328
- model: str
329
-
330
-
331
- @op.executor_class(
332
- gpu=True,
333
- cache=True,
334
- behavior_version=1,
335
- )
336
- class ColPaliEmbedQueryExecutor:
337
- """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
338
-
339
- spec: ColPaliEmbedQuery
340
- _model_info: ColPaliModelInfo
341
-
342
- def analyze(self) -> type:
343
- # Get shared model and dimension
344
- self._model_info = _get_colpali_model_and_processor(self.spec.model)
345
-
346
- # Return multi-vector type: Variable tokens x Fixed hidden dimension
347
- dimension = self._model_info.dimension
348
- return Vector[Vector[np.float32, Literal[dimension]]] # type: ignore
349
-
350
- def __call__(self, query: str) -> Any:
351
- try:
352
- import torch
353
- except ImportError as e:
354
- raise ImportError(
355
- "Required dependencies (torch) are missing for ColVision query embedding."
356
- ) from e
357
-
358
- model = self._model_info.model
359
- processor = self._model_info.processor
360
- device = self._model_info.device
361
-
362
- inputs = processor.process_queries([query]).to(device)
363
- with torch.no_grad():
364
- embeddings = model(**inputs)
365
-
366
- # Return multi-vector format: [tokens, hidden_dim]
367
- if len(embeddings.shape) != 3:
368
- raise ValueError(
369
- f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
370
- )
371
-
372
- # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
373
- token_embeddings = embeddings[0] # Remove batch dimension
374
-
375
- return token_embeddings.cpu().to(torch.float32).numpy()