cocoindex 0.2.19__cp311-abi3-win_amd64.whl → 0.2.21__cp311-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cocoindex might be problematic.
- cocoindex/_engine.pyd +0 -0
- cocoindex/cli.py +1 -3
- cocoindex/engine_value.py +15 -13
- cocoindex/flow.py +4 -4
- cocoindex/llm.py +1 -0
- cocoindex/op.py +262 -5
- cocoindex/typing.py +6 -6
- {cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/METADATA +15 -8
- {cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/RECORD +12 -13
- {cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/licenses/THIRD_PARTY_NOTICES.html +33 -27
- cocoindex/functions.py +0 -375
- {cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/WHEEL +0 -0
- {cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/entry_points.txt +0 -0
cocoindex/_engine.pyd
CHANGED
Binary file
cocoindex/cli.py
CHANGED
@@ -84,9 +84,7 @@ def _load_user_app(app_target: str) -> None:
     try:
         load_user_app(app_target)
     except UserAppLoaderError as e:
-        raise
-            f"Failed to load APP_TARGET '{app_target}': {e}"
-        ) from e
+        raise ValueError(f"Failed to load APP_TARGET '{app_target}'") from e
 
     add_user_app(app_target)
 
cocoindex/engine_value.py
CHANGED
@@ -70,6 +70,17 @@ def _is_type_kind_convertible_to(src_type_kind: str, dst_type_kind: str) -> bool
 ANY_TYPE_INFO = analyze_type_info(inspect.Parameter.empty)
 
 
+def make_engine_key_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
+    """
+    Create an encoder closure for a key type.
+    """
+    value_encoder = make_engine_value_encoder(type_info)
+    if isinstance(type_info.variant, AnalyzedBasicType):
+        return lambda value: [value_encoder(value)]
+    else:
+        return value_encoder
+
+
 def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
     """
     Create an encoder closure for a specific type.
@@ -94,6 +105,9 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], An
     # Otherwise it's a vector, falling into basic type in the engine.
 
     if isinstance(variant, AnalyzedDictType):
+        key_type_info = analyze_type_info(variant.key_type)
+        key_encoder = make_engine_key_encoder(key_type_info)
+
         value_type_info = analyze_type_info(variant.value_type)
         if not isinstance(value_type_info.variant, AnalyzedStructType):
             raise ValueError(
@@ -102,22 +116,10 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], An
             )
         value_encoder = make_engine_value_encoder(value_type_info)
 
-        key_type_info = analyze_type_info(variant.key_type)
-        key_encoder = make_engine_value_encoder(key_type_info)
-        if isinstance(key_type_info.variant, AnalyzedBasicType):
-
-            def encode_row(k: Any, v: Any) -> Any:
-                return [key_encoder(k)] + value_encoder(v)
-
-        else:
-
-            def encode_row(k: Any, v: Any) -> Any:
-                return key_encoder(k) + value_encoder(v)
-
         def encode_struct_dict(value: Any) -> Any:
             if not value:
                 return []
-            return [
+            return [key_encoder(k) + value_encoder(v) for k, v in value.items()]
 
         return encode_struct_dict
 
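The new `make_engine_key_encoder` centralizes the key-handling rule that the removed `encode_row` helpers expressed inline: a basic key is wrapped into a one-element list of key parts, while a struct key already encodes to a list of field values. A minimal self-contained sketch of that dispatch (illustrative stand-ins only, not the package's code):

from typing import Any, Callable

# Sketch of the rule make_engine_key_encoder adds: a basic key becomes a one-element
# list of key parts; a struct key already encodes to a list of field values, so its
# value encoder is reused as-is.
def make_key_encoder_sketch(
    key_is_basic: bool, value_encoder: Callable[[Any], Any]
) -> Callable[[Any], Any]:
    if key_is_basic:
        return lambda value: [value_encoder(value)]
    return value_encoder

encode_str_key = make_key_encoder_sketch(True, lambda v: v)
encode_struct_key = make_key_encoder_sketch(False, lambda v: list(v.values()))

print(encode_str_key("doc-1"))                      # ['doc-1']
print(encode_struct_key({"tenant": "a", "id": 7}))  # ['a', 7]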
cocoindex/flow.py
CHANGED
@@ -459,7 +459,9 @@ class _FlowBuilderState:
     field_name_builder: _NameBuilder
 
     def __init__(self, full_name: str):
-        self.engine_flow_builder = _engine.FlowBuilder(
+        self.engine_flow_builder = _engine.FlowBuilder(
+            full_name, execution_context.event_loop
+        )
         self.field_name_builder = _NameBuilder()
 
     def get_data_slice(self, v: Any) -> _engine.DataSlice:
@@ -931,9 +933,7 @@ def _create_lazy_flow(
             flow_builder_state, flow_builder_state.engine_flow_builder.root_scope()
         )
        fl_def(FlowBuilder(flow_builder_state), root_scope)
-        return flow_builder_state.engine_flow_builder.build_flow(
-            execution_context.event_loop
-        )
+        return flow_builder_state.engine_flow_builder.build_flow()
 
     return Flow(flow_name, _create_engine_flow)
 
cocoindex/llm.py
CHANGED
cocoindex/op.py
CHANGED
@@ -9,22 +9,33 @@ from typing import (
     Any,
     Awaitable,
     Callable,
+    Iterator,
     Protocol,
     dataclass_transform,
     Annotated,
+    TypeVar,
+    Generic,
+    Literal,
     get_args,
 )
+from collections.abc import AsyncIterator
 
 from . import _engine  # type: ignore
 from .subprocess_exec import executor_stub
 from .engine_object import dump_engine_object, load_engine_object
 from .engine_value import (
+    make_engine_key_encoder,
     make_engine_value_encoder,
     make_engine_value_decoder,
     make_engine_key_decoder,
     make_engine_struct_decoder,
 )
 from .typing import (
+    KEY_FIELD_NAME,
+    AnalyzedTypeInfo,
+    StructSchema,
+    StructType,
+    TableType,
     TypeAttr,
     encode_enriched_type_info,
     resolve_forward_ref,
@@ -96,12 +107,12 @@ class Executor(Protocol):
     op_category: OpCategory
 
 
-def _get_required_method(
-    method = getattr(
+def _get_required_method(obj: type, name: str) -> Callable[..., Any]:
+    method = getattr(obj, name, None)
     if method is None:
-        raise ValueError(f"Method {name}() is required for {
-    if not inspect.isfunction(method):
-        raise ValueError(f"
+        raise ValueError(f"Method {name}() is required for {obj}")
+    if not inspect.isfunction(method) and not inspect.ismethod(method):
+        raise ValueError(f"{obj}.{name}() is not a function; {method}")
     return method
 
 
@@ -421,6 +432,252 @@ def function(**args: Any) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
     return _inner
 
 
+########################################################
+# Custom source connector
+########################################################
+
+
+@dataclasses.dataclass
+class SourceReadOptions:
+    """
+    The options for reading a source row.
+    This is argument for both `list()` and `get_value()` methods.
+    Note that in most cases (unless spelled out otherwise below) it's not a mandatory requirement, but more like a hint to say it's useful under the current context.
+
+    - include_ordinal: Whether to include the ordinal of the source row.
+      When provides_ordinal() returns True, you must provide `ordinal` in `list()` when `include_ordinal` is True.
+      It's optional for other cases. It's helpful to skip unnecessary reprocessing early, and avoid output from older version of input over-writing the latest one when there's concurrency (especially multiple processes) and source updates frequently.
+
+    - include_content_version_fp: Whether to include the content version fingerprint of the source row.
+      It's always optional even if this is True.
+      It's helpful to skip unnecessary reprocessing early.
+      You should only consider providing it if you can directly get it without computing the hash on the content.
+
+    - include_value: Whether to include the value of the source row.
+      You must provide it in `get_value()` when `include_value` is True.
+      It's optional for `list()`.
+      Consider providing it when it's significantly cheaper then calling another `get_value()` for each row.
+      It will save costs of individual `get_value()` calls.
+    """
+
+    include_ordinal: bool = False
+    include_content_version_fp: bool = False
+    include_value: bool = False
+
+
+K = TypeVar("K")
+V = TypeVar("V")
+
+NON_EXISTENCE: Literal["NON_EXISTENCE"] = "NON_EXISTENCE"
+NO_ORDINAL: Literal["NO_ORDINAL"] = "NO_ORDINAL"
+
+
+@dataclasses.dataclass
+class PartialSourceRowData(Generic[V]):
+    """
+    The data of a source row.
+
+    - value: The value of the source row. NON_EXISTENCE means the row does not exist.
+    - ordinal: The ordinal of the source row. NO_ORDINAL means ordinal is not available for the source.
+    - content_version_fp: The content version fingerprint of the source row.
+    """
+
+    value: V | Literal["NON_EXISTENCE"] | None = None
+    ordinal: int | Literal["NO_ORDINAL"] | None = None
+    content_version_fp: bytes | None = None
+
+
+@dataclasses.dataclass
+class PartialSourceRow(Generic[K, V]):
+    key: K
+    data: PartialSourceRowData[V]
+
+
+class _SourceExecutorContext:
+    _executor: Any
+
+    _key_encoder: Callable[[Any], Any]
+    _key_decoder: Callable[[Any], Any]
+
+    _value_encoder: Callable[[Any], Any]
+
+    _list_fn: Callable[
+        [SourceReadOptions],
+        AsyncIterator[PartialSourceRow[Any, Any]]
+        | Iterator[PartialSourceRow[Any, Any]],
+    ]
+    _orig_get_value_fn: Callable[..., Any]
+    _get_value_fn: Callable[..., Awaitable[PartialSourceRowData[Any]]]
+    _provides_ordinal_fn: Callable[[], bool] | None
+
+    def __init__(
+        self,
+        executor: Any,
+        key_type_info: AnalyzedTypeInfo,
+        key_decoder: Callable[[Any], Any],
+        value_type_info: AnalyzedTypeInfo,
+    ):
+        self._executor = executor
+
+        self._key_encoder = make_engine_key_encoder(key_type_info)
+        self._key_decoder = key_decoder
+        self._value_encoder = make_engine_value_encoder(value_type_info)
+
+        self._list_fn = _get_required_method(executor, "list")
+        self._orig_get_value_fn = _get_required_method(executor, "get_value")
+        self._get_value_fn = to_async_call(self._orig_get_value_fn)
+        self._provides_ordinal_fn = getattr(executor, "provides_ordinal", None)
+
+    def provides_ordinal(self) -> bool:
+        if self._provides_ordinal_fn is not None:
+            result = self._provides_ordinal_fn()
+            return bool(result)
+        else:
+            return False
+
+    async def list_async(
+        self, options: dict[str, Any]
+    ) -> AsyncIterator[tuple[Any, dict[str, Any]]]:
+        """
+        Return an async iterator that yields individual rows one by one.
+        Each yielded item is a tuple of (key, data).
+        """
+        read_options = load_engine_object(SourceReadOptions, options)
+        args = _build_args(self._list_fn, 0, options=read_options)
+        list_result = self._list_fn(*args)
+
+        # Handle both sync and async iterators
+        if hasattr(list_result, "__aiter__"):
+            async for partial_row in list_result:
+                yield (
+                    self._key_encoder(partial_row.key),
+                    self._encode_source_row_data(partial_row.data),
+                )
+        else:
+            for partial_row in list_result:
+                yield (
+                    self._key_encoder(partial_row.key),
+                    self._encode_source_row_data(partial_row.data),
+                )
+
+    async def get_value_async(
+        self,
+        raw_key: Any,
+        options: dict[str, Any],
+    ) -> dict[str, Any]:
+        key = self._key_decoder(raw_key)
+        read_options = load_engine_object(SourceReadOptions, options)
+        args = _build_args(self._orig_get_value_fn, 1, key=key, options=read_options)
+        row_data = await self._get_value_fn(*args)
+        return self._encode_source_row_data(row_data)
+
+    def _encode_source_row_data(
+        self, row_data: PartialSourceRowData[Any]
+    ) -> dict[str, Any]:
+        """Convert Python PartialSourceRowData to the format expected by Rust."""
+        return {
+            "ordinal": row_data.ordinal,
+            "content_version_fp": row_data.content_version_fp,
+            "value": (
+                NON_EXISTENCE
+                if row_data.value == NON_EXISTENCE
+                else self._value_encoder(row_data.value)
+            ),
+        }
+
+
+class _SourceConnector:
+    """
+    The connector class passed to the engine.
+    """
+
+    _spec_cls: type[Any]
+    _key_type_info: AnalyzedTypeInfo
+    _key_decoder: Callable[[Any], Any]
+    _value_type_info: AnalyzedTypeInfo
+    _table_type: EnrichedValueType
+    _connector_cls: type[Any]
+
+    _create_fn: Callable[[Any], Awaitable[Any]]
+
+    def __init__(
+        self,
+        spec_cls: type[Any],
+        key_type: Any,
+        value_type: Any,
+        connector_cls: type[Any],
+    ):
+        self._spec_cls = spec_cls
+        self._key_type_info = analyze_type_info(key_type)
+        self._value_type_info = analyze_type_info(value_type)
+        self._connector_cls = connector_cls
+
+        # TODO: We can save the intermediate step after #1083 is fixed.
+        encoded_engine_key_type = encode_enriched_type_info(self._key_type_info)
+        engine_key_type = EnrichedValueType.decode(encoded_engine_key_type)
+
+        # TODO: We can save the intermediate step after #1083 is fixed.
+        encoded_engine_value_type = encode_enriched_type_info(self._value_type_info)
+        engine_value_type = EnrichedValueType.decode(encoded_engine_value_type)
+
+        if not isinstance(engine_value_type.type, StructType):
+            raise ValueError(f"Expected a StructType, got {engine_value_type.type}")
+
+        if isinstance(engine_key_type.type, StructType):
+            key_fields_schema = engine_key_type.type.fields
+        else:
+            key_fields_schema = [
+                FieldSchema(name=KEY_FIELD_NAME, value_type=engine_key_type)
+            ]
+        self._key_decoder = make_engine_key_decoder(
+            [], key_fields_schema, self._key_type_info
+        )
+        self._table_type = EnrichedValueType(
+            type=TableType(
+                kind="KTable",
+                row=StructSchema(
+                    fields=key_fields_schema + engine_value_type.type.fields
+                ),
+                num_key_parts=len(key_fields_schema),
+            ),
+        )
+
+        self._create_fn = to_async_call(_get_required_method(connector_cls, "create"))
+
+    async def create_executor(self, raw_spec: dict[str, Any]) -> _SourceExecutorContext:
+        spec = load_engine_object(self._spec_cls, raw_spec)
+        executor = await self._create_fn(spec)
+        return _SourceExecutorContext(
+            executor, self._key_type_info, self._key_decoder, self._value_type_info
+        )
+
+    def get_table_type(self) -> Any:
+        return dump_engine_object(self._table_type)
+
+
+def source_connector(
+    *,
+    spec_cls: type[Any],
+    key_type: Any = Any,
+    value_type: Any = Any,
+) -> Callable[[type], type]:
+    """
+    Decorate a class to provide a source connector for an op.
+    """
+
+    # Validate the spec_cls is a SourceSpec.
+    if not issubclass(spec_cls, SourceSpec):
+        raise ValueError(f"Expect a SourceSpec, got {spec_cls}")
+
+    # Register the source connector.
+    def _inner(connector_cls: type) -> type:
+        connector = _SourceConnector(spec_cls, key_type, value_type, connector_cls)
+        _engine.register_source_connector(spec_cls.__name__, connector)
+        return connector_cls
+
+    return _inner
+
+
 ########################################################
 # Custom target connector
 ########################################################
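For orientation, a hedged sketch of how a user-defined source might plug into the new `source_connector` decorator follows. It assumes `SourceSpec`, `SourceReadOptions`, `PartialSourceRow`, `PartialSourceRowData`, and `NON_EXISTENCE` are importable from `cocoindex.op` as defined in the hunk above; `MemorySourceSpec`, `MemoryRowValue`, and `MemorySourceConnector` are hypothetical names, not part of the package.

import dataclasses

from cocoindex import op
from cocoindex.op import (
    PartialSourceRow,
    PartialSourceRowData,
    SourceReadOptions,
)


class MemorySourceSpec(op.SourceSpec):
    """Hypothetical spec: the rows to expose are passed in directly."""

    rows: dict[str, str]


@dataclasses.dataclass
class MemoryRowValue:
    """Row value type; must map to a struct type, hence a dataclass."""

    content: str


@op.source_connector(spec_cls=MemorySourceSpec, key_type=str, value_type=MemoryRowValue)
class MemorySourceConnector:
    def __init__(self, spec: MemorySourceSpec) -> None:
        self._rows = spec.rows

    @classmethod
    def create(cls, spec: MemorySourceSpec) -> "MemorySourceConnector":
        # Looked up via _get_required_method() and awaited through to_async_call(),
        # so it may be sync (as here) or async.
        return cls(spec)

    def list(self, options: SourceReadOptions):
        # A sync or async iterator of PartialSourceRow; both are accepted by
        # _SourceExecutorContext.list_async().
        for key, content in self._rows.items():
            yield PartialSourceRow(
                key=key, data=PartialSourceRowData(value=MemoryRowValue(content))
            )

    def get_value(self, key: str, options: SourceReadOptions) -> PartialSourceRowData:
        content = self._rows.get(key)
        if content is None:
            return PartialSourceRowData(value=op.NON_EXISTENCE)
        return PartialSourceRowData(value=MemoryRowValue(content))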
cocoindex/typing.py
CHANGED
@@ -475,16 +475,16 @@ def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
     }
 
 
-def encode_enriched_type_info(
+def encode_enriched_type_info(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
     """
-    Encode an
+    Encode an `AnalyzedTypeInfo` to a CocoIndex engine's `EnrichedValueType` representation
     """
-    encoded: dict[str, Any] = {"type": _encode_type(
+    encoded: dict[str, Any] = {"type": _encode_type(type_info)}
 
-    if
-        encoded["attrs"] =
+    if type_info.attrs is not None:
+        encoded["attrs"] = type_info.attrs
 
-    if
+    if type_info.nullable:
         encoded["nullable"] = True
 
     return encoded
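As a rough illustration of the repaired function, the returned dict has the shape sketched below (hypothetical values; the exact contents of the "type" entry come from `_encode_type` and vary by type):

# Hypothetical shape of the dict returned by encode_enriched_type_info(type_info):
encoded = {
    "type": {"kind": "Str"},       # always present, produced by _encode_type(type_info)
    "attrs": {"example.attr": 1},  # only included when type_info.attrs is not None
    "nullable": True,              # only included when type_info.nullable is True
}
print(encoded)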
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cocoindex
-Version: 0.2.19
+Version: 0.2.21
 Classifier: Development Status :: 3 - Alpha
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
@@ -75,7 +75,6 @@ Project-URL: Homepage, https://cocoindex.io/
 <a href="https://trendshift.io/repositories/13939" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13939" alt="cocoindex-io%2Fcocoindex | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </div>
 
-
 Ultra performant data transformation framework for AI, with core engine written in Rust. Support incremental processing and data lineage out-of-box. Exceptional developer velocity. Production-ready at day 0.
 
 ⭐ Drop a star to help us grow!
@@ -113,9 +112,8 @@ CocoIndex makes it effortless to transform data with AI, and keep source data an
 
 </br>
 
-
-
 ## Exceptional velocity
+
 Just declare transformation in dataflow with ~100 lines of python
 
 ```python
@@ -139,6 +137,7 @@ CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_
 **Particularly**, developers don't explicitly mutate data by creating, updating and deleting. They just need to define transformation/formula for a set of source data.
 
 ## Plug-and-Play Building Blocks
+
 Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components - as easy as assembling building blocks.
 
 <p align="center">
@@ -146,6 +145,7 @@ Native builtins for different source, targets and transformations. Standardize i
 </p>
 
 ## Data Freshness
+
 CocoIndex keep source data and target in sync effortlessly.
 
 <p align="center">
@@ -153,11 +153,14 @@ CocoIndex keep source data and target in sync effortlessly.
 </p>
 
 It has out-of-box support for incremental indexing:
+
 - minimal recomputation on source or logic change.
 - (re-)processing necessary portions; reuse cache when possible
 
-## Quick Start
+## Quick Start
+
 If you're new to CocoIndex, we recommend checking out
+
 - 📖 [Documentation](https://cocoindex.io/docs)
 - ⚡ [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart)
 - 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
@@ -172,7 +175,6 @@ pip install -U cocoindex
 
 2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. CocoIndex uses it for incremental processing.
 
-
 ## Define data flow
 
 Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow. An example flow looks like:
@@ -228,6 +230,7 @@ It defines an index flow like this:
 | [Text Embedding](examples/text_embedding) | Index text documents with embeddings for semantic search |
 | [Code Embedding](examples/code_embedding) | Index code embeddings for semantic search |
 | [PDF Embedding](examples/pdf_embedding) | Parse PDF and index text embeddings for semantic search |
+| [PDF Elements Embedding](examples/pdf_elements_embedding) | Extract text and images from PDFs; embed text with SentenceTransformers and images with CLIP; store in Qdrant for multimodal search |
 | [Manuals LLM Extraction](examples/manuals_llm_extraction) | Extract structured information from a manual using LLM |
 | [Amazon S3 Embedding](examples/amazon_s3_embedding) | Index text documents from Amazon S3 |
 | [Azure Blob Storage Embedding](examples/azure_blob_embedding) | Index text documents from Azure Blob Storage |
@@ -244,16 +247,18 @@ It defines an index flow like this:
 | [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
 | [Patient intake form extraction](examples/patient_intake_extraction) | Use LLM to extract structured data from patient intake forms with different formats |
 
-
 More coming and stay tuned 👀!
 
 ## 📖 Documentation
+
 For detailed documentation, visit [CocoIndex Documentation](https://cocoindex.io/docs), including a [Quickstart guide](https://cocoindex.io/docs/getting_started/quickstart).
 
 ## 🤝 Contributing
+
 We love contributions from our community ❤️. For details on contributing or running the project for development, check out our [contributing guide](https://cocoindex.io/docs/about/contributing).
 
 ## 👥 Community
+
 Welcome with a huge coconut hug 🥥⋆。˚🤗. We are super excited for community contributions of all kinds - whether it's code improvements, documentation updates, issue reports, feature requests, and discussions in our Discord.
 
 Join our community here:
@@ -263,9 +268,11 @@ Join our community here:
 - ▶️ [Subscribe to our YouTube channel](https://www.youtube.com/@cocoindex-io)
 - 📜 [Read our blog posts](https://cocoindex.io/blogs/)
 
-## Support us
+## Support us
+
 We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ at GitHub repo [](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
 
 ## License
+
 CocoIndex is Apache 2.0 licensed.
 
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/RECORD
CHANGED
@@ -1,23 +1,22 @@
-cocoindex-0.2.
-cocoindex-0.2.
-cocoindex-0.2.
-cocoindex-0.2.
+cocoindex-0.2.21.dist-info/METADATA,sha256=ftiSHB0STa71A93RBOaTAPC3orGHXXAUvQXqxx7QquM,13868
+cocoindex-0.2.21.dist-info/WHEEL,sha256=QC3zdlgimDC1GtRrc0qfjqbzuc7G6nDvPrjaINbNOTw,95
+cocoindex-0.2.21.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
+cocoindex-0.2.21.dist-info/licenses/THIRD_PARTY_NOTICES.html,sha256=_rafrUNfyvjjCzp4oqD9ti424cq-OkzpGbOW9RFmwls,719655
 cocoindex/__init__.py,sha256=MsjYflfJHL_sKL4OxcExfRwl57JFKwqvt9AWbiHsZ3Q,2744
-cocoindex/_engine.pyd,sha256=
+cocoindex/_engine.pyd,sha256=hNPO5H7npijlntG777est-3gf91KinSDQWEjAADmBsM,76973056
 cocoindex/auth_registry.py,sha256=HK1vfKQh_6z310c8kgFDIQf9RdoiA3vWUwvFYbgybr0,1384
-cocoindex/cli.py,sha256=
+cocoindex/cli.py,sha256=9Io46yDs2pj-AdKkNTbbLoThFbB1ZyHUUYzMIgPxHlE,24758
 cocoindex/engine_object.py,sha256=JH27f2MLNUw3HQy6JuCpvRhMxuKZBkxplLIo0m7PU04,10322
-cocoindex/engine_value.py,sha256=
-cocoindex/flow.py,sha256=
-cocoindex/functions.py,sha256=iBdqJbPhtqjFrA7yLevnJ0M0piS7N6TBP7Nqhpeh5Ww,13257
+cocoindex/engine_value.py,sha256=CYS_rdJQZoLEcKVcUXupcH0u8k-6ce80Xrzd3rgUBJE,23914
+cocoindex/flow.py,sha256=MznA23dsIOlu-1uEBe6XVTfYlDjvH3bvky1BCUEnEng,41357
 cocoindex/functions/__init__.py,sha256=AZ4f7dBVZMkWyR85z9Gy1AgOAB0f1tex62CMP8MDLX4,1061
 cocoindex/functions/_engine_builtin_specs.py,sha256=6ZV91MUosqGSolnKNUjyRHZ_oTOOsQ_jMMEBGZW1EYo,1852
 cocoindex/functions/colpali.py,sha256=kinPow46CSW0PW_-a5PT9JfsCxpvxZG4ZFzQ_OYt9GU,9115
 cocoindex/functions/sbert.py,sha256=5X381jQFqnmomj_943Xrhcs1sWecA1sWZfb4On8mONg,2225
 cocoindex/index.py,sha256=C__LzwIC918VIDGsBsyLwvNBO-4BiC5Coq01Fp1zXkI,1032
 cocoindex/lib.py,sha256=cyKGdn8cfH9bkYfrnJ7dlUBO8OVZkKyrkYhWHsMFW_g,2365
-cocoindex/llm.py,sha256=
-cocoindex/op.py,sha256=
+cocoindex/llm.py,sha256=zvdvdFPsLdkVVRdNawOubkEtvdKXI_BlfJIFcDu0S6o,922
+cocoindex/op.py,sha256=pMMgPJ7KVuU1ptTIDJB9yVEntaeOrZ4in16H0Zqa1JE,37100
 cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cocoindex/query_handler.py,sha256=vsw-VQ-Fh0ekr4boKchI0wpKHqaiBoeNStClrW4Zwog,1401
 cocoindex/runtime.py,sha256=ZjyPmohIWGFDRaOdS0MgeVuf4Ra2Ggbp6xraxwvHFBo,1386
@@ -36,8 +35,8 @@ cocoindex/tests/test_optional_database.py,sha256=dnzmTgaJf37D3q8fQsjP5UDER6FYETa
 cocoindex/tests/test_transform_flow.py,sha256=DxM-7_kWeU-QzOpH77Vd5Jehbbq00xCBBgRK7mRn0kI,6237
 cocoindex/tests/test_typing.py,sha256=d75mjzAk9dDklC3llwSfPJI91d9GO_6WsgYwELGIPso,16844
 cocoindex/tests/test_validation.py,sha256=I4wr8lAMAjmy5xgG5N_OJKveXt8XIa96MsQTXhw5AnA,4677
-cocoindex/typing.py,sha256=
+cocoindex/typing.py,sha256=dC8CmltIbT85TAgiFXwmHT_lePdiHRQLrURjgZWF6oo,24716
 cocoindex/user_app_loader.py,sha256=ZkvUG9aJNNECAjwTY0ZYtNpFd9dNBPVoPKGTtB7dSZg,1926
 cocoindex/utils.py,sha256=U3W39zD2uZpXX8v84tJD7sRmbC5ar3z_ljAP1cJrYXI,618
 cocoindex/validation.py,sha256=4ZjsW-SZT8X_TEEhEE6QG6D-8Oq_TkPAhTqP0mdFYSE,3194
-cocoindex-0.2.
+cocoindex-0.2.21.dist-info/RECORD,,
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/licenses/THIRD_PARTY_NOTICES.html
CHANGED
@@ -2428,7 +2428,7 @@ Software.
 <h3 id="Apache-2.0">Apache License 2.0</h3>
 <h4>Used by:</h4>
 <ul class="license-used-by">
-<li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.19</a></li>
+<li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.21</a></li>
 <li><a href=" https://github.com/awesomized/crc-fast-rust ">crc-fast 1.3.0</a></li>
 <li><a href=" https://github.com/qdrant/rust-client ">qdrant-client 1.15.0</a></li>
 </ul>
@@ -10677,6 +10677,38 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.</pre>
+</li>
+<li class="license">
+<h3 id="MIT">MIT License</h3>
+<h4>Used by:</h4>
+<ul class="license-used-by">
+<li><a href=" https://github.com/tree-sitter/tree-sitter-scala ">tree-sitter-scala 0.24.0</a></li>
+</ul>
+<pre class="license-text">(The MIT License)
+
+Copyright (c) 2014 Nathan Rajlich <nathan@tootallnate.net>
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+</pre>
 </li>
 <li class="license">
 <h3 id="MIT">MIT License</h3>
@@ -12300,32 +12332,6 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
-</pre>
-</li>
-<li class="license">
-<h3 id="MIT">MIT License</h3>
-<h4>Used by:</h4>
-<ul class="license-used-by">
-<li><a href=" https://github.com/tree-sitter/tree-sitter-scala ">tree-sitter-scala 0.24.0</a></li>
-</ul>
-<pre class="license-text">This software is released under the MIT license:
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
-the Software, and to permit persons to whom the Software is furnished to do so,
-subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 </pre>
 </li>
 <li class="license">
cocoindex/functions.py
DELETED
@@ -1,375 +0,0 @@
-"""All builtin functions."""
-
-import dataclasses
-import functools
-from typing import Any, Literal
-
-import numpy as np
-from numpy.typing import NDArray
-
-from . import llm, op
-from .typing import Vector
-
-
-class ParseJson(op.FunctionSpec):
-    """Parse a text into a JSON object."""
-
-
-@dataclasses.dataclass
-class CustomLanguageSpec:
-    """Custom language specification."""
-
-    language_name: str
-    separators_regex: list[str]
-    aliases: list[str] = dataclasses.field(default_factory=list)
-
-
-@dataclasses.dataclass
-class ColPaliModelInfo:
-    """Data structure for ColPali model and processor."""
-
-    model: Any
-    processor: Any
-    dimension: int
-    device: Any
-
-
-class SplitRecursively(op.FunctionSpec):
-    """Split a document (in string) recursively."""
-
-    custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
-
-
-class SplitBySeparators(op.FunctionSpec):
-    """
-    Split text by specified regex separators only.
-    Output schema matches SplitRecursively for drop-in compatibility:
-    KTable rows with fields: location (Range), text (Str), start, end.
-    Args:
-        separators_regex: list[str]  # e.g., [r"\\n\\n+"]
-        keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
-        include_empty: bool = False
-        trim: bool = True
-    """
-
-    separators_regex: list[str] = dataclasses.field(default_factory=list)
-    keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
-    include_empty: bool = False
-    trim: bool = True
-
-
-class EmbedText(op.FunctionSpec):
-    """Embed a text into a vector space."""
-
-    api_type: llm.LlmApiType
-    model: str
-    address: str | None = None
-    output_dimension: int | None = None
-    task_type: str | None = None
-    api_config: llm.VertexAiConfig | None = None
-
-
-class ExtractByLlm(op.FunctionSpec):
-    """Extract information from a text using a LLM."""
-
-    llm_spec: llm.LlmSpec
-    output_type: type
-    instruction: str | None = None
-
-
-class SentenceTransformerEmbed(op.FunctionSpec):
-    """
-    `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
-
-    Args:
-
-        model: The name of the SentenceTransformer model to use.
-        args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True}
-
-    Note:
-        This function requires the optional sentence-transformers dependency.
-        Install it with: pip install 'cocoindex[embeddings]'
-    """
-
-    model: str
-    args: dict[str, Any] | None = None
-
-
-@op.executor_class(
-    gpu=True,
-    cache=True,
-    behavior_version=1,
-    arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
-)
-class SentenceTransformerEmbedExecutor:
-    """Executor for SentenceTransformerEmbed."""
-
-    spec: SentenceTransformerEmbed
-    _model: Any | None = None
-
-    def analyze(self) -> type:
-        try:
-            # Only import sentence_transformers locally when it's needed, as its import is very slow.
-            import sentence_transformers  # pylint: disable=import-outside-toplevel
-        except ImportError as e:
-            raise ImportError(
-                "sentence_transformers is required for SentenceTransformerEmbed function. "
-                "Install it with one of these commands:\n"
-                "  pip install 'cocoindex[embeddings]'\n"
-                "  pip install sentence-transformers"
-            ) from e
-
-        args = self.spec.args or {}
-        self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
-        dim = self._model.get_sentence_embedding_dimension()
-        return Vector[np.float32, Literal[dim]]  # type: ignore
-
-    def __call__(self, text: str) -> NDArray[np.float32]:
-        assert self._model is not None
-        result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
-        return result
-
-
-@functools.cache
-def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
-    """Get or load ColPali model and processor, with caching."""
-    try:
-        from colpali_engine.models import (  # type: ignore[import-untyped]
-            ColPali,
-            ColPaliProcessor,
-            ColQwen2,
-            ColQwen2Processor,
-            ColQwen2_5,
-            ColQwen2_5_Processor,
-            ColIdefics3,
-            ColIdefics3Processor,
-        )
-        from colpali_engine.utils.torch_utils import get_torch_device  # type: ignore[import-untyped]
-        import torch
-    except ImportError as e:
-        raise ImportError(
-            "ColVision models are not available. Make sure cocoindex is installed with ColPali support."
-        ) from e
-
-    device = get_torch_device("auto")
-
-    # Manual model detection based on model name
-    model_name_lower = model_name.lower()
-
-    try:
-        if "qwen2.5" in model_name_lower:
-            model = ColQwen2_5.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColQwen2_5_Processor.from_pretrained(model_name)
-        elif "qwen2" in model_name_lower:
-            model = ColQwen2.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColQwen2Processor.from_pretrained(model_name)
-        elif "colsmol" in model_name_lower or "smol" in model_name_lower:
-            # ColSmol models use Idefics3 architecture
-            model = ColIdefics3.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColIdefics3Processor.from_pretrained(model_name)
-        else:
-            # Default to ColPali
-            model = ColPali.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColPaliProcessor.from_pretrained(model_name)
-
-    except Exception as e:
-        raise RuntimeError(f"Failed to load model {model_name}: {e}")
-
-    # Get dimension from the actual model
-    dimension = _detect_colpali_dimension(model, processor, device)
-
-    return ColPaliModelInfo(
-        model=model,
-        processor=processor,
-        dimension=dimension,
-        device=device,
-    )
-
-
-def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
-    """Detect ColPali embedding dimension from the actual model config."""
-    # Try to access embedding dimension
-    if hasattr(model.config, "embedding_dim"):
-        dim = model.config.embedding_dim
-    else:
-        # Fallback: infer from output shape with dummy data
-        from PIL import Image
-        import numpy as np
-        import torch
-
-        dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
-        # Use the processor to process the dummy image
-        processed = processor.process_images([dummy_img]).to(device)
-        with torch.no_grad():
-            output = model(**processed)
-        dim = int(output.shape[-1])
-    if isinstance(dim, int):
-        return dim
-    else:
-        raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
-    return dim
-
-
-class ColPaliEmbedImage(op.FunctionSpec):
-    """
-    `ColPaliEmbedImage` embeds images using ColVision multimodal models.
-
-    Supports ALL models available in the colpali-engine library, including:
-    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
-    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
-    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
-    - Any future ColVision models supported by colpali-engine
-
-    These models use late interaction between image patch embeddings and text token
-    embeddings for retrieval.
-
-    Args:
-        model: Any ColVision model name supported by colpali-engine
-            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
-            See https://github.com/illuin-tech/colpali for the complete list of supported models.
-
-    Note:
-        This function requires the optional colpali-engine dependency.
-        Install it with: pip install 'cocoindex[colpali]'
-    """
-
-    model: str
-
-
-@op.executor_class(
-    gpu=True,
-    cache=True,
-    behavior_version=1,
-)
-class ColPaliEmbedImageExecutor:
-    """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
-
-    spec: ColPaliEmbedImage
-    _model_info: ColPaliModelInfo
-
-    def analyze(self) -> type:
-        # Get shared model and dimension
-        self._model_info = _get_colpali_model_and_processor(self.spec.model)
-
-        # Return multi-vector type: Variable patches x Fixed hidden dimension
-        dimension = self._model_info.dimension
-        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
-
-    def __call__(self, img_bytes: bytes) -> Any:
-        try:
-            from PIL import Image
-            import torch
-            import io
-        except ImportError as e:
-            raise ImportError(
-                "Required dependencies (PIL, torch) are missing for ColVision image embedding."
-            ) from e
-
-        model = self._model_info.model
-        processor = self._model_info.processor
-        device = self._model_info.device
-
-        pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        inputs = processor.process_images([pil_image]).to(device)
-        with torch.no_grad():
-            embeddings = model(**inputs)
-
-        # Return multi-vector format: [patches, hidden_dim]
-        if len(embeddings.shape) != 3:
-            raise ValueError(
-                f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
-            )
-
-        # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
-        patch_embeddings = embeddings[0]  # Remove batch dimension
-
-        return patch_embeddings.cpu().to(torch.float32).numpy()
-
-
-class ColPaliEmbedQuery(op.FunctionSpec):
-    """
-    `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
-
-    Supports ALL models available in the colpali-engine library, including:
-    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
-    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
-    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
-    - Any future ColVision models supported by colpali-engine
-
-    This produces query embeddings compatible with ColVision image embeddings
-    for late interaction scoring (MaxSim).
-
-    Args:
-        model: Any ColVision model name supported by colpali-engine
-            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
-            See https://github.com/illuin-tech/colpali for the complete list of supported models.
-
-    Note:
-        This function requires the optional colpali-engine dependency.
-        Install it with: pip install 'cocoindex[colpali]'
-    """
-
-    model: str
-
-
-@op.executor_class(
-    gpu=True,
-    cache=True,
-    behavior_version=1,
-)
-class ColPaliEmbedQueryExecutor:
-    """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
-
-    spec: ColPaliEmbedQuery
-    _model_info: ColPaliModelInfo
-
-    def analyze(self) -> type:
-        # Get shared model and dimension
-        self._model_info = _get_colpali_model_and_processor(self.spec.model)
-
-        # Return multi-vector type: Variable tokens x Fixed hidden dimension
-        dimension = self._model_info.dimension
-        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
-
-    def __call__(self, query: str) -> Any:
-        try:
-            import torch
-        except ImportError as e:
-            raise ImportError(
-                "Required dependencies (torch) are missing for ColVision query embedding."
-            ) from e
-
-        model = self._model_info.model
-        processor = self._model_info.processor
-        device = self._model_info.device
-
-        inputs = processor.process_queries([query]).to(device)
-        with torch.no_grad():
-            embeddings = model(**inputs)
-
-        # Return multi-vector format: [tokens, hidden_dim]
-        if len(embeddings.shape) != 3:
-            raise ValueError(
-                f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
-            )
-
-        # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
-        token_embeddings = embeddings[0]  # Remove batch dimension
-
-        return token_embeddings.cpu().to(torch.float32).numpy()
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/WHEEL
File without changes
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/entry_points.txt
File without changes