cocoindex-0.2.15-cp311-abi3-manylinux_2_28_aarch64.whl → cocoindex-0.2.17-cp311-abi3-manylinux_2_28_aarch64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +1 -1
- cocoindex/cli.py +121 -41
- cocoindex/engine_object.py +272 -0
- cocoindex/{convert.py → engine_value.py} +64 -208
- cocoindex/flow.py +17 -10
- cocoindex/functions/__init__.py +45 -0
- cocoindex/functions/_engine_builtin_specs.py +62 -0
- cocoindex/functions/colpali.py +250 -0
- cocoindex/functions/sbert.py +63 -0
- cocoindex/lib.py +1 -1
- cocoindex/op.py +7 -3
- cocoindex/sources/__init__.py +5 -0
- cocoindex/{sources.py → sources/_engine_builtin_specs.py} +3 -3
- cocoindex/targets/_engine_builtin_specs.py +9 -0
- cocoindex/tests/test_engine_object.py +331 -0
- cocoindex/tests/{test_convert.py → test_engine_value.py} +150 -26
- cocoindex/typing.py +125 -3
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/METADATA +4 -1
- cocoindex-0.2.17.dist-info/RECORD +43 -0
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/WHEEL +1 -1
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/licenses/THIRD_PARTY_NOTICES.html +22 -19
- cocoindex/tests/test_load_convert.py +0 -118
- cocoindex-0.2.15.dist-info/RECORD +0 -37
- {cocoindex-0.2.15.dist-info → cocoindex-0.2.17.dist-info}/entry_points.txt +0 -0
cocoindex/{convert.py → engine_value.py}
RENAMED
@@ -1,18 +1,15 @@
 """
-Utilities to
+Utilities to encode/decode values in cocoindex (for data).
 """
 
 from __future__ import annotations
 
 import dataclasses
-import datetime
 import inspect
 import warnings
-from enum import Enum
-from typing import Any, Callable, Mapping, get_origin, TypeVar, overload
+from typing import Any, Callable, Mapping, TypeVar
 
 import numpy as np
-
 from .typing import (
     AnalyzedAnyType,
     AnalyzedBasicType,
@@ -22,18 +19,17 @@ from .typing import (
     AnalyzedTypeInfo,
     AnalyzedUnionType,
     AnalyzedUnknownType,
-    EnrichedValueType,
     analyze_type_info,
-    encode_enriched_type,
     is_namedtuple_type,
+    is_pydantic_model,
     is_numpy_number_type,
-    extract_ndarray_elem_dtype,
     ValueType,
     FieldSchema,
     BasicValueType,
     StructType,
     TableType,
 )
+from .engine_object import get_auto_default_for_type
 
 
 T = TypeVar("T")
@@ -167,6 +163,29 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
 
         return encode_namedtuple
 
+    elif is_pydantic_model(struct_type):
+        # Type guard: ensure we have model_fields attribute
+        if hasattr(struct_type, "model_fields"):
+            field_names = list(struct_type.model_fields.keys())  # type: ignore[attr-defined]
+            field_encoders = [
+                make_engine_value_encoder(
+                    analyze_type_info(struct_type.model_fields[name].annotation)  # type: ignore[attr-defined]
+                )
+                for name in field_names
+            ]
+        else:
+            raise ValueError(f"Invalid Pydantic model: {struct_type}")
+
+        def encode_pydantic(value: Any) -> Any:
+            if value is None:
+                return None
+            return [
+                encoder(getattr(value, name))
+                for encoder, name in zip(field_encoders, field_names)
+            ]
+
+        return encode_pydantic
+
     def encode_basic_value(value: Any) -> Any:
         if isinstance(value, np.number):
             return value.item()
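The new branch mirrors the existing dataclass and NamedTuple paths: a Pydantic model is encoded as a plain list of per-field encoded values, in model_fields declaration order. A minimal standalone sketch of that shape (the User model is hypothetical, identity functions stand in for the real per-field engine encoders, and Pydantic v2's model_fields API is assumed, as in the diff):

# Sketch: encode a Pydantic model into a positional list of field values,
# mirroring the model_fields-driven branch added above.
from typing import Any, Callable
from pydantic import BaseModel

class User(BaseModel):  # hypothetical example model
    name: str
    age: int = 0

def make_list_encoder(model_cls: type[BaseModel]) -> Callable[[Any], Any]:
    field_names = list(model_cls.model_fields.keys())
    # Identity encoders stand in for make_engine_value_encoder(...) results.
    field_encoders: list[Callable[[Any], Any]] = [lambda v: v] * len(field_names)

    def encode(value: Any) -> Any:
        if value is None:
            return None
        return [enc(getattr(value, name)) for enc, name in zip(field_encoders, field_names)]

    return encode

encoder = make_list_encoder(User)
assert encoder(User(name="ada")) == ["ada", 0]  # fields in declaration order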
@@ -420,30 +439,6 @@ def make_engine_value_decoder(
     return lambda value: value
 
 
-def _get_auto_default_for_type(
-    type_info: AnalyzedTypeInfo,
-) -> tuple[Any, bool]:
-    """
-    Get an auto-default value for a type annotation if it's safe to do so.
-
-    Returns:
-        A tuple of (default_value, is_supported) where:
-        - default_value: The default value if auto-defaulting is supported
-        - is_supported: True if auto-defaulting is supported for this type
-    """
-    # Case 1: Nullable types (Optional[T] or T | None)
-    if type_info.nullable:
-        return None, True
-
-    # Case 2: Table types (KTable or LTable) - check if it's a list or dict type
-    if isinstance(type_info.variant, AnalyzedListType):
-        return [], True
-    elif isinstance(type_info.variant, AnalyzedDictType):
-        return {}, True
-
-    return None, False
-
-
 def make_engine_struct_decoder(
     field_path: list[str],
     src_fields: list[FieldSchema],
@@ -472,7 +467,7 @@ def make_engine_struct_decoder(
     if not isinstance(dst_type_variant, AnalyzedStructType):
         raise ValueError(
             f"Type mismatch for `{''.join(field_path)}`: "
-            f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple or dict[str, Any] expected"
+            f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple, Pydantic model or dict[str, Any] expected"
         )
 
     src_name_to_idx = {f.name: i for i, f in enumerate(src_fields)}
@@ -495,6 +490,26 @@ def make_engine_struct_decoder(
             )
             for name in fields
         }
+    elif is_pydantic_model(dst_struct_type):
+        # For Pydantic models, we can use model_fields to get field information
+        parameters = {}
+        # Type guard: ensure we have model_fields attribute
+        if hasattr(dst_struct_type, "model_fields"):
+            model_fields = dst_struct_type.model_fields  # type: ignore[attr-defined]
+        else:
+            model_fields = {}
+        for name, field_info in model_fields.items():
+            default_value = (
+                field_info.default
+                if field_info.default is not ...
+                else inspect.Parameter.empty
+            )
+            parameters[name] = inspect.Parameter(
+                name=name,
+                kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                default=default_value,
+                annotation=field_info.annotation,
+            )
     else:
         raise ValueError(f"Unsupported struct type: {dst_struct_type}")
 
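This branch normalizes Pydantic fields into the same inspect.Parameter form already used for dataclass and NamedTuple fields, so the downstream closure logic stays shared. A standalone sketch (the Point model is hypothetical; note the caveat in the comments about Pydantic v2's required-field sentinel):

# Sketch: derive inspect.Parameter entries from a Pydantic model's fields.
import inspect
from pydantic import BaseModel

class Point(BaseModel):  # hypothetical example model
    x: float
    y: float = 0.0

parameters: dict[str, inspect.Parameter] = {}
for name, field_info in Point.model_fields.items():
    # The diff treats a default of `...` as "required"; note that Pydantic v2
    # reports required fields as PydanticUndefined rather than `...`, so the
    # empty sentinel below only triggers for literal Ellipsis defaults.
    default_value = (
        field_info.default if field_info.default is not ... else inspect.Parameter.empty
    )
    parameters[name] = inspect.Parameter(
        name=name,
        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
        default=default_value,
        annotation=field_info.annotation,
    )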
@@ -518,7 +533,7 @@ def make_engine_struct_decoder(
         if default_value is not inspect.Parameter.empty:
             return lambda _: default_value
 
-        auto_default, is_supported = _get_auto_default_for_type(type_info)
+        auto_default, is_supported = get_auto_default_for_type(type_info)
         if is_supported:
             warnings.warn(
                 f"Field '{name}' (type {param.annotation}) without default value is missing in input: "
@@ -536,9 +551,21 @@ def make_engine_struct_decoder(
         make_closure_for_field(name, param) for (name, param) in parameters.items()
     ]
 
-    return lambda values: dst_struct_type(
-        *(decoder(values) for decoder in field_value_decoder)
-    )
+    # Different construction for different struct types
+    if is_pydantic_model(dst_struct_type):
+        # Pydantic models prefer keyword arguments
+        field_names = list(parameters.keys())
+        return lambda values: dst_struct_type(
+            **{
+                field_names[i]: decoder(values)
+                for i, decoder in enumerate(field_value_decoder)
+            }
+        )
+    else:
+        # Dataclasses and NamedTuples can use positional arguments
+        return lambda values: dst_struct_type(
+            *(decoder(values) for decoder in field_value_decoder)
+        )
 
 
 def _make_engine_struct_to_dict_decoder(
@@ -606,174 +633,3 @@ def _make_engine_struct_to_tuple_decoder(
     )
 
     return decode_to_tuple
-
-
-def dump_engine_object(v: Any) -> Any:
-    """Recursively dump an object for engine. Engine side uses `Pythonized` to catch."""
-    if v is None:
-        return None
-    elif isinstance(v, EnrichedValueType):
-        return v.encode()
-    elif isinstance(v, FieldSchema):
-        return v.encode()
-    elif isinstance(v, type) or get_origin(v) is not None:
-        return encode_enriched_type(v)
-    elif isinstance(v, Enum):
-        return v.value
-    elif isinstance(v, datetime.timedelta):
-        total_secs = v.total_seconds()
-        secs = int(total_secs)
-        nanos = int((total_secs - secs) * 1e9)
-        return {"secs": secs, "nanos": nanos}
-    elif is_namedtuple_type(type(v)):
-        # Handle NamedTuple objects specifically to use dict format
-        field_names = list(getattr(type(v), "_fields", ()))
-        result = {}
-        for name in field_names:
-            val = getattr(v, name)
-            result[name] = dump_engine_object(val)  # Include all values, including None
-        if hasattr(v, "kind") and "kind" not in result:
-            result["kind"] = v.kind
-        return result
-    elif hasattr(v, "__dict__"):  # for dataclass-like objects
-        s = {}
-        for k, val in v.__dict__.items():
-            if val is None:
-                # Skip None values
-                continue
-            s[k] = dump_engine_object(val)
-        if hasattr(v, "kind") and "kind" not in s:
-            s["kind"] = v.kind
-        return s
-    elif isinstance(v, (list, tuple)):
-        return [dump_engine_object(item) for item in v]
-    elif isinstance(v, np.ndarray):
-        return v.tolist()
-    elif isinstance(v, dict):
-        return {k: dump_engine_object(v) for k, v in v.items()}
-    return v
-
-
-@overload
-def load_engine_object(expected_type: type[T], v: Any) -> T: ...
-@overload
-def load_engine_object(expected_type: Any, v: Any) -> Any: ...
-def load_engine_object(expected_type: Any, v: Any) -> Any:
-    """Recursively load an object that was produced by dump_engine_object().
-
-    Args:
-        expected_type: The Python type annotation to reconstruct to.
-        v: The engine-facing Pythonized object (e.g., dict/list/primitive) to convert.
-
-    Returns:
-        A Python object matching the expected_type where possible.
-    """
-    # Fast path
-    if v is None:
-        return None
-
-    type_info = analyze_type_info(expected_type)
-    variant = type_info.variant
-
-    if type_info.core_type is EnrichedValueType:
-        return EnrichedValueType.decode(v)
-    if type_info.core_type is FieldSchema:
-        return FieldSchema.decode(v)
-
-    # Any or unknown → return as-is
-    if isinstance(variant, AnalyzedAnyType) or type_info.base_type is Any:
-        return v
-
-    # Enum handling
-    if isinstance(expected_type, type) and issubclass(expected_type, Enum):
-        return expected_type(v)
-
-    # TimeDelta special form {secs, nanos}
-    if isinstance(variant, AnalyzedBasicType) and variant.kind == "TimeDelta":
-        if isinstance(v, Mapping) and "secs" in v and "nanos" in v:
-            secs = int(v["secs"])  # type: ignore[index]
-            nanos = int(v["nanos"])  # type: ignore[index]
-            return datetime.timedelta(seconds=secs, microseconds=nanos / 1_000)
-        return v
-
-    # List, NDArray (Vector-ish), or general sequences
-    if isinstance(variant, AnalyzedListType):
-        elem_type = variant.elem_type if variant.elem_type else Any
-        if type_info.base_type is np.ndarray:
-            # Reconstruct NDArray with appropriate dtype if available
-            try:
-                dtype = extract_ndarray_elem_dtype(type_info.core_type)
-            except (TypeError, ValueError, AttributeError):
-                dtype = None
-            return np.array(v, dtype=dtype)
-        # Regular Python list
-        return [load_engine_object(elem_type, item) for item in v]
-
-    # Dict / Mapping
-    if isinstance(variant, AnalyzedDictType):
-        key_t = variant.key_type
-        val_t = variant.value_type
-        return {
-            load_engine_object(key_t, k): load_engine_object(val_t, val)
-            for k, val in v.items()
-        }
-
-    # Structs (dataclass or NamedTuple)
-    if isinstance(variant, AnalyzedStructType):
-        struct_type = variant.struct_type
-        if dataclasses.is_dataclass(struct_type):
-            if not isinstance(v, Mapping):
-                raise ValueError(f"Expected dict for dataclass, got {type(v)}")
-            # Drop auxiliary discriminator "kind" if present
-            dc_init_kwargs: dict[str, Any] = {}
-            field_types = {f.name: f.type for f in dataclasses.fields(struct_type)}
-            for name, f_type in field_types.items():
-                if name in v:
-                    dc_init_kwargs[name] = load_engine_object(f_type, v[name])
-            return struct_type(**dc_init_kwargs)
-        elif is_namedtuple_type(struct_type):
-            if not isinstance(v, Mapping):
-                raise ValueError(f"Expected dict for NamedTuple, got {type(v)}")
-            # Dict format (from dump/load functions)
-            annotations = getattr(struct_type, "__annotations__", {})
-            field_names = list(getattr(struct_type, "_fields", ()))
-            nt_init_kwargs: dict[str, Any] = {}
-            for name in field_names:
-                f_type = annotations.get(name, Any)
-                if name in v:
-                    nt_init_kwargs[name] = load_engine_object(f_type, v[name])
-            return struct_type(**nt_init_kwargs)
-        return v
-
-    # Union with discriminator support via "kind"
-    if isinstance(variant, AnalyzedUnionType):
-        if isinstance(v, Mapping) and "kind" in v:
-            discriminator = v["kind"]
-            for typ in variant.variant_types:
-                t_info = analyze_type_info(typ)
-                if isinstance(t_info.variant, AnalyzedStructType):
-                    t_struct = t_info.variant.struct_type
-                    candidate_kind = getattr(t_struct, "kind", None)
-                    if candidate_kind == discriminator:
-                        # Remove discriminator for constructor
-                        v_wo_kind = dict(v)
-                        v_wo_kind.pop("kind", None)
-                        return load_engine_object(t_struct, v_wo_kind)
-        # Fallback: try each variant until one succeeds
-        for typ in variant.variant_types:
-            try:
-                return load_engine_object(typ, v)
-            except (TypeError, ValueError):
-                continue
-        return v
-
-    # Basic types and everything else: handle numpy scalars and passthrough
-    if isinstance(v, np.ndarray) and type_info.base_type is list:
-        return v.tolist()
-    if isinstance(v, (list, tuple)) and type_info.base_type not in (list, tuple):
-        # If a non-sequence basic type expected, attempt direct cast
-        try:
-            return type_info.core_type(v)
-        except (TypeError, ValueError):
-            return v
-    return v
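dump_engine_object and load_engine_object are not gone: per the file list they moved to the new cocoindex/engine_object.py (+272 lines, with tests in test_engine_object.py). One concrete behavior visible in the removed code is the timedelta wire format, a {"secs", "nanos"} mapping; a standalone sketch of that round trip:

# Sketch: the timedelta <-> {"secs", "nanos"} encoding used by the removed
# dump_engine_object/load_engine_object pair (now in engine_object.py).
import datetime

def dump_timedelta(v: datetime.timedelta) -> dict:
    total_secs = v.total_seconds()
    secs = int(total_secs)
    nanos = int((total_secs - secs) * 1e9)
    return {"secs": secs, "nanos": nanos}

def load_timedelta(v: dict) -> datetime.timedelta:
    # nanos are carried back in as fractional microseconds
    return datetime.timedelta(seconds=int(v["secs"]), microseconds=int(v["nanos"]) / 1_000)

td = datetime.timedelta(seconds=1, milliseconds=500)
assert dump_timedelta(td) == {"secs": 1, "nanos": 500_000_000}
assert load_timedelta(dump_timedelta(td)) == td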
cocoindex/flow.py
CHANGED
@@ -17,7 +17,6 @@ from typing import (
     Callable,
     Generic,
     Iterable,
-    NamedTuple,
     Sequence,
     TypeVar,
     cast,
@@ -32,8 +31,8 @@ from . import _engine  # type: ignore
 from . import index
 from . import op
 from . import setting
-from .convert import (
-    dump_engine_object,
+from .engine_object import dump_engine_object
+from .engine_value import (
     make_engine_value_decoder,
     make_engine_value_encoder,
 )
@@ -406,6 +405,7 @@ class DataCollector:
         /,
         *,
         primary_key_fields: Sequence[str],
+        attachments: Sequence[op.TargetAttachmentSpec] = (),
         vector_indexes: Sequence[index.VectorIndexDef] = (),
         vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
         setup_by_user: bool = False,
@@ -437,6 +437,10 @@ class DataCollector:
             target_name,
             _spec_kind(target_spec),
             dump_engine_object(target_spec),
+            [
+                {"kind": _spec_kind(att), **dump_engine_object(att)}
+                for att in attachments
+            ],
             dump_engine_object(index_options),
             self._engine_data_collector,
             setup_by_user,
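Together these two hunks add an attachments keyword to DataCollector.export, with each attachment serialized for the engine as {"kind": <spec kind>, **dump_engine_object(spec)}. A hedged usage sketch: Postgres is cocoindex's builtin target spec, but MyAttachment is a hypothetical op.TargetAttachmentSpec subclass, since this diff shows no concrete attachment specs:

# Hypothetical usage of the new `attachments` parameter on export().
import cocoindex
from cocoindex import op

class MyAttachment(op.TargetAttachmentSpec):  # hypothetical attachment spec
    pass

def export_embeddings(collector: "cocoindex.DataCollector") -> None:
    collector.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["id"],
        attachments=[MyAttachment()],
    )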
@@ -575,7 +579,8 @@ class FlowLiveUpdaterOptions:
     print_stats: bool = False
 
 
-class FlowUpdaterStatusUpdates(NamedTuple):
+@dataclass
+class FlowUpdaterStatusUpdates:
     """
     Status updates for a flow updater.
     """
@@ -1060,12 +1065,14 @@ def _get_data_slice_annotation_type(
 _transform_flow_name_builder = _NameBuilder()
 
 
-class TransformFlowInfo(NamedTuple):
+@dataclass
+class TransformFlowInfo(Generic[T]):
     engine_flow: _engine.TransientFlow
     result_decoder: Callable[[Any], T]
 
 
-class FlowArgInfo(NamedTuple):
+@dataclass
+class FlowArgInfo:
     name: str
     type_hint: Any
     encoder: Callable[[Any], Any]
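These conversions line up with the NamedTuple import removal in the first flow.py hunk. One observable gain, used by the hunks below, is that TransformFlowInfo can now be parameterized as TransformFlowInfo[T]: a NamedTuple that also subclasses Generic[T] is only supported on Python 3.11+, while a generic dataclass works on all versions cocoindex targets. A minimal sketch:

# A dataclass can subclass Generic[T] on any modern Python, enabling
# TransformFlowInfo[T]-style annotations; NamedTuple + Generic[T] needs 3.11+.
from dataclasses import dataclass
from typing import Any, Callable, Generic, TypeVar

T = TypeVar("T")

@dataclass
class Info(Generic[T]):  # stand-in for TransformFlowInfo
    engine_flow: Any
    result_decoder: Callable[[Any], T]

info: Info[int] = Info(engine_flow=object(), result_decoder=int)
assert info.result_decoder("42") == 42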
@@ -1081,7 +1088,7 @@ class TransformFlow(Generic[T]):
     _args_info: list[FlowArgInfo]
 
     _lazy_lock: asyncio.Lock
-    _lazy_flow_info: TransformFlowInfo | None = None
+    _lazy_flow_info: TransformFlowInfo[T] | None = None
 
     def __init__(
         self,
@@ -1123,12 +1130,12 @@ class TransformFlow(Generic[T]):
         return self._flow_fn(*args, **kwargs)
 
     @property
-    def _flow_info(self) -> TransformFlowInfo:
+    def _flow_info(self) -> TransformFlowInfo[T]:
         if self._lazy_flow_info is not None:
             return self._lazy_flow_info
         return execution_context.run(self._flow_info_async())
 
-    async def _flow_info_async(self) -> TransformFlowInfo:
+    async def _flow_info_async(self) -> TransformFlowInfo[T]:
         if self._lazy_flow_info is not None:
             return self._lazy_flow_info
         async with self._lazy_lock:
@@ -1136,7 +1143,7 @@ class TransformFlow(Generic[T]):
             self._lazy_flow_info = await self._build_flow_info_async()
             return self._lazy_flow_info
 
-    async def _build_flow_info_async(self) -> TransformFlowInfo:
+    async def _build_flow_info_async(self) -> TransformFlowInfo[T]:
         flow_builder_state = _FlowBuilderState(self._flow_name)
         kwargs: dict[str, DataSlice[T]] = {}
         for arg_info in self._args_info:
cocoindex/functions/__init__.py
ADDED
@@ -0,0 +1,45 @@
+"""Functions module for cocoindex.
+
+This module provides various function specifications and executors for data processing,
+including embedding functions, text processing, and multimodal operations.
+"""
+
+# Import all engine builtin function specs
+from ._engine_builtin_specs import (
+    ParseJson,
+    SplitRecursively,
+    SplitBySeparators,
+    EmbedText,
+    ExtractByLlm,
+)
+
+# Import SentenceTransformer embedding functionality
+from .sbert import (
+    SentenceTransformerEmbed,
+    SentenceTransformerEmbedExecutor,
+)
+
+# Import ColPali multimodal embedding functionality
+from .colpali import (
+    ColPaliEmbedImage,
+    ColPaliEmbedImageExecutor,
+    ColPaliEmbedQuery,
+    ColPaliEmbedQueryExecutor,
+)
+
+__all__ = [
+    # Engine builtin specs
+    "ParseJson",
+    "SplitRecursively",
+    "SplitBySeparators",
+    "EmbedText",
+    "ExtractByLlm",
+    # SentenceTransformer
+    "SentenceTransformerEmbed",
+    "SentenceTransformerEmbedExecutor",
+    # ColPali
+    "ColPaliEmbedImage",
+    "ColPaliEmbedImageExecutor",
+    "ColPaliEmbedQuery",
+    "ColPaliEmbedQueryExecutor",
+]
cocoindex/functions/_engine_builtin_specs.py
ADDED
@@ -0,0 +1,62 @@
+"""All builtin function specs."""
+
+import dataclasses
+from typing import Literal
+
+from .. import llm, op
+
+
+class ParseJson(op.FunctionSpec):
+    """Parse a text into a JSON object."""
+
+
+@dataclasses.dataclass
+class CustomLanguageSpec:
+    """Custom language specification."""
+
+    language_name: str
+    separators_regex: list[str]
+    aliases: list[str] = dataclasses.field(default_factory=list)
+
+
+class SplitRecursively(op.FunctionSpec):
+    """Split a document (in string) recursively."""
+
+    custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
+
+
+class SplitBySeparators(op.FunctionSpec):
+    """
+    Split text by specified regex separators only.
+    Output schema matches SplitRecursively for drop-in compatibility:
+    KTable rows with fields: location (Range), text (Str), start, end.
+    Args:
+        separators_regex: list[str]  # e.g., [r"\\n\\n+"]
+        keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+        include_empty: bool = False
+        trim: bool = True
+    """
+
+    separators_regex: list[str] = dataclasses.field(default_factory=list)
+    keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+    include_empty: bool = False
+    trim: bool = True
+
+
+class EmbedText(op.FunctionSpec):
+    """Embed a text into a vector space."""
+
+    api_type: llm.LlmApiType
+    model: str
+    address: str | None = None
+    output_dimension: int | None = None
+    task_type: str | None = None
+    api_config: llm.VertexAiConfig | None = None
+
+
+class ExtractByLlm(op.FunctionSpec):
+    """Extract information from a text using a LLM."""
+
+    llm_spec: llm.LlmSpec
+    output_type: type
+    instruction: str | None = None