cocoindex 0.2.15__cp311-abi3-macosx_11_0_arm64.whl → 0.2.17__cp311-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,15 @@
  """
- Utilities to convert between Python and engine values.
+ Utilities to encode/decode values in cocoindex (for data).
  """

  from __future__ import annotations

  import dataclasses
- import datetime
  import inspect
  import warnings
- from enum import Enum
- from typing import Any, Callable, Mapping, get_origin, TypeVar, overload
+ from typing import Any, Callable, Mapping, TypeVar

  import numpy as np
-
  from .typing import (
      AnalyzedAnyType,
      AnalyzedBasicType,
@@ -22,18 +19,17 @@ from .typing import (
      AnalyzedTypeInfo,
      AnalyzedUnionType,
      AnalyzedUnknownType,
-     EnrichedValueType,
      analyze_type_info,
-     encode_enriched_type,
      is_namedtuple_type,
+     is_pydantic_model,
      is_numpy_number_type,
-     extract_ndarray_elem_dtype,
      ValueType,
      FieldSchema,
      BasicValueType,
      StructType,
      TableType,
  )
+ from .engine_object import get_auto_default_for_type


  T = TypeVar("T")
@@ -167,6 +163,29 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:

          return encode_namedtuple

+     elif is_pydantic_model(struct_type):
+         # Type guard: ensure we have model_fields attribute
+         if hasattr(struct_type, "model_fields"):
+             field_names = list(struct_type.model_fields.keys())  # type: ignore[attr-defined]
+             field_encoders = [
+                 make_engine_value_encoder(
+                     analyze_type_info(struct_type.model_fields[name].annotation)  # type: ignore[attr-defined]
+                 )
+                 for name in field_names
+             ]
+         else:
+             raise ValueError(f"Invalid Pydantic model: {struct_type}")
+
+         def encode_pydantic(value: Any) -> Any:
+             if value is None:
+                 return None
+             return [
+                 encoder(getattr(value, name))
+                 for encoder, name in zip(field_encoders, field_names)
+             ]
+
+         return encode_pydantic
+
  def encode_basic_value(value: Any) -> Any:
      if isinstance(value, np.number):
          return value.item()
@@ -420,30 +439,6 @@ def make_engine_value_decoder(
      return lambda value: value


- def _get_auto_default_for_type(
-     type_info: AnalyzedTypeInfo,
- ) -> tuple[Any, bool]:
-     """
-     Get an auto-default value for a type annotation if it's safe to do so.
-
-     Returns:
-         A tuple of (default_value, is_supported) where:
-         - default_value: The default value if auto-defaulting is supported
-         - is_supported: True if auto-defaulting is supported for this type
-     """
-     # Case 1: Nullable types (Optional[T] or T | None)
-     if type_info.nullable:
-         return None, True
-
-     # Case 2: Table types (KTable or LTable) - check if it's a list or dict type
-     if isinstance(type_info.variant, AnalyzedListType):
-         return [], True
-     elif isinstance(type_info.variant, AnalyzedDictType):
-         return {}, True
-
-     return None, False
-
-
  def make_engine_struct_decoder(
      field_path: list[str],
      src_fields: list[FieldSchema],
@@ -472,7 +467,7 @@ def make_engine_struct_decoder(
      if not isinstance(dst_type_variant, AnalyzedStructType):
          raise ValueError(
              f"Type mismatch for `{''.join(field_path)}`: "
-             f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple or dict[str, Any] expected"
+             f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple, Pydantic model or dict[str, Any] expected"
          )

      src_name_to_idx = {f.name: i for i, f in enumerate(src_fields)}
@@ -495,6 +490,26 @@ def make_engine_struct_decoder(
              )
              for name in fields
          }
+     elif is_pydantic_model(dst_struct_type):
+         # For Pydantic models, we can use model_fields to get field information
+         parameters = {}
+         # Type guard: ensure we have model_fields attribute
+         if hasattr(dst_struct_type, "model_fields"):
+             model_fields = dst_struct_type.model_fields  # type: ignore[attr-defined]
+         else:
+             model_fields = {}
+         for name, field_info in model_fields.items():
+             default_value = (
+                 field_info.default
+                 if field_info.default is not ...
+                 else inspect.Parameter.empty
+             )
+             parameters[name] = inspect.Parameter(
+                 name=name,
+                 kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                 default=default_value,
+                 annotation=field_info.annotation,
+             )
      else:
          raise ValueError(f"Unsupported struct type: {dst_struct_type}")

@@ -518,7 +533,7 @@ def make_engine_struct_decoder(
          if default_value is not inspect.Parameter.empty:
              return lambda _: default_value

-         auto_default, is_supported = _get_auto_default_for_type(type_info)
+         auto_default, is_supported = get_auto_default_for_type(type_info)
          if is_supported:
              warnings.warn(
                  f"Field '{name}' (type {param.annotation}) without default value is missing in input: "
@@ -536,9 +551,21 @@ def make_engine_struct_decoder(
          make_closure_for_field(name, param) for (name, param) in parameters.items()
      ]

-     return lambda values: dst_struct_type(
-         *(decoder(values) for decoder in field_value_decoder)
-     )
+     # Different construction for different struct types
+     if is_pydantic_model(dst_struct_type):
+         # Pydantic models prefer keyword arguments
+         field_names = list(parameters.keys())
+         return lambda values: dst_struct_type(
+             **{
+                 field_names[i]: decoder(values)
+                 for i, decoder in enumerate(field_value_decoder)
+             }
+         )
+     else:
+         # Dataclasses and NamedTuples can use positional arguments
+         return lambda values: dst_struct_type(
+             *(decoder(values) for decoder in field_value_decoder)
+         )


  def _make_engine_struct_to_dict_decoder(
@@ -606,174 +633,3 @@ def _make_engine_struct_to_tuple_decoder(
      )

      return decode_to_tuple
-
-
- def dump_engine_object(v: Any) -> Any:
-     """Recursively dump an object for engine. Engine side uses `Pythonized` to catch."""
-     if v is None:
-         return None
-     elif isinstance(v, EnrichedValueType):
-         return v.encode()
-     elif isinstance(v, FieldSchema):
-         return v.encode()
-     elif isinstance(v, type) or get_origin(v) is not None:
-         return encode_enriched_type(v)
-     elif isinstance(v, Enum):
-         return v.value
-     elif isinstance(v, datetime.timedelta):
-         total_secs = v.total_seconds()
-         secs = int(total_secs)
-         nanos = int((total_secs - secs) * 1e9)
-         return {"secs": secs, "nanos": nanos}
-     elif is_namedtuple_type(type(v)):
-         # Handle NamedTuple objects specifically to use dict format
-         field_names = list(getattr(type(v), "_fields", ()))
-         result = {}
-         for name in field_names:
-             val = getattr(v, name)
-             result[name] = dump_engine_object(val)  # Include all values, including None
-         if hasattr(v, "kind") and "kind" not in result:
-             result["kind"] = v.kind
-         return result
-     elif hasattr(v, "__dict__"):  # for dataclass-like objects
-         s = {}
-         for k, val in v.__dict__.items():
-             if val is None:
-                 # Skip None values
-                 continue
-             s[k] = dump_engine_object(val)
-         if hasattr(v, "kind") and "kind" not in s:
-             s["kind"] = v.kind
-         return s
-     elif isinstance(v, (list, tuple)):
-         return [dump_engine_object(item) for item in v]
-     elif isinstance(v, np.ndarray):
-         return v.tolist()
-     elif isinstance(v, dict):
-         return {k: dump_engine_object(v) for k, v in v.items()}
-     return v
-
-
- @overload
- def load_engine_object(expected_type: type[T], v: Any) -> T: ...
- @overload
- def load_engine_object(expected_type: Any, v: Any) -> Any: ...
- def load_engine_object(expected_type: Any, v: Any) -> Any:
-     """Recursively load an object that was produced by dump_engine_object().
-
-     Args:
-         expected_type: The Python type annotation to reconstruct to.
-         v: The engine-facing Pythonized object (e.g., dict/list/primitive) to convert.
-
-     Returns:
-         A Python object matching the expected_type where possible.
-     """
-     # Fast path
-     if v is None:
-         return None
-
-     type_info = analyze_type_info(expected_type)
-     variant = type_info.variant
-
-     if type_info.core_type is EnrichedValueType:
-         return EnrichedValueType.decode(v)
-     if type_info.core_type is FieldSchema:
-         return FieldSchema.decode(v)
-
-     # Any or unknown → return as-is
-     if isinstance(variant, AnalyzedAnyType) or type_info.base_type is Any:
-         return v
-
-     # Enum handling
-     if isinstance(expected_type, type) and issubclass(expected_type, Enum):
-         return expected_type(v)
-
-     # TimeDelta special form {secs, nanos}
-     if isinstance(variant, AnalyzedBasicType) and variant.kind == "TimeDelta":
-         if isinstance(v, Mapping) and "secs" in v and "nanos" in v:
-             secs = int(v["secs"])  # type: ignore[index]
-             nanos = int(v["nanos"])  # type: ignore[index]
-             return datetime.timedelta(seconds=secs, microseconds=nanos / 1_000)
-         return v
-
-     # List, NDArray (Vector-ish), or general sequences
-     if isinstance(variant, AnalyzedListType):
-         elem_type = variant.elem_type if variant.elem_type else Any
-         if type_info.base_type is np.ndarray:
-             # Reconstruct NDArray with appropriate dtype if available
-             try:
-                 dtype = extract_ndarray_elem_dtype(type_info.core_type)
-             except (TypeError, ValueError, AttributeError):
-                 dtype = None
-             return np.array(v, dtype=dtype)
-         # Regular Python list
-         return [load_engine_object(elem_type, item) for item in v]
-
-     # Dict / Mapping
-     if isinstance(variant, AnalyzedDictType):
-         key_t = variant.key_type
-         val_t = variant.value_type
-         return {
-             load_engine_object(key_t, k): load_engine_object(val_t, val)
-             for k, val in v.items()
-         }
-
-     # Structs (dataclass or NamedTuple)
-     if isinstance(variant, AnalyzedStructType):
-         struct_type = variant.struct_type
-         if dataclasses.is_dataclass(struct_type):
-             if not isinstance(v, Mapping):
-                 raise ValueError(f"Expected dict for dataclass, got {type(v)}")
-             # Drop auxiliary discriminator "kind" if present
-             dc_init_kwargs: dict[str, Any] = {}
-             field_types = {f.name: f.type for f in dataclasses.fields(struct_type)}
-             for name, f_type in field_types.items():
-                 if name in v:
-                     dc_init_kwargs[name] = load_engine_object(f_type, v[name])
-             return struct_type(**dc_init_kwargs)
-         elif is_namedtuple_type(struct_type):
-             if not isinstance(v, Mapping):
-                 raise ValueError(f"Expected dict for NamedTuple, got {type(v)}")
-             # Dict format (from dump/load functions)
-             annotations = getattr(struct_type, "__annotations__", {})
-             field_names = list(getattr(struct_type, "_fields", ()))
-             nt_init_kwargs: dict[str, Any] = {}
-             for name in field_names:
-                 f_type = annotations.get(name, Any)
-                 if name in v:
-                     nt_init_kwargs[name] = load_engine_object(f_type, v[name])
-             return struct_type(**nt_init_kwargs)
-         return v
-
-     # Union with discriminator support via "kind"
-     if isinstance(variant, AnalyzedUnionType):
-         if isinstance(v, Mapping) and "kind" in v:
-             discriminator = v["kind"]
-             for typ in variant.variant_types:
-                 t_info = analyze_type_info(typ)
-                 if isinstance(t_info.variant, AnalyzedStructType):
-                     t_struct = t_info.variant.struct_type
-                     candidate_kind = getattr(t_struct, "kind", None)
-                     if candidate_kind == discriminator:
-                         # Remove discriminator for constructor
-                         v_wo_kind = dict(v)
-                         v_wo_kind.pop("kind", None)
-                         return load_engine_object(t_struct, v_wo_kind)
-         # Fallback: try each variant until one succeeds
-         for typ in variant.variant_types:
-             try:
-                 return load_engine_object(typ, v)
-             except (TypeError, ValueError):
-                 continue
-         return v
-
-     # Basic types and everything else: handle numpy scalars and passthrough
-     if isinstance(v, np.ndarray) and type_info.base_type is list:
-         return v.tolist()
-     if isinstance(v, (list, tuple)) and type_info.base_type not in (list, tuple):
-         # If a non-sequence basic type expected, attempt direct cast
-         try:
-             return type_info.core_type(v)
-         except (TypeError, ValueError):
-             return v
-     return v
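
With `dump_engine_object` and `load_engine_object` moved out of this module (flow.py below now imports `dump_engine_object` from `.engine_object`), the substantive change here is Pydantic support: model values are encoded field-by-field in `model_fields` order and decoded back via keyword-argument construction. A minimal round-trip sketch, grounded in the hunks above (the `Chunk` model is an assumed example; the `cocoindex.engine_value` module path is inferred from flow.py's new imports):

    import pydantic
    from cocoindex.engine_value import make_engine_value_encoder
    from cocoindex.typing import analyze_type_info

    class Chunk(pydantic.BaseModel):
        text: str
        score: float = 0.0

    # encode_pydantic emits one encoded value per field, in model_fields order
    encoder = make_engine_value_encoder(analyze_type_info(Chunk))
    assert encoder(Chunk(text="hi")) == ["hi", 0.0]
    # Decoding reverses this: the struct decoder calls Chunk(text=..., score=...)
    # with keyword arguments, since Pydantic models reject positional construction.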
cocoindex/flow.py CHANGED
@@ -17,7 +17,6 @@ from typing import (
      Callable,
      Generic,
      Iterable,
-     NamedTuple,
      Sequence,
      TypeVar,
      cast,
@@ -32,8 +31,8 @@ from . import _engine # type: ignore
  from . import index
  from . import op
  from . import setting
- from .convert import (
-     dump_engine_object,
+ from .engine_object import dump_engine_object
+ from .engine_value import (
      make_engine_value_decoder,
      make_engine_value_encoder,
  )
@@ -406,6 +405,7 @@ class DataCollector:
          /,
          *,
          primary_key_fields: Sequence[str],
+         attachments: Sequence[op.TargetAttachmentSpec] = (),
          vector_indexes: Sequence[index.VectorIndexDef] = (),
          vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
          setup_by_user: bool = False,
@@ -437,6 +437,10 @@ class DataCollector:
              target_name,
              _spec_kind(target_spec),
              dump_engine_object(target_spec),
+             [
+                 {"kind": _spec_kind(att), **dump_engine_object(att)}
+                 for att in attachments
+             ],
              dump_engine_object(index_options),
              self._engine_data_collector,
              setup_by_user,
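
The new `attachments` argument is serialized next to the target spec, each attachment tagged with its `kind` discriminator as shown above. A hedged usage sketch (the `Postgres` target and the attachment value are placeholders, not names this diff confirms):

    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),   # assumed target spec
        primary_key_fields=["id"],
        attachments=[my_attachment],    # any op.TargetAttachmentSpec subclass
    )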
@@ -575,7 +579,8 @@ class FlowLiveUpdaterOptions:
      print_stats: bool = False


- class FlowUpdaterStatusUpdates(NamedTuple):
+ @dataclass
+ class FlowUpdaterStatusUpdates:
      """
      Status updates for a flow updater.
      """
@@ -1060,12 +1065,14 @@ def _get_data_slice_annotation_type(
  _transform_flow_name_builder = _NameBuilder()


- class TransformFlowInfo(NamedTuple):
+ @dataclass
+ class TransformFlowInfo(Generic[T]):
      engine_flow: _engine.TransientFlow
      result_decoder: Callable[[Any], T]


- class FlowArgInfo(NamedTuple):
+ @dataclass
+ class FlowArgInfo:
      name: str
      type_hint: Any
      encoder: Callable[[Any], Any]
@@ -1081,7 +1088,7 @@ class TransformFlow(Generic[T]):
      _args_info: list[FlowArgInfo]

      _lazy_lock: asyncio.Lock
-     _lazy_flow_info: TransformFlowInfo | None = None
+     _lazy_flow_info: TransformFlowInfo[T] | None = None

      def __init__(
          self,
@@ -1123,12 +1130,12 @@ class TransformFlow(Generic[T]):
          return self._flow_fn(*args, **kwargs)

      @property
-     def _flow_info(self) -> TransformFlowInfo:
+     def _flow_info(self) -> TransformFlowInfo[T]:
          if self._lazy_flow_info is not None:
              return self._lazy_flow_info
          return execution_context.run(self._flow_info_async())

-     async def _flow_info_async(self) -> TransformFlowInfo:
+     async def _flow_info_async(self) -> TransformFlowInfo[T]:
          if self._lazy_flow_info is not None:
              return self._lazy_flow_info
          async with self._lazy_lock:
@@ -1136,7 +1143,7 @@ class TransformFlow(Generic[T]):
              self._lazy_flow_info = await self._build_flow_info_async()
          return self._lazy_flow_info

-     async def _build_flow_info_async(self) -> TransformFlowInfo:
+     async def _build_flow_info_async(self) -> TransformFlowInfo[T]:
          flow_builder_state = _FlowBuilderState(self._flow_name)
          kwargs: dict[str, DataSlice[T]] = {}
          for arg_info in self._args_info:
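
The `NamedTuple` → `@dataclass` switch for `TransformFlowInfo` pairs with making it `Generic[T]`, so the result decoder's return type is now carried through the annotations above. A one-line illustration (hypothetical variable):

    info: TransformFlowInfo[str]  # result_decoder is typed Callable[[Any], str]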
@@ -0,0 +1,45 @@
+ """Functions module for cocoindex.
+
+ This module provides various function specifications and executors for data processing,
+ including embedding functions, text processing, and multimodal operations.
+ """
+
+ # Import all engine builtin function specs
+ from ._engine_builtin_specs import (
+     ParseJson,
+     SplitRecursively,
+     SplitBySeparators,
+     EmbedText,
+     ExtractByLlm,
+ )
+
+ # Import SentenceTransformer embedding functionality
+ from .sbert import (
+     SentenceTransformerEmbed,
+     SentenceTransformerEmbedExecutor,
+ )
+
+ # Import ColPali multimodal embedding functionality
+ from .colpali import (
+     ColPaliEmbedImage,
+     ColPaliEmbedImageExecutor,
+     ColPaliEmbedQuery,
+     ColPaliEmbedQueryExecutor,
+ )
+
+ __all__ = [
+     # Engine builtin specs
+     "ParseJson",
+     "SplitRecursively",
+     "SplitBySeparators",
+     "EmbedText",
+     "ExtractByLlm",
+     # SentenceTransformer
+     "SentenceTransformerEmbed",
+     "SentenceTransformerEmbedExecutor",
+     # ColPali
+     "ColPaliEmbedImage",
+     "ColPaliEmbedImageExecutor",
+     "ColPaliEmbedQuery",
+     "ColPaliEmbedQueryExecutor",
+ ]
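
These re-exports keep the public import surface flat; assuming this file is the new package `__init__` (the diff view does not show its path), user code imports the specs as before:

    from cocoindex.functions import SplitRecursively, SentenceTransformerEmbed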
@@ -0,0 +1,62 @@
+ """All builtin function specs."""
+
+ import dataclasses
+ from typing import Literal
+
+ from .. import llm, op
+
+
+ class ParseJson(op.FunctionSpec):
+     """Parse a text into a JSON object."""
+
+
+ @dataclasses.dataclass
+ class CustomLanguageSpec:
+     """Custom language specification."""
+
+     language_name: str
+     separators_regex: list[str]
+     aliases: list[str] = dataclasses.field(default_factory=list)
+
+
+ class SplitRecursively(op.FunctionSpec):
+     """Split a document (in string) recursively."""
+
+     custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
+
+
+ class SplitBySeparators(op.FunctionSpec):
+     """
+     Split text by specified regex separators only.
+     Output schema matches SplitRecursively for drop-in compatibility:
+     KTable rows with fields: location (Range), text (Str), start, end.
+     Args:
+         separators_regex: list[str]  # e.g., [r"\\n\\n+"]
+         keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+         include_empty: bool = False
+         trim: bool = True
+     """
+
+     separators_regex: list[str] = dataclasses.field(default_factory=list)
+     keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+     include_empty: bool = False
+     trim: bool = True
+
+
+ class EmbedText(op.FunctionSpec):
+     """Embed a text into a vector space."""
+
+     api_type: llm.LlmApiType
+     model: str
+     address: str | None = None
+     output_dimension: int | None = None
+     task_type: str | None = None
+     api_config: llm.VertexAiConfig | None = None
+
+
+ class ExtractByLlm(op.FunctionSpec):
+     """Extract information from a text using a LLM."""
+
+     llm_spec: llm.LlmSpec
+     output_type: type
+     instruction: str | None = None
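
A hedged sketch of how one of these specs is applied in a flow; the `doc["text"]` slice and the surrounding flow definition are assumed context, while the spec fields come directly from the class above:

    chunks = doc["text"].transform(
        cocoindex.functions.SplitBySeparators(
            separators_regex=[r"\n\n+"],  # e.g. split on blank lines
            keep_separator="NONE",
            trim=True,
        )
    )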