cocoindex-0.2.16-cp311-abi3-manylinux_2_28_aarch64.whl → cocoindex-0.2.18-cp311-abi3-manylinux_2_28_aarch64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,18 +1,15 @@
  """
- Utilities to convert between Python and engine values.
+ Utilities to encode/decode values in cocoindex (for data).
  """
 
  from __future__ import annotations
 
  import dataclasses
- import datetime
  import inspect
  import warnings
- from enum import Enum
- from typing import Any, Callable, Mapping, get_origin, TypeVar, overload
+ from typing import Any, Callable, Mapping, TypeVar
 
  import numpy as np
-
  from .typing import (
      AnalyzedAnyType,
      AnalyzedBasicType,
@@ -22,18 +19,17 @@ from .typing import (
      AnalyzedTypeInfo,
      AnalyzedUnionType,
      AnalyzedUnknownType,
-     EnrichedValueType,
      analyze_type_info,
-     encode_enriched_type,
      is_namedtuple_type,
+     is_pydantic_model,
      is_numpy_number_type,
-     extract_ndarray_elem_dtype,
      ValueType,
      FieldSchema,
      BasicValueType,
      StructType,
      TableType,
  )
+ from .engine_object import get_auto_default_for_type
 
 
  T = TypeVar("T")
@@ -167,6 +163,29 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
 
          return encode_namedtuple
 
+         elif is_pydantic_model(struct_type):
+             # Type guard: ensure we have model_fields attribute
+             if hasattr(struct_type, "model_fields"):
+                 field_names = list(struct_type.model_fields.keys())  # type: ignore[attr-defined]
+                 field_encoders = [
+                     make_engine_value_encoder(
+                         analyze_type_info(struct_type.model_fields[name].annotation)  # type: ignore[attr-defined]
+                     )
+                     for name in field_names
+                 ]
+             else:
+                 raise ValueError(f"Invalid Pydantic model: {struct_type}")
+
+             def encode_pydantic(value: Any) -> Any:
+                 if value is None:
+                     return None
+                 return [
+                     encoder(getattr(value, name))
+                     for encoder, name in zip(field_encoders, field_names)
+                 ]
+
+             return encode_pydantic
+
      def encode_basic_value(value: Any) -> Any:
          if isinstance(value, np.number):
              return value.item()
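
For context, a minimal sketch of what this new encoder branch does for a Pydantic model; the `Point` model below is hypothetical, not from the package:

    from pydantic import BaseModel

    class Point(BaseModel):
        x: int
        y: int

    # Per the branch above, fields are encoded positionally, in
    # model_fields declaration order, and None passes through:
    #   encode_pydantic(Point(x=1, y=2))  ->  [1, 2]
    #   encode_pydantic(None)             ->  None
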
@@ -420,30 +439,6 @@ def make_engine_value_decoder(
      return lambda value: value
 
 
- def _get_auto_default_for_type(
-     type_info: AnalyzedTypeInfo,
- ) -> tuple[Any, bool]:
-     """
-     Get an auto-default value for a type annotation if it's safe to do so.
-
-     Returns:
-         A tuple of (default_value, is_supported) where:
-         - default_value: The default value if auto-defaulting is supported
-         - is_supported: True if auto-defaulting is supported for this type
-     """
-     # Case 1: Nullable types (Optional[T] or T | None)
-     if type_info.nullable:
-         return None, True
-
-     # Case 2: Table types (KTable or LTable) - check if it's a list or dict type
-     if isinstance(type_info.variant, AnalyzedListType):
-         return [], True
-     elif isinstance(type_info.variant, AnalyzedDictType):
-         return {}, True
-
-     return None, False
-
-
  def make_engine_struct_decoder(
      field_path: list[str],
      src_fields: list[FieldSchema],
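
The removed helper lives on as `get_auto_default_for_type` (note the new `from .engine_object import ...` at the top of this file). A sketch of its behavior, assuming the relocated function keeps the logic shown above and that the internal modules are importable as named in the diff:

    from typing import Optional
    from cocoindex.engine_object import get_auto_default_for_type
    from cocoindex.typing import analyze_type_info

    # Per the three cases in the removed body:
    assert get_auto_default_for_type(analyze_type_info(Optional[str])) == (None, True)
    assert get_auto_default_for_type(analyze_type_info(list[int])) == ([], True)
    assert get_auto_default_for_type(analyze_type_info(dict[str, int])) == ({}, True)
    assert get_auto_default_for_type(analyze_type_info(int)) == (None, False)
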
@@ -472,7 +467,7 @@ def make_engine_struct_decoder(
      if not isinstance(dst_type_variant, AnalyzedStructType):
          raise ValueError(
              f"Type mismatch for `{''.join(field_path)}`: "
-             f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple or dict[str, Any] expected"
+             f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple, Pydantic model or dict[str, Any] expected"
          )
 
      src_name_to_idx = {f.name: i for i, f in enumerate(src_fields)}
@@ -495,6 +490,26 @@ def make_engine_struct_decoder(
              )
              for name in fields
          }
+     elif is_pydantic_model(dst_struct_type):
+         # For Pydantic models, we can use model_fields to get field information
+         parameters = {}
+         # Type guard: ensure we have model_fields attribute
+         if hasattr(dst_struct_type, "model_fields"):
+             model_fields = dst_struct_type.model_fields  # type: ignore[attr-defined]
+         else:
+             model_fields = {}
+         for name, field_info in model_fields.items():
+             default_value = (
+                 field_info.default
+                 if field_info.default is not ...
+                 else inspect.Parameter.empty
+             )
+             parameters[name] = inspect.Parameter(
+                 name=name,
+                 kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                 default=default_value,
+                 annotation=field_info.annotation,
+             )
      else:
          raise ValueError(f"Unsupported struct type: {dst_struct_type}")
@@ -518,7 +533,7 @@ def make_engine_struct_decoder(
          if default_value is not inspect.Parameter.empty:
              return lambda _: default_value
 
-         auto_default, is_supported = _get_auto_default_for_type(type_info)
+         auto_default, is_supported = get_auto_default_for_type(type_info)
          if is_supported:
              warnings.warn(
                  f"Field '{name}' (type {param.annotation}) without default value is missing in input: "
@@ -536,9 +551,21 @@ def make_engine_struct_decoder(
          make_closure_for_field(name, param) for (name, param) in parameters.items()
      ]
 
-     return lambda values: dst_struct_type(
-         *(decoder(values) for decoder in field_value_decoder)
-     )
+     # Different construction for different struct types
+     if is_pydantic_model(dst_struct_type):
+         # Pydantic models prefer keyword arguments
+         field_names = list(parameters.keys())
+         return lambda values: dst_struct_type(
+             **{
+                 field_names[i]: decoder(values)
+                 for i, decoder in enumerate(field_value_decoder)
+             }
+         )
+     else:
+         # Dataclasses and NamedTuples can use positional arguments
+         return lambda values: dst_struct_type(
+             *(decoder(values) for decoder in field_value_decoder)
+         )
 
 
  def _make_engine_struct_to_dict_decoder(
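
The split between keyword and positional construction reflects a real Pydantic v2 constraint: `BaseModel.__init__` accepts fields as keyword arguments only, while dataclasses and NamedTuples take positional arguments in field order. A standalone illustration (the types below are hypothetical):

    import dataclasses
    from pydantic import BaseModel

    @dataclasses.dataclass
    class DC:
        x: int
        y: int

    class PM(BaseModel):
        x: int
        y: int

    DC(1, 2)      # OK: positional construction
    PM(x=1, y=2)  # OK: keyword construction
    # PM(1, 2)    # TypeError in Pydantic v2: no positional field arguments
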
@@ -606,174 +633,3 @@ def _make_engine_struct_to_tuple_decoder(
      )
 
      return decode_to_tuple
-
-
- def dump_engine_object(v: Any) -> Any:
-     """Recursively dump an object for engine. Engine side uses `Pythonized` to catch."""
-     if v is None:
-         return None
-     elif isinstance(v, EnrichedValueType):
-         return v.encode()
-     elif isinstance(v, FieldSchema):
-         return v.encode()
-     elif isinstance(v, type) or get_origin(v) is not None:
-         return encode_enriched_type(v)
-     elif isinstance(v, Enum):
-         return v.value
-     elif isinstance(v, datetime.timedelta):
-         total_secs = v.total_seconds()
-         secs = int(total_secs)
-         nanos = int((total_secs - secs) * 1e9)
-         return {"secs": secs, "nanos": nanos}
-     elif is_namedtuple_type(type(v)):
-         # Handle NamedTuple objects specifically to use dict format
-         field_names = list(getattr(type(v), "_fields", ()))
-         result = {}
-         for name in field_names:
-             val = getattr(v, name)
-             result[name] = dump_engine_object(val)  # Include all values, including None
-         if hasattr(v, "kind") and "kind" not in result:
-             result["kind"] = v.kind
-         return result
-     elif hasattr(v, "__dict__"):  # for dataclass-like objects
-         s = {}
-         for k, val in v.__dict__.items():
-             if val is None:
-                 # Skip None values
-                 continue
-             s[k] = dump_engine_object(val)
-         if hasattr(v, "kind") and "kind" not in s:
-             s["kind"] = v.kind
-         return s
-     elif isinstance(v, (list, tuple)):
-         return [dump_engine_object(item) for item in v]
-     elif isinstance(v, np.ndarray):
-         return v.tolist()
-     elif isinstance(v, dict):
-         return {k: dump_engine_object(v) for k, v in v.items()}
-     return v
-
-
- @overload
- def load_engine_object(expected_type: type[T], v: Any) -> T: ...
- @overload
- def load_engine_object(expected_type: Any, v: Any) -> Any: ...
- def load_engine_object(expected_type: Any, v: Any) -> Any:
-     """Recursively load an object that was produced by dump_engine_object().
-
-     Args:
-         expected_type: The Python type annotation to reconstruct to.
-         v: The engine-facing Pythonized object (e.g., dict/list/primitive) to convert.
-
-     Returns:
-         A Python object matching the expected_type where possible.
-     """
-     # Fast path
-     if v is None:
-         return None
-
-     type_info = analyze_type_info(expected_type)
-     variant = type_info.variant
-
-     if type_info.core_type is EnrichedValueType:
-         return EnrichedValueType.decode(v)
-     if type_info.core_type is FieldSchema:
-         return FieldSchema.decode(v)
-
-     # Any or unknown → return as-is
-     if isinstance(variant, AnalyzedAnyType) or type_info.base_type is Any:
-         return v
-
-     # Enum handling
-     if isinstance(expected_type, type) and issubclass(expected_type, Enum):
-         return expected_type(v)
-
-     # TimeDelta special form {secs, nanos}
-     if isinstance(variant, AnalyzedBasicType) and variant.kind == "TimeDelta":
-         if isinstance(v, Mapping) and "secs" in v and "nanos" in v:
-             secs = int(v["secs"])  # type: ignore[index]
-             nanos = int(v["nanos"])  # type: ignore[index]
-             return datetime.timedelta(seconds=secs, microseconds=nanos / 1_000)
-         return v
-
-     # List, NDArray (Vector-ish), or general sequences
-     if isinstance(variant, AnalyzedListType):
-         elem_type = variant.elem_type if variant.elem_type else Any
-         if type_info.base_type is np.ndarray:
-             # Reconstruct NDArray with appropriate dtype if available
-             try:
-                 dtype = extract_ndarray_elem_dtype(type_info.core_type)
-             except (TypeError, ValueError, AttributeError):
-                 dtype = None
-             return np.array(v, dtype=dtype)
-         # Regular Python list
-         return [load_engine_object(elem_type, item) for item in v]
-
-     # Dict / Mapping
-     if isinstance(variant, AnalyzedDictType):
-         key_t = variant.key_type
-         val_t = variant.value_type
-         return {
-             load_engine_object(key_t, k): load_engine_object(val_t, val)
-             for k, val in v.items()
-         }
-
-     # Structs (dataclass or NamedTuple)
-     if isinstance(variant, AnalyzedStructType):
-         struct_type = variant.struct_type
-         if dataclasses.is_dataclass(struct_type):
-             if not isinstance(v, Mapping):
-                 raise ValueError(f"Expected dict for dataclass, got {type(v)}")
-             # Drop auxiliary discriminator "kind" if present
-             dc_init_kwargs: dict[str, Any] = {}
-             field_types = {f.name: f.type for f in dataclasses.fields(struct_type)}
-             for name, f_type in field_types.items():
-                 if name in v:
-                     dc_init_kwargs[name] = load_engine_object(f_type, v[name])
-             return struct_type(**dc_init_kwargs)
-         elif is_namedtuple_type(struct_type):
-             if not isinstance(v, Mapping):
-                 raise ValueError(f"Expected dict for NamedTuple, got {type(v)}")
-             # Dict format (from dump/load functions)
-             annotations = getattr(struct_type, "__annotations__", {})
-             field_names = list(getattr(struct_type, "_fields", ()))
-             nt_init_kwargs: dict[str, Any] = {}
-             for name in field_names:
-                 f_type = annotations.get(name, Any)
-                 if name in v:
-                     nt_init_kwargs[name] = load_engine_object(f_type, v[name])
-             return struct_type(**nt_init_kwargs)
-         return v
-
-     # Union with discriminator support via "kind"
-     if isinstance(variant, AnalyzedUnionType):
-         if isinstance(v, Mapping) and "kind" in v:
-             discriminator = v["kind"]
-             for typ in variant.variant_types:
-                 t_info = analyze_type_info(typ)
-                 if isinstance(t_info.variant, AnalyzedStructType):
-                     t_struct = t_info.variant.struct_type
-                     candidate_kind = getattr(t_struct, "kind", None)
-                     if candidate_kind == discriminator:
-                         # Remove discriminator for constructor
-                         v_wo_kind = dict(v)
-                         v_wo_kind.pop("kind", None)
-                         return load_engine_object(t_struct, v_wo_kind)
-         # Fallback: try each variant until one succeeds
-         for typ in variant.variant_types:
-             try:
-                 return load_engine_object(typ, v)
-             except (TypeError, ValueError):
-                 continue
-         return v
-
-     # Basic types and everything else: handle numpy scalars and passthrough
-     if isinstance(v, np.ndarray) and type_info.base_type is list:
-         return v.tolist()
-     if isinstance(v, (list, tuple)) and type_info.base_type not in (list, tuple):
-         # If a non-sequence basic type expected, attempt direct cast
-         try:
-             return type_info.core_type(v)
-         except (TypeError, ValueError):
-             return v
-     return v
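
These two helpers are removed here and, per the `flow.py` change below, are now imported from `cocoindex.engine_object`. A minimal round-trip sketch, assuming both moved there with the behavior shown above:

    import datetime
    from cocoindex.engine_object import dump_engine_object, load_engine_object

    td = datetime.timedelta(seconds=1, microseconds=500)
    dumped = dump_engine_object(td)  # {"secs": 1, "nanos": 500000}, per the timedelta branch
    assert load_engine_object(datetime.timedelta, dumped) == td
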
cocoindex/flow.py CHANGED
@@ -31,8 +31,8 @@ from . import _engine  # type: ignore
  from . import index
  from . import op
  from . import setting
- from .convert import (
-     dump_engine_object,
+ from .engine_object import dump_engine_object
+ from .engine_value import (
      make_engine_value_decoder,
      make_engine_value_encoder,
  )
@@ -405,6 +405,7 @@ class DataCollector:
          /,
          *,
          primary_key_fields: Sequence[str],
+         attachments: Sequence[op.TargetAttachmentSpec] = (),
          vector_indexes: Sequence[index.VectorIndexDef] = (),
          vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
          setup_by_user: bool = False,
@@ -436,6 +437,10 @@ class DataCollector:
              target_name,
              _spec_kind(target_spec),
              dump_engine_object(target_spec),
+             [
+                 {"kind": _spec_kind(att), **dump_engine_object(att)}
+                 for att in attachments
+             ],
              dump_engine_object(index_options),
              self._engine_data_collector,
              setup_by_user,
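
Taken together, these two hunks add an `attachments` parameter to `DataCollector.export`; each attachment is serialized alongside its spec kind. A hypothetical call (the target and attachment spec names below are placeholders, not from the diff):

    collector.export(
        "doc_embeddings",
        SomeTargetSpec(...),                    # placeholder target spec
        primary_key_fields=["id"],
        attachments=[SomeAttachmentSpec(...)],  # each dumped as {"kind": ..., **spec}
    )
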
@@ -0,0 +1,40 @@
+ """Functions module for cocoindex.
+
+ This module provides various function specifications and executors for data processing,
+ including embedding functions, text processing, and multimodal operations.
+ """
+
+ # Import all engine builtin function specs
+ from ._engine_builtin_specs import *
+
+ # Import SentenceTransformer embedding functionality
+ from .sbert import (
+     SentenceTransformerEmbed,
+     SentenceTransformerEmbedExecutor,
+ )
+
+ # Import ColPali multimodal embedding functionality
+ from .colpali import (
+     ColPaliEmbedImage,
+     ColPaliEmbedImageExecutor,
+     ColPaliEmbedQuery,
+     ColPaliEmbedQueryExecutor,
+ )
+
+ __all__ = [
+     # Engine builtin specs
+     "DetectProgrammingLanguage",
+     "EmbedText",
+     "ExtractByLlm",
+     "ParseJson",
+     "SplitBySeparators",
+     "SplitRecursively",
+     # SentenceTransformer
+     "SentenceTransformerEmbed",
+     "SentenceTransformerEmbedExecutor",
+     # ColPali
+     "ColPaliEmbedImage",
+     "ColPaliEmbedImageExecutor",
+     "ColPaliEmbedQuery",
+     "ColPaliEmbedQueryExecutor",
+ ]
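
Assuming this new file is the `cocoindex/functions/__init__.py` package initializer (the diff omits file headers for the added files), the re-exports above should make imports like the following resolve:

    from cocoindex.functions import SplitRecursively, SentenceTransformerEmbed
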
@@ -0,0 +1,66 @@
+ """All builtin function specs."""
+
+ import dataclasses
+ from typing import Literal
+
+ from .. import llm, op
+
+
+ class ParseJson(op.FunctionSpec):
+     """Parse a text into a JSON object."""
+
+
+ @dataclasses.dataclass
+ class CustomLanguageSpec:
+     """Custom language specification."""
+
+     language_name: str
+     separators_regex: list[str]
+     aliases: list[str] = dataclasses.field(default_factory=list)
+
+
+ class DetectProgrammingLanguage(op.FunctionSpec):
+     """Detect the programming language of a file."""
+
+
+ class SplitRecursively(op.FunctionSpec):
+     """Split a document (in string) recursively."""
+
+     custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
+
+
+ class SplitBySeparators(op.FunctionSpec):
+     """
+     Split text by specified regex separators only.
+     Output schema matches SplitRecursively for drop-in compatibility:
+     KTable rows with fields: location (Range), text (Str), start, end.
+     Args:
+         separators_regex: list[str]  # e.g., [r"\n\n+"]
+         keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+         include_empty: bool = False
+         trim: bool = True
+     """
+
+     separators_regex: list[str] = dataclasses.field(default_factory=list)
+     keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+     include_empty: bool = False
+     trim: bool = True
+
+
+ class EmbedText(op.FunctionSpec):
+     """Embed a text into a vector space."""
+
+     api_type: llm.LlmApiType
+     model: str
+     address: str | None = None
+     output_dimension: int | None = None
+     task_type: str | None = None
+     api_config: llm.VertexAiConfig | None = None
+
+
+ class ExtractByLlm(op.FunctionSpec):
+     """Extract information from a text using a LLM."""
+
+     llm_spec: llm.LlmSpec
+     output_type: type
+     instruction: str | None = None
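
A construction sketch for the new `SplitBySeparators` spec, following the defaults in its docstring; this assumes `op.FunctionSpec` subclasses accept their declared fields as keyword arguments:

    spec = SplitBySeparators(
        separators_regex=[r"\n\n+"],  # e.g., split on blank lines, per the docstring
        keep_separator="NONE",
        include_empty=False,
        trim=True,
    )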