cocoindex 0.2.15__cp311-abi3-macosx_11_0_arm64.whl → 0.2.17__cp311-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,15 @@
  """
- Utilities to convert between Python and engine values.
+ Utilities to encode/decode values in cocoindex (for data).
  """

  from __future__ import annotations

  import dataclasses
- import datetime
  import inspect
  import warnings
- from enum import Enum
- from typing import Any, Callable, Mapping, get_origin, TypeVar, overload
+ from typing import Any, Callable, Mapping, TypeVar

  import numpy as np
-
  from .typing import (
      AnalyzedAnyType,
      AnalyzedBasicType,
@@ -22,18 +19,17 @@ from .typing import (
      AnalyzedTypeInfo,
      AnalyzedUnionType,
      AnalyzedUnknownType,
-     EnrichedValueType,
      analyze_type_info,
-     encode_enriched_type,
      is_namedtuple_type,
+     is_pydantic_model,
      is_numpy_number_type,
-     extract_ndarray_elem_dtype,
      ValueType,
      FieldSchema,
      BasicValueType,
      StructType,
      TableType,
  )
+ from .engine_object import get_auto_default_for_type


  T = TypeVar("T")
@@ -167,6 +163,29 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:

          return encode_namedtuple

+     elif is_pydantic_model(struct_type):
+         # Type guard: ensure we have model_fields attribute
+         if hasattr(struct_type, "model_fields"):
+             field_names = list(struct_type.model_fields.keys())  # type: ignore[attr-defined]
+             field_encoders = [
+                 make_engine_value_encoder(
+                     analyze_type_info(struct_type.model_fields[name].annotation)  # type: ignore[attr-defined]
+                 )
+                 for name in field_names
+             ]
+         else:
+             raise ValueError(f"Invalid Pydantic model: {struct_type}")
+
+         def encode_pydantic(value: Any) -> Any:
+             if value is None:
+                 return None
+             return [
+                 encoder(getattr(value, name))
+                 for encoder, name in zip(field_encoders, field_names)
+             ]
+
+         return encode_pydantic
+
  def encode_basic_value(value: Any) -> Any:
      if isinstance(value, np.number):
          return value.item()
@@ -420,30 +439,6 @@ def make_engine_value_decoder(
      return lambda value: value


- def _get_auto_default_for_type(
-     type_info: AnalyzedTypeInfo,
- ) -> tuple[Any, bool]:
-     """
-     Get an auto-default value for a type annotation if it's safe to do so.
-
-     Returns:
-         A tuple of (default_value, is_supported) where:
-         - default_value: The default value if auto-defaulting is supported
-         - is_supported: True if auto-defaulting is supported for this type
-     """
-     # Case 1: Nullable types (Optional[T] or T | None)
-     if type_info.nullable:
-         return None, True
-
-     # Case 2: Table types (KTable or LTable) - check if it's a list or dict type
-     if isinstance(type_info.variant, AnalyzedListType):
-         return [], True
-     elif isinstance(type_info.variant, AnalyzedDictType):
-         return {}, True
-
-     return None, False
-
-
  def make_engine_struct_decoder(
      field_path: list[str],
      src_fields: list[FieldSchema],
@@ -472,7 +467,7 @@ def make_engine_struct_decoder(
      if not isinstance(dst_type_variant, AnalyzedStructType):
          raise ValueError(
              f"Type mismatch for `{''.join(field_path)}`: "
-             f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple or dict[str, Any] expected"
+             f"declared `{dst_type_info.core_type}`, a dataclass, NamedTuple, Pydantic model or dict[str, Any] expected"
          )

      src_name_to_idx = {f.name: i for i, f in enumerate(src_fields)}
@@ -495,6 +490,26 @@ def make_engine_struct_decoder(
              )
              for name in fields
          }
+     elif is_pydantic_model(dst_struct_type):
+         # For Pydantic models, we can use model_fields to get field information
+         parameters = {}
+         # Type guard: ensure we have model_fields attribute
+         if hasattr(dst_struct_type, "model_fields"):
+             model_fields = dst_struct_type.model_fields  # type: ignore[attr-defined]
+         else:
+             model_fields = {}
+         for name, field_info in model_fields.items():
+             default_value = (
+                 field_info.default
+                 if field_info.default is not ...
+                 else inspect.Parameter.empty
+             )
+             parameters[name] = inspect.Parameter(
+                 name=name,
+                 kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                 default=default_value,
+                 annotation=field_info.annotation,
+             )
      else:
          raise ValueError(f"Unsupported struct type: {dst_struct_type}")

@@ -518,7 +533,7 @@ def make_engine_struct_decoder(
          if default_value is not inspect.Parameter.empty:
              return lambda _: default_value

-         auto_default, is_supported = _get_auto_default_for_type(type_info)
+         auto_default, is_supported = get_auto_default_for_type(type_info)
          if is_supported:
              warnings.warn(
                  f"Field '{name}' (type {param.annotation}) without default value is missing in input: "
@@ -536,9 +551,21 @@ def make_engine_struct_decoder(
          make_closure_for_field(name, param) for (name, param) in parameters.items()
      ]

-     return lambda values: dst_struct_type(
-         *(decoder(values) for decoder in field_value_decoder)
-     )
+     # Different construction for different struct types
+     if is_pydantic_model(dst_struct_type):
+         # Pydantic models prefer keyword arguments
+         field_names = list(parameters.keys())
+         return lambda values: dst_struct_type(
+             **{
+                 field_names[i]: decoder(values)
+                 for i, decoder in enumerate(field_value_decoder)
+             }
+         )
+     else:
+         # Dataclasses and NamedTuples can use positional arguments
+         return lambda values: dst_struct_type(
+             *(decoder(values) for decoder in field_value_decoder)
+         )


  def _make_engine_struct_to_dict_decoder(
@@ -606,174 +633,3 @@ def _make_engine_struct_to_tuple_decoder(
      )

      return decode_to_tuple
-
-
- def dump_engine_object(v: Any) -> Any:
-     """Recursively dump an object for engine. Engine side uses `Pythonized` to catch."""
-     if v is None:
-         return None
-     elif isinstance(v, EnrichedValueType):
-         return v.encode()
-     elif isinstance(v, FieldSchema):
-         return v.encode()
-     elif isinstance(v, type) or get_origin(v) is not None:
-         return encode_enriched_type(v)
-     elif isinstance(v, Enum):
-         return v.value
-     elif isinstance(v, datetime.timedelta):
-         total_secs = v.total_seconds()
-         secs = int(total_secs)
-         nanos = int((total_secs - secs) * 1e9)
-         return {"secs": secs, "nanos": nanos}
-     elif is_namedtuple_type(type(v)):
-         # Handle NamedTuple objects specifically to use dict format
-         field_names = list(getattr(type(v), "_fields", ()))
-         result = {}
-         for name in field_names:
-             val = getattr(v, name)
-             result[name] = dump_engine_object(val)  # Include all values, including None
-         if hasattr(v, "kind") and "kind" not in result:
-             result["kind"] = v.kind
-         return result
-     elif hasattr(v, "__dict__"):  # for dataclass-like objects
-         s = {}
-         for k, val in v.__dict__.items():
-             if val is None:
-                 # Skip None values
-                 continue
-             s[k] = dump_engine_object(val)
-         if hasattr(v, "kind") and "kind" not in s:
-             s["kind"] = v.kind
-         return s
-     elif isinstance(v, (list, tuple)):
-         return [dump_engine_object(item) for item in v]
-     elif isinstance(v, np.ndarray):
-         return v.tolist()
-     elif isinstance(v, dict):
-         return {k: dump_engine_object(v) for k, v in v.items()}
-     return v
-
-
- @overload
- def load_engine_object(expected_type: type[T], v: Any) -> T: ...
- @overload
- def load_engine_object(expected_type: Any, v: Any) -> Any: ...
- def load_engine_object(expected_type: Any, v: Any) -> Any:
-     """Recursively load an object that was produced by dump_engine_object().
-
-     Args:
-         expected_type: The Python type annotation to reconstruct to.
-         v: The engine-facing Pythonized object (e.g., dict/list/primitive) to convert.
-
-     Returns:
-         A Python object matching the expected_type where possible.
-     """
-     # Fast path
-     if v is None:
-         return None
-
-     type_info = analyze_type_info(expected_type)
-     variant = type_info.variant
-
-     if type_info.core_type is EnrichedValueType:
-         return EnrichedValueType.decode(v)
-     if type_info.core_type is FieldSchema:
-         return FieldSchema.decode(v)
-
-     # Any or unknown → return as-is
-     if isinstance(variant, AnalyzedAnyType) or type_info.base_type is Any:
-         return v
-
-     # Enum handling
-     if isinstance(expected_type, type) and issubclass(expected_type, Enum):
-         return expected_type(v)
-
-     # TimeDelta special form {secs, nanos}
-     if isinstance(variant, AnalyzedBasicType) and variant.kind == "TimeDelta":
-         if isinstance(v, Mapping) and "secs" in v and "nanos" in v:
-             secs = int(v["secs"])  # type: ignore[index]
-             nanos = int(v["nanos"])  # type: ignore[index]
-             return datetime.timedelta(seconds=secs, microseconds=nanos / 1_000)
-         return v
-
-     # List, NDArray (Vector-ish), or general sequences
-     if isinstance(variant, AnalyzedListType):
-         elem_type = variant.elem_type if variant.elem_type else Any
-         if type_info.base_type is np.ndarray:
-             # Reconstruct NDArray with appropriate dtype if available
-             try:
-                 dtype = extract_ndarray_elem_dtype(type_info.core_type)
-             except (TypeError, ValueError, AttributeError):
-                 dtype = None
-             return np.array(v, dtype=dtype)
-         # Regular Python list
-         return [load_engine_object(elem_type, item) for item in v]
-
-     # Dict / Mapping
-     if isinstance(variant, AnalyzedDictType):
-         key_t = variant.key_type
-         val_t = variant.value_type
-         return {
-             load_engine_object(key_t, k): load_engine_object(val_t, val)
-             for k, val in v.items()
-         }
-
-     # Structs (dataclass or NamedTuple)
-     if isinstance(variant, AnalyzedStructType):
-         struct_type = variant.struct_type
-         if dataclasses.is_dataclass(struct_type):
-             if not isinstance(v, Mapping):
-                 raise ValueError(f"Expected dict for dataclass, got {type(v)}")
-             # Drop auxiliary discriminator "kind" if present
-             dc_init_kwargs: dict[str, Any] = {}
-             field_types = {f.name: f.type for f in dataclasses.fields(struct_type)}
-             for name, f_type in field_types.items():
-                 if name in v:
-                     dc_init_kwargs[name] = load_engine_object(f_type, v[name])
-             return struct_type(**dc_init_kwargs)
-         elif is_namedtuple_type(struct_type):
-             if not isinstance(v, Mapping):
-                 raise ValueError(f"Expected dict for NamedTuple, got {type(v)}")
-             # Dict format (from dump/load functions)
-             annotations = getattr(struct_type, "__annotations__", {})
-             field_names = list(getattr(struct_type, "_fields", ()))
-             nt_init_kwargs: dict[str, Any] = {}
-             for name in field_names:
-                 f_type = annotations.get(name, Any)
-                 if name in v:
-                     nt_init_kwargs[name] = load_engine_object(f_type, v[name])
-             return struct_type(**nt_init_kwargs)
-         return v
-
-     # Union with discriminator support via "kind"
-     if isinstance(variant, AnalyzedUnionType):
-         if isinstance(v, Mapping) and "kind" in v:
-             discriminator = v["kind"]
-             for typ in variant.variant_types:
-                 t_info = analyze_type_info(typ)
-                 if isinstance(t_info.variant, AnalyzedStructType):
-                     t_struct = t_info.variant.struct_type
-                     candidate_kind = getattr(t_struct, "kind", None)
-                     if candidate_kind == discriminator:
-                         # Remove discriminator for constructor
-                         v_wo_kind = dict(v)
-                         v_wo_kind.pop("kind", None)
-                         return load_engine_object(t_struct, v_wo_kind)
-         # Fallback: try each variant until one succeeds
-         for typ in variant.variant_types:
-             try:
-                 return load_engine_object(typ, v)
-             except (TypeError, ValueError):
-                 continue
-         return v
-
-     # Basic types and everything else: handle numpy scalars and passthrough
-     if isinstance(v, np.ndarray) and type_info.base_type is list:
-         return v.tolist()
-     if isinstance(v, (list, tuple)) and type_info.base_type not in (list, tuple):
-         # If a non-sequence basic type expected, attempt direct cast
-         try:
-             return type_info.core_type(v)
-         except (TypeError, ValueError):
-             return v
-     return v
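
With `dump_engine_object` and `load_engine_object` moved out of this module (flow.py below now imports `dump_engine_object` from `.engine_object`), the substantive change here is Pydantic support: model values are encoded field-by-field in `model_fields` order and decoded back via keyword-argument construction. A minimal round-trip sketch, grounded in the hunks above (the `Chunk` model is an assumed example; the `cocoindex.engine_value` module path is inferred from flow.py's new imports):

    import pydantic
    from cocoindex.engine_value import make_engine_value_encoder
    from cocoindex.typing import analyze_type_info

    class Chunk(pydantic.BaseModel):
        text: str
        score: float = 0.0

    # encode_pydantic emits one encoded value per field, in model_fields order
    encoder = make_engine_value_encoder(analyze_type_info(Chunk))
    assert encoder(Chunk(text="hi")) == ["hi", 0.0]
    # Decoding reverses this: the struct decoder calls Chunk(text=..., score=...)
    # with keyword arguments, since Pydantic models reject positional construction.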
cocoindex/flow.py CHANGED
@@ -17,7 +17,6 @@ from typing import (
      Callable,
      Generic,
      Iterable,
-     NamedTuple,
      Sequence,
      TypeVar,
      cast,
@@ -32,8 +31,8 @@ from . import _engine # type: ignore
  from . import index
  from . import op
  from . import setting
- from .convert import (
-     dump_engine_object,
+ from .engine_object import dump_engine_object
+ from .engine_value import (
      make_engine_value_decoder,
      make_engine_value_encoder,
  )
@@ -406,6 +405,7 @@ class DataCollector:
          /,
          *,
          primary_key_fields: Sequence[str],
+         attachments: Sequence[op.TargetAttachmentSpec] = (),
          vector_indexes: Sequence[index.VectorIndexDef] = (),
          vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
          setup_by_user: bool = False,
@@ -437,6 +437,10 @@ class DataCollector:
              target_name,
              _spec_kind(target_spec),
              dump_engine_object(target_spec),
+             [
+                 {"kind": _spec_kind(att), **dump_engine_object(att)}
+                 for att in attachments
+             ],
              dump_engine_object(index_options),
              self._engine_data_collector,
              setup_by_user,
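
The new `attachments` argument is serialized next to the target spec, each attachment tagged with its `kind` discriminator as shown above. A hedged usage sketch (the `Postgres` target and the attachment value are placeholders, not names this diff confirms):

    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),   # assumed target spec
        primary_key_fields=["id"],
        attachments=[my_attachment],    # any op.TargetAttachmentSpec subclass
    )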
@@ -575,7 +579,8 @@ class FlowLiveUpdaterOptions:
      print_stats: bool = False


- class FlowUpdaterStatusUpdates(NamedTuple):
+ @dataclass
+ class FlowUpdaterStatusUpdates:
      """
      Status updates for a flow updater.
      """
@@ -1060,12 +1065,14 @@ def _get_data_slice_annotation_type(
  _transform_flow_name_builder = _NameBuilder()


- class TransformFlowInfo(NamedTuple):
+ @dataclass
+ class TransformFlowInfo(Generic[T]):
      engine_flow: _engine.TransientFlow
      result_decoder: Callable[[Any], T]


- class FlowArgInfo(NamedTuple):
+ @dataclass
+ class FlowArgInfo:
      name: str
      type_hint: Any
      encoder: Callable[[Any], Any]
@@ -1081,7 +1088,7 @@ class TransformFlow(Generic[T]):
      _args_info: list[FlowArgInfo]

      _lazy_lock: asyncio.Lock
-     _lazy_flow_info: TransformFlowInfo | None = None
+     _lazy_flow_info: TransformFlowInfo[T] | None = None

      def __init__(
          self,
@@ -1123,12 +1130,12 @@ class TransformFlow(Generic[T]):
          return self._flow_fn(*args, **kwargs)

      @property
-     def _flow_info(self) -> TransformFlowInfo:
+     def _flow_info(self) -> TransformFlowInfo[T]:
          if self._lazy_flow_info is not None:
              return self._lazy_flow_info
          return execution_context.run(self._flow_info_async())

-     async def _flow_info_async(self) -> TransformFlowInfo:
+     async def _flow_info_async(self) -> TransformFlowInfo[T]:
          if self._lazy_flow_info is not None:
              return self._lazy_flow_info
          async with self._lazy_lock:
@@ -1136,7 +1143,7 @@ class TransformFlow(Generic[T]):
              self._lazy_flow_info = await self._build_flow_info_async()
          return self._lazy_flow_info

-     async def _build_flow_info_async(self) -> TransformFlowInfo:
+     async def _build_flow_info_async(self) -> TransformFlowInfo[T]:
          flow_builder_state = _FlowBuilderState(self._flow_name)
          kwargs: dict[str, DataSlice[T]] = {}
          for arg_info in self._args_info:
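
The `NamedTuple` → `@dataclass` switch for `TransformFlowInfo` pairs with making it `Generic[T]`, so the result decoder's return type is now carried through the annotations above. A one-line illustration (hypothetical variable):

    info: TransformFlowInfo[str]  # result_decoder is typed Callable[[Any], str]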
@@ -0,0 +1,45 @@
+ """Functions module for cocoindex.
+
+ This module provides various function specifications and executors for data processing,
+ including embedding functions, text processing, and multimodal operations.
+ """
+
+ # Import all engine builtin function specs
+ from ._engine_builtin_specs import (
+     ParseJson,
+     SplitRecursively,
+     SplitBySeparators,
+     EmbedText,
+     ExtractByLlm,
+ )
+
+ # Import SentenceTransformer embedding functionality
+ from .sbert import (
+     SentenceTransformerEmbed,
+     SentenceTransformerEmbedExecutor,
+ )
+
+ # Import ColPali multimodal embedding functionality
+ from .colpali import (
+     ColPaliEmbedImage,
+     ColPaliEmbedImageExecutor,
+     ColPaliEmbedQuery,
+     ColPaliEmbedQueryExecutor,
+ )
+
+ __all__ = [
+     # Engine builtin specs
+     "ParseJson",
+     "SplitRecursively",
+     "SplitBySeparators",
+     "EmbedText",
+     "ExtractByLlm",
+     # SentenceTransformer
+     "SentenceTransformerEmbed",
+     "SentenceTransformerEmbedExecutor",
+     # ColPali
+     "ColPaliEmbedImage",
+     "ColPaliEmbedImageExecutor",
+     "ColPaliEmbedQuery",
+     "ColPaliEmbedQueryExecutor",
+ ]
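
These re-exports keep the public import surface flat; assuming this file is the new package `__init__` (the diff view does not show its path), user code imports the specs as before:

    from cocoindex.functions import SplitRecursively, SentenceTransformerEmbed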
@@ -0,0 +1,62 @@
+ """All builtin function specs."""
+
+ import dataclasses
+ from typing import Literal
+
+ from .. import llm, op
+
+
+ class ParseJson(op.FunctionSpec):
+     """Parse a text into a JSON object."""
+
+
+ @dataclasses.dataclass
+ class CustomLanguageSpec:
+     """Custom language specification."""
+
+     language_name: str
+     separators_regex: list[str]
+     aliases: list[str] = dataclasses.field(default_factory=list)
+
+
+ class SplitRecursively(op.FunctionSpec):
+     """Split a document (in string) recursively."""
+
+     custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
+
+
+ class SplitBySeparators(op.FunctionSpec):
+     """
+     Split text by specified regex separators only.
+     Output schema matches SplitRecursively for drop-in compatibility:
+     KTable rows with fields: location (Range), text (Str), start, end.
+     Args:
+         separators_regex: list[str]  # e.g., [r"\\n\\n+"]
+         keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+         include_empty: bool = False
+         trim: bool = True
+     """
+
+     separators_regex: list[str] = dataclasses.field(default_factory=list)
+     keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+     include_empty: bool = False
+     trim: bool = True
+
+
+ class EmbedText(op.FunctionSpec):
+     """Embed a text into a vector space."""
+
+     api_type: llm.LlmApiType
+     model: str
+     address: str | None = None
+     output_dimension: int | None = None
+     task_type: str | None = None
+     api_config: llm.VertexAiConfig | None = None
+
+
+ class ExtractByLlm(op.FunctionSpec):
+     """Extract information from a text using a LLM."""
+
+     llm_spec: llm.LlmSpec
+     output_type: type
+     instruction: str | None = None
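
A hedged sketch of how one of these specs is applied in a flow; the `doc["text"]` slice and the surrounding flow definition are assumed context, while the spec fields come directly from the class above:

    chunks = doc["text"].transform(
        cocoindex.functions.SplitBySeparators(
            separators_regex=[r"\n\n+"],  # e.g. split on blank lines
            keep_separator="NONE",
            trim=True,
        )
    )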