cocoindex 0.1.74__cp313-cp313-manylinux_2_28_x86_64.whl → 0.1.76__cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +5 -3
- cocoindex/_engine.cpython-313-x86_64-linux-gnu.so +0 -0
- cocoindex/convert.py +56 -87
- cocoindex/flow.py +27 -11
- cocoindex/functions.py +197 -0
- cocoindex/op.py +3 -2
- cocoindex/tests/test_convert.py +111 -24
- cocoindex/tests/test_transform_flow.py +103 -0
- cocoindex/typing.py +4 -4
- {cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/METADATA +7 -2
- {cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/RECORD +14 -13
- {cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/WHEEL +1 -1
- {cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/entry_points.txt +0 -0
- {cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/licenses/LICENSE +0 -0
cocoindex/__init__.py
CHANGED
@@ -11,7 +11,8 @@ from .flow import FlowBuilder, DataScope, DataSlice, Flow, transform_flow
 from .flow import flow_def
 from .flow import EvaluateAndDumpOptions, GeneratedField
 from .flow import FlowLiveUpdater, FlowLiveUpdaterOptions, FlowUpdaterStatusUpdates
-from .flow import
+from .flow import open_flow
+from .flow import add_flow_def, remove_flow  # DEPRECATED
 from .flow import update_all_flows_async, setup_all_flows, drop_all_flows
 from .lib import init, start_server, stop
 from .llm import LlmSpec, LlmApiType
@@ -57,8 +58,9 @@ __all__ = [
     "FlowLiveUpdater",
     "FlowLiveUpdaterOptions",
     "FlowUpdaterStatusUpdates",
-    "
-    "
+    "open_flow",
+    "add_flow_def",  # DEPRECATED
+    "remove_flow",  # DEPRECATED
     "update_all_flows_async",
     "setup_all_flows",
     "drop_all_flows",
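The re-exported surface above replaces the old module-level helpers with `open_flow`, while keeping the deprecated names importable. A minimal sketch of the 0.1.76 import surface; the flow definition callable here is hypothetical:

```python
# Sketch only: `my_flow_def` is a made-up flow definition callable.
import cocoindex

def my_flow_def(builder: cocoindex.FlowBuilder, scope: cocoindex.DataScope) -> None:
    ...  # define sources / transformations here

flow = cocoindex.open_flow("demo_flow", my_flow_def)
# cocoindex.add_flow_def(...) and cocoindex.remove_flow(...) still import,
# but are marked DEPRECATED in this release.
```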
cocoindex/_engine.cpython-313-x86_64-linux-gnu.so
CHANGED
Binary file
cocoindex/convert.py
CHANGED
@@ -95,6 +95,7 @@ def make_engine_value_decoder(
     field_path: list[str],
     src_type: dict[str, Any],
     dst_type_info: AnalyzedTypeInfo,
+    for_key: bool = False,
 ) -> Callable[[Any], Any]:
     """
     Make a decoder from an engine value to a Python value.
@@ -123,6 +124,7 @@ def make_engine_value_decoder(
             field_path,
             src_type["fields"],
             dst_type_info,
+            for_key=for_key,
         )

     if src_type_kind in TABLE_TYPES:
@@ -131,10 +133,10 @@ def make_engine_value_decoder(

         if src_type_kind == "LTable":
             if isinstance(dst_type_variant, AnalyzedAnyType):
-
-
-
-
+                dst_elem_type = Any
+            elif isinstance(dst_type_variant, AnalyzedListType):
+                dst_elem_type = dst_type_variant.elem_type
+            else:
                 raise ValueError(
                     f"Type mismatch for `{''.join(field_path)}`: "
                     f"declared `{dst_type_info.core_type}`, a list type expected"
@@ -142,7 +144,7 @@ def make_engine_value_decoder(
             row_decoder = make_engine_struct_decoder(
                 field_path,
                 engine_fields_schema,
-                analyze_type_info(
+                analyze_type_info(dst_elem_type),
             )

             def decode(value: Any) -> Any | None:
@@ -152,10 +154,11 @@ def make_engine_value_decoder(

         elif src_type_kind == "KTable":
             if isinstance(dst_type_variant, AnalyzedAnyType):
-
-
-
-
+                key_type, value_type = Any, Any
+            elif isinstance(dst_type_variant, AnalyzedDictType):
+                key_type = dst_type_variant.key_type
+                value_type = dst_type_variant.value_type
+            else:
                 raise ValueError(
                     f"Type mismatch for `{''.join(field_path)}`: "
                     f"declared `{dst_type_info.core_type}`, a dict type expected"
@@ -166,13 +169,14 @@ def make_engine_value_decoder(
             key_decoder = make_engine_value_decoder(
                 field_path,
                 key_field_schema["type"],
-                analyze_type_info(
+                analyze_type_info(key_type),
+                for_key=True,
             )
             field_path.pop()
             value_decoder = make_engine_struct_decoder(
                 field_path,
                 engine_fields_schema[1:],
-                analyze_type_info(
+                analyze_type_info(value_type),
             )

             def decode(value: Any) -> Any | None:
@@ -237,7 +241,9 @@ def make_engine_value_decoder(
         vec_elem_decoder = make_engine_value_decoder(
             field_path + ["[*]"],
             src_type["element_type"],
-            analyze_type_info(
+            analyze_type_info(
+                dst_type_variant.elem_type if dst_type_variant else Any
+            ),
         )

         def decode_vector(value: Any) -> Any | None:
@@ -316,26 +322,26 @@ def make_engine_struct_decoder(
     field_path: list[str],
     src_fields: list[dict[str, Any]],
     dst_type_info: AnalyzedTypeInfo,
+    for_key: bool = False,
 ) -> Callable[[list[Any]], Any]:
     """Make a decoder from an engine field values to a Python value."""

     dst_type_variant = dst_type_info.variant

-    use_dict = False
     if isinstance(dst_type_variant, AnalyzedAnyType):
-
+        if for_key:
+            return _make_engine_struct_to_tuple_decoder(field_path, src_fields)
+        else:
+            return _make_engine_struct_to_dict_decoder(field_path, src_fields, Any)
     elif isinstance(dst_type_variant, AnalyzedDictType):
         analyzed_key_type = analyze_type_info(dst_type_variant.key_type)
-
-        use_dict = (
+        if (
             isinstance(analyzed_key_type.variant, AnalyzedAnyType)
-            or
-
-
+            or analyzed_key_type.core_type is str
+        ):
+            return _make_engine_struct_to_dict_decoder(
+                field_path, src_fields, dst_type_variant.value_type
             )
-        ) and isinstance(analyzed_value_type.variant, AnalyzedAnyType)
-        if use_dict:
-            return _make_engine_struct_to_dict_decoder(field_path, src_fields)

     if not isinstance(dst_type_variant, AnalyzedStructType):
         raise ValueError(
@@ -375,7 +381,7 @@ def make_engine_struct_decoder(
     with ChildFieldPath(field_path, f".{name}"):
         if src_idx is not None:
             field_decoder = make_engine_value_decoder(
-                field_path, src_fields[src_idx]["type"], type_info
+                field_path, src_fields[src_idx]["type"], type_info, for_key=for_key
             )
             return lambda values: field_decoder(values[src_idx])

@@ -409,17 +415,19 @@ def make_engine_struct_decoder(
 def _make_engine_struct_to_dict_decoder(
     field_path: list[str],
     src_fields: list[dict[str, Any]],
+    value_type_annotation: Any,
 ) -> Callable[[list[Any] | None], dict[str, Any] | None]:
     """Make a decoder from engine field values to a Python dict."""

     field_decoders = []
-
+    value_type_info = analyze_type_info(value_type_annotation)
+    for field_schema in src_fields:
         field_name = field_schema["name"]
         with ChildFieldPath(field_path, f".{field_name}"):
             field_decoder = make_engine_value_decoder(
                 field_path,
                 field_schema["type"],
-
+                value_type_info,
             )
         field_decoders.append((field_name, field_decoder))

@@ -438,76 +446,37 @@ def _make_engine_struct_to_dict_decoder(
     return decode_to_dict


-def
+def _make_engine_struct_to_tuple_decoder(
     field_path: list[str],
     src_fields: list[dict[str, Any]],
-) -> Callable[[list[Any] | None],
-    """Make a decoder from engine
+) -> Callable[[list[Any] | None], tuple[Any, ...] | None]:
+    """Make a decoder from engine field values to a Python tuple."""

-
-
-
-
-
-
-
-
-
-
-            raise ValueError(
-                f"LTable row at index {i} decoded to None, which is not allowed."
+    field_decoders = []
+    value_type_info = analyze_type_info(Any)
+    for field_schema in src_fields:
+        field_name = field_schema["name"]
+        with ChildFieldPath(field_path, f".{field_name}"):
+            field_decoders.append(
+                make_engine_value_decoder(
+                    field_path,
+                    field_schema["type"],
+                    value_type_info,
                 )
-
-        return result
-
-    return decode_to_list_dict
-
-
-def _make_engine_ktable_to_dict_dict_decoder(
-    field_path: list[str],
-    src_fields: list[dict[str, Any]],
-) -> Callable[[list[Any] | None], dict[Any, dict[str, Any]] | None]:
-    """Make a decoder from engine KTable values to a dict of dicts."""
-
-    if not src_fields:
-        raise ValueError("KTable must have at least one field for the key")
-
-    # First field is the key, remaining fields are the value
-    key_field_schema = src_fields[0]
-    value_fields_schema = src_fields[1:]
-
-    # Create decoders
-    with ChildFieldPath(field_path, f".{key_field_schema.get('name', KEY_FIELD_NAME)}"):
-        key_decoder = make_engine_value_decoder(
-            field_path, key_field_schema["type"], analyze_type_info(Any)
-        )
-
-    value_decoder = _make_engine_struct_to_dict_decoder(field_path, value_fields_schema)
+            )

-    def
-        values: list[Any] | None,
-    ) -> dict[Any, dict[str, Any]] | None:
+    def decode_to_tuple(values: list[Any] | None) -> tuple[Any, ...] | None:
         if values is None:
             return None
-
-
-
-
-
-
-
-        else:
-            tmp = value_decoder(row_values[1:])
-            if tmp is None:
-                value = {}
-            else:
-                value = tmp
-            if isinstance(key, dict):
-                key = tuple(key.values())
-            result[key] = value
-        return result
+        if len(field_decoders) != len(values):
+            raise ValueError(
+                f"Field count mismatch: expected {len(field_decoders)}, got {len(values)}"
+            )
+        return tuple(
+            field_decoder(value) for value, field_decoder in zip(values, field_decoders)
+        )

-    return
+    return decode_to_tuple


 def dump_engine_object(v: Any) -> Any:
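The net effect of these decoder changes: when the destination annotation is `Any`, an engine struct now decodes to a `dict`, except when it is being decoded as a KTable key (`for_key=True`), where it becomes a hashable `tuple`. A minimal standalone sketch of that tuple-key idea (not the cocoindex implementation; field names are made up):

```python
from typing import Any, Callable

def make_struct_to_tuple_decoder(
    field_names: list[str],
) -> Callable[[list[Any] | None], tuple[Any, ...] | None]:
    """Decode an engine struct row into a hashable tuple, suitable as a dict key."""
    def decode(values: list[Any] | None) -> tuple[Any, ...] | None:
        if values is None:
            return None
        if len(values) != len(field_names):
            raise ValueError(
                f"Field count mismatch: expected {len(field_names)}, got {len(values)}"
            )
        return tuple(values)
    return decode

# A two-field key struct such as (shop_name, version) becomes a tuple key:
decode_key = make_struct_to_tuple_decoder(["shop_name", "version"])
assert decode_key(["shop1", 1]) == ("shop1", 1)  # hashable, usable as a KTable key
```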
cocoindex/flow.py
CHANGED
@@ -798,7 +798,7 @@ class Flow:
         The current instance is still valid after it's called.
         For example, you can still call `setup()` after it, to setup the persistent backends again.

-        Call `
+        Call `close()` if you want to remove the flow from the current process.
         """
         execution_context.run(self.drop_async(report_to_stdout=report_to_stdout))

@@ -810,6 +810,18 @@ class Flow:
             report_to_stdout=report_to_stdout
         )

+    def close(self) -> None:
+        """
+        Close the flow. It will remove the flow from the current process to free up resources.
+        After it's called, methods of the flow should no longer be called.
+
+        This will NOT touch the persistent backends of the flow.
+        """
+        _engine.remove_flow_context(self.full_name)
+        self._lazy_engine_flow = None
+        with _flows_lock:
+            del _flows[self.name]
+

 def _create_lazy_flow(
     name: str | None, fl_def: Callable[[FlowBuilder, DataScope], None]
@@ -845,7 +857,10 @@ def get_flow_full_name(name: str) -> str:
     return f"{setting.get_app_namespace(trailing_delimiter='.')}{name}"


-def
+def open_flow(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
+    """
+    Open a flow, with the given name and definition.
+    """
     with _flows_lock:
         if name in _flows:
             raise KeyError(f"Flow with name {name} already exists")
@@ -853,17 +868,18 @@ def add_flow_def(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) ->
     return fl


-def
+def add_flow_def(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
     """
-
-
+    DEPRECATED: Use `open_flow()` instead.
+    """
+    return open_flow(name, fl_def)
+

-
+def remove_flow(fl: Flow) -> None:
     """
-
-
-
-    del _flows[fl.name]
+    DEPRECATED: Use `Flow.close()` instead.
+    """
+    fl.close()


 def flow_def(
@@ -872,7 +888,7 @@ def flow_def(
     """
     A decorator to wrap the flow definition.
     """
-    return lambda fl_def:
+    return lambda fl_def: open_flow(name or fl_def.__name__, fl_def)


 def flow_names() -> list[str]:
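Taken together, the additions above give a flow an explicit lifecycle: `open_flow()` registers it in the current process and `Flow.close()` removes it again without touching persistent backends. A hedged usage sketch; the flow body is hypothetical and only `setup()` is taken from the docstrings shown in this diff:

```python
import cocoindex

def my_flow_def(builder: cocoindex.FlowBuilder, scope: cocoindex.DataScope) -> None:
    ...  # hypothetical flow definition

flow = cocoindex.open_flow("demo_flow", my_flow_def)
try:
    flow.setup()   # referenced in the drop() docstring above; prepares persistent backends
    ...            # run / update the flow as usual
finally:
    flow.close()   # new: frees in-process resources, leaves persistent backends intact
```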
cocoindex/functions.py
CHANGED
@@ -1,6 +1,7 @@
 """All builtin functions."""

 import dataclasses
+import functools
 from typing import Annotated, Any, Literal

 import numpy as np
@@ -23,6 +24,16 @@ class CustomLanguageSpec:
     aliases: list[str] = dataclasses.field(default_factory=list)


+@dataclasses.dataclass
+class ColPaliModelInfo:
+    """Data structure for ColPali model and processor."""
+
+    model: Any
+    processor: Any
+    dimension: int
+    device: Any
+
+
 class SplitRecursively(op.FunctionSpec):
     """Split a document (in string) recursively."""

@@ -99,3 +110,189 @@ class SentenceTransformerEmbedExecutor:
         assert self._model is not None
         result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
         return result
+
+
+@functools.cache
+def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
+    """Get or load ColPali model and processor, with caching."""
+    try:
+        from colpali_engine.models import ColPali, ColPaliProcessor  # type: ignore[import-untyped]
+        from colpali_engine.utils.torch_utils import get_torch_device  # type: ignore[import-untyped]
+        import torch
+    except ImportError as e:
+        raise ImportError(
+            "ColPali is not available. Make sure cocoindex is installed with ColPali support."
+        ) from e
+
+    device = get_torch_device("auto")
+    model = ColPali.from_pretrained(
+        model_name, device_map=device, torch_dtype=torch.bfloat16
+    ).eval()
+    processor = ColPaliProcessor.from_pretrained(model_name)
+
+    # Get dimension from the actual model
+    dimension = _detect_colpali_dimension(model, processor, device)
+
+    return ColPaliModelInfo(
+        model=model,
+        processor=processor,
+        dimension=dimension,
+        device=device,
+    )
+
+
+def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
+    """Detect ColPali embedding dimension from the actual model config."""
+    # Try to access embedding dimension
+    if hasattr(model.config, "embedding_dim"):
+        dim = model.config.embedding_dim
+    else:
+        # Fallback: infer from output shape with dummy data
+        from PIL import Image
+        import numpy as np
+        import torch
+
+        dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
+        # Use the processor to process the dummy image
+        processed = processor.process_images([dummy_img]).to(device)
+        with torch.no_grad():
+            output = model(**processed)
+        dim = int(output.shape[-1])
+        if isinstance(dim, int):
+            return dim
+        else:
+            raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
+    return dim
+
+
+class ColPaliEmbedImage(op.FunctionSpec):
+    """
+    `ColPaliEmbedImage` embeds images using the ColPali multimodal model.
+
+    ColPali (Contextual Late-interaction over Patches) uses late interaction
+    between image patch embeddings and text token embeddings for retrieval.
+
+    Args:
+        model: The ColPali model name to use (e.g., "vidore/colpali-v1.2")
+
+    Note:
+        This function requires the optional colpali-engine dependency.
+        Install it with: pip install 'cocoindex[embeddings]'
+    """
+
+    model: str
+
+
+@op.executor_class(
+    gpu=True,
+    cache=True,
+    behavior_version=1,
+)
+class ColPaliEmbedImageExecutor:
+    """Executor for ColPaliEmbedImage."""
+
+    spec: ColPaliEmbedImage
+    _model_info: ColPaliModelInfo
+
+    def analyze(self, _img_bytes: Any) -> type:
+        # Get shared model and dimension
+        self._model_info = _get_colpali_model_and_processor(self.spec.model)
+
+        # Return multi-vector type: Variable patches x Fixed hidden dimension
+        dimension = self._model_info.dimension
+        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
+
+    def __call__(self, img_bytes: bytes) -> Any:
+        try:
+            from PIL import Image
+            import torch
+            import io
+        except ImportError as e:
+            raise ImportError(
+                "Required dependencies (PIL, torch) are missing for ColPali image embedding."
+            ) from e
+
+        model = self._model_info.model
+        processor = self._model_info.processor
+        device = self._model_info.device
+
+        pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+        inputs = processor.process_images([pil_image]).to(device)
+        with torch.no_grad():
+            embeddings = model(**inputs)
+
+        # Return multi-vector format: [patches, hidden_dim]
+        if len(embeddings.shape) != 3:
+            raise ValueError(
+                f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
+            )
+
+        # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
+        patch_embeddings = embeddings[0]  # Remove batch dimension
+
+        return patch_embeddings.cpu().to(torch.float32).numpy()
+
+
+class ColPaliEmbedQuery(op.FunctionSpec):
+    """
+    `ColPaliEmbedQuery` embeds text queries using the ColPali multimodal model.
+
+    This produces query embeddings compatible with ColPali image embeddings
+    for late interaction scoring (MaxSim).
+
+    Args:
+        model: The ColPali model name to use (e.g., "vidore/colpali-v1.2")
+
+    Note:
+        This function requires the optional colpali-engine dependency.
+        Install it with: pip install 'cocoindex[embeddings]'
+    """
+
+    model: str
+
+
+@op.executor_class(
+    gpu=True,
+    cache=True,
+    behavior_version=1,
+)
+class ColPaliEmbedQueryExecutor:
+    """Executor for ColPaliEmbedQuery."""
+
+    spec: ColPaliEmbedQuery
+    _model_info: ColPaliModelInfo
+
+    def analyze(self, _query: Any) -> type:
+        # Get shared model and dimension
+        self._model_info = _get_colpali_model_and_processor(self.spec.model)
+
+        # Return multi-vector type: Variable tokens x Fixed hidden dimension
+        dimension = self._model_info.dimension
+        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
+
+    def __call__(self, query: str) -> Any:
+        try:
+            import torch
+        except ImportError as e:
+            raise ImportError(
+                "Required dependencies (torch) are missing for ColPali query embedding."
+            ) from e
+
+        model = self._model_info.model
+        processor = self._model_info.processor
+        device = self._model_info.device
+
+        inputs = processor.process_queries([query]).to(device)
+        with torch.no_grad():
+            embeddings = model(**inputs)
+
+        # Return multi-vector format: [tokens, hidden_dim]
+        if len(embeddings.shape) != 3:
+            raise ValueError(
+                f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
+            )
+
+        # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
+        token_embeddings = embeddings[0]  # Remove batch dimension
+
+        return token_embeddings.cpu().to(torch.float32).numpy()
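The two new specs follow the same shape as the existing `SentenceTransformerEmbed` spec: construct the spec with a `model` name and apply it with `.transform(...)`. A hypothetical wiring sketch (flow names are made up, and it assumes the usual transform-flow calling convention shown in the new test file later in this diff):

```python
from typing import Any

import cocoindex

@cocoindex.transform_flow()
def embed_query(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[Any]:
    # Produces a [tokens, hidden_dim] multi-vector for late-interaction (MaxSim) scoring.
    return text.transform(
        cocoindex.functions.ColPaliEmbedQuery(model="vidore/colpali-v1.2")
    )

@cocoindex.transform_flow()
def embed_image(img_bytes: cocoindex.DataSlice[bytes]) -> cocoindex.DataSlice[Any]:
    # Produces a [patches, hidden_dim] multi-vector per image.
    return img_bytes.transform(
        cocoindex.functions.ColPaliEmbedImage(model="vidore/colpali-v1.2")
    )
```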
cocoindex/op.py
CHANGED
@@ -505,7 +505,7 @@ class _TargetConnector:
                 self._mutatation_type.value_type,
             )
             if self._mutatation_type is not None
-            else (
+            else (Any, Any)
         )

         key_type_info = analyze_type_info(key_annotation)
@@ -519,10 +519,11 @@ class _TargetConnector:
                 ["(key)"],
                 key_fields_schema[0]["type"],
                 key_type_info,
+                for_key=True,
             )
         else:
             key_decoder = make_engine_struct_decoder(
-                ["(key)"], key_fields_schema, key_type_info
+                ["(key)"], key_fields_schema, key_type_info, for_key=True
             )

         value_decoder = make_engine_struct_decoder(
cocoindex/tests/test_convert.py
CHANGED
@@ -1,4 +1,5 @@
 import datetime
+import inspect
 import uuid
 from dataclasses import dataclass, make_dataclass, field
 from typing import Annotated, Any, Callable, Literal, NamedTuple
@@ -236,19 +237,24 @@ def test_encode_engine_value_none() -> None:


 def test_roundtrip_basic_types() -> None:
-    validate_full_roundtrip(
+    validate_full_roundtrip(
+        b"hello world",
+        bytes,
+        (b"hello world", inspect.Parameter.empty),
+        (b"hello world", Any),
+    )
     validate_full_roundtrip(b"\x00\x01\x02\xff\xfe", bytes)
-    validate_full_roundtrip("hello", str, ("hello",
-    validate_full_roundtrip(True, bool, (True,
-    validate_full_roundtrip(False, bool, (False,
+    validate_full_roundtrip("hello", str, ("hello", Any))
+    validate_full_roundtrip(True, bool, (True, Any))
+    validate_full_roundtrip(False, bool, (False, Any))
     validate_full_roundtrip(
-        42, cocoindex.Int64, (42, int), (np.int64(42), np.int64), (42,
+        42, cocoindex.Int64, (42, int), (np.int64(42), np.int64), (42, Any)
     )
     validate_full_roundtrip(42, int, (42, cocoindex.Int64))
     validate_full_roundtrip(np.int64(42), np.int64, (42, cocoindex.Int64))

     validate_full_roundtrip(
-        3.25, Float64, (3.25, float), (np.float64(3.25), np.float64), (3.25,
+        3.25, Float64, (3.25, float), (np.float64(3.25), np.float64), (3.25, Any)
     )
     validate_full_roundtrip(3.25, float, (3.25, Float64))
     validate_full_roundtrip(np.float64(3.25), np.float64, (3.25, Float64))
@@ -260,35 +266,35 @@ def test_roundtrip_basic_types() -> None:
         (np.float32(3.25), np.float32),
         (np.float64(3.25), np.float64),
         (3.25, Float64),
-        (3.25,
+        (3.25, Any),
     )
     validate_full_roundtrip(np.float32(3.25), np.float32, (3.25, Float32))


 def test_roundtrip_uuid() -> None:
     uuid_value = uuid.uuid4()
-    validate_full_roundtrip(uuid_value, uuid.UUID, (uuid_value,
+    validate_full_roundtrip(uuid_value, uuid.UUID, (uuid_value, Any))


 def test_roundtrip_range() -> None:
     r1 = (0, 100)
-    validate_full_roundtrip(r1, cocoindex.Range, (r1,
+    validate_full_roundtrip(r1, cocoindex.Range, (r1, Any))
     r2 = (50, 50)
-    validate_full_roundtrip(r2, cocoindex.Range, (r2,
+    validate_full_roundtrip(r2, cocoindex.Range, (r2, Any))
     r3 = (0, 1_000_000_000)
-    validate_full_roundtrip(r3, cocoindex.Range, (r3,
+    validate_full_roundtrip(r3, cocoindex.Range, (r3, Any))


 def test_roundtrip_time() -> None:
     t1 = datetime.time(10, 30, 50, 123456)
-    validate_full_roundtrip(t1, datetime.time, (t1,
+    validate_full_roundtrip(t1, datetime.time, (t1, Any))
     t2 = datetime.time(23, 59, 59)
-    validate_full_roundtrip(t2, datetime.time, (t2,
+    validate_full_roundtrip(t2, datetime.time, (t2, Any))
     t3 = datetime.time(0, 0, 0)
-    validate_full_roundtrip(t3, datetime.time, (t3,
+    validate_full_roundtrip(t3, datetime.time, (t3, Any))

     validate_full_roundtrip(
-        datetime.date(2025, 1, 1), datetime.date, (datetime.date(2025, 1, 1),
+        datetime.date(2025, 1, 1), datetime.date, (datetime.date(2025, 1, 1), Any)
     )

     validate_full_roundtrip(
@@ -333,11 +339,11 @@ def test_roundtrip_timedelta() -> None:
     td1 = datetime.timedelta(
         days=5, seconds=10, microseconds=123, milliseconds=456, minutes=30, hours=2
     )
-    validate_full_roundtrip(td1, datetime.timedelta, (td1,
+    validate_full_roundtrip(td1, datetime.timedelta, (td1, Any))
     td2 = datetime.timedelta(days=-5, hours=-2)
-    validate_full_roundtrip(td2, datetime.timedelta, (td2,
+    validate_full_roundtrip(td2, datetime.timedelta, (td2, Any))
     td3 = datetime.timedelta(0)
-    validate_full_roundtrip(td3, datetime.timedelta, (td3,
+    validate_full_roundtrip(td3, datetime.timedelta, (td3, Any))


 def test_roundtrip_json() -> None:
@@ -1160,6 +1166,37 @@ def test_full_roundtrip_scalar_with_python_types() -> None:
     validate_full_roundtrip(instance, MixedStruct)


+def test_roundtrip_simple_struct_to_dict_binding() -> None:
+    """Test struct -> dict binding with Any annotation."""
+
+    @dataclass
+    class SimpleStruct:
+        first_name: str
+        last_name: str
+
+    instance = SimpleStruct("John", "Doe")
+    expected_dict = {"first_name": "John", "last_name": "Doe"}
+
+    # Test Any annotation
+    validate_full_roundtrip(
+        instance,
+        SimpleStruct,
+        (expected_dict, Any),
+        (expected_dict, dict),
+        (expected_dict, dict[Any, Any]),
+        (expected_dict, dict[str, Any]),
+        # For simple struct, all fields have the same type, so we can directly use the type as the dict value type.
+        (expected_dict, dict[Any, str]),
+        (expected_dict, dict[str, str]),
+    )
+
+    with pytest.raises(ValueError):
+        validate_full_roundtrip(instance, SimpleStruct, (expected_dict, dict[str, int]))
+
+    with pytest.raises(ValueError):
+        validate_full_roundtrip(instance, SimpleStruct, (expected_dict, dict[int, Any]))
+
+
 def test_roundtrip_struct_to_dict_binding() -> None:
     """Test struct -> dict binding with Any annotation."""

@@ -1173,7 +1210,20 @@ def test_roundtrip_struct_to_dict_binding() -> None:
     expected_dict = {"name": "test", "value": 42, "price": 3.14}

     # Test Any annotation
-    validate_full_roundtrip(
+    validate_full_roundtrip(
+        instance,
+        SimpleStruct,
+        (expected_dict, Any),
+        (expected_dict, dict),
+        (expected_dict, dict[Any, Any]),
+        (expected_dict, dict[str, Any]),
+    )
+
+    with pytest.raises(ValueError):
+        validate_full_roundtrip(instance, SimpleStruct, (expected_dict, dict[str, str]))
+
+    with pytest.raises(ValueError):
+        validate_full_roundtrip(instance, SimpleStruct, (expected_dict, dict[int, Any]))


 def test_roundtrip_struct_to_dict_explicit() -> None:
@@ -1207,8 +1257,8 @@ def test_roundtrip_struct_to_dict_with_none_annotation() -> None:
     instance = Config("localhost", 8080, True)
     expected_dict = {"host": "localhost", "port": 8080, "debug": True}

-    # Test
-    validate_full_roundtrip(instance, Config, (expected_dict,
+    # Test empty annotation (should be treated as Any)
+    validate_full_roundtrip(instance, Config, (expected_dict, inspect.Parameter.empty))


 def test_roundtrip_struct_to_dict_nested() -> None:
@@ -1289,7 +1339,13 @@ def test_roundtrip_ltable_to_list_dict_binding() -> None:
     ]

     # Test Any annotation
-    validate_full_roundtrip(
+    validate_full_roundtrip(
+        users,
+        list[User],
+        (expected_list_dict, Any),
+        (expected_list_dict, list[Any]),
+        (expected_list_dict, list[dict[str, Any]]),
+    )


 def test_roundtrip_ktable_to_dict_dict_binding() -> None:
@@ -1313,7 +1369,17 @@ def test_roundtrip_ktable_to_dict_dict_binding() -> None:
     }

     # Test Any annotation
-    validate_full_roundtrip(
+    validate_full_roundtrip(
+        products,
+        dict[str, Product],
+        (expected_dict_dict, Any),
+        (expected_dict_dict, dict),
+        (expected_dict_dict, dict[Any, Any]),
+        (expected_dict_dict, dict[str, Any]),
+        (expected_dict_dict, dict[Any, dict[Any, Any]]),
+        (expected_dict_dict, dict[str, dict[Any, Any]]),
+        (expected_dict_dict, dict[str, dict[str, Any]]),
+    )


 def test_roundtrip_ktable_with_complex_key() -> None:
@@ -1339,7 +1405,28 @@ def test_roundtrip_ktable_with_complex_key() -> None:
     }

     # Test Any annotation
-    validate_full_roundtrip(
+    validate_full_roundtrip(
+        orders,
+        dict[OrderKey, Order],
+        (expected_dict_dict, Any),
+        (expected_dict_dict, dict),
+        (expected_dict_dict, dict[Any, Any]),
+        (expected_dict_dict, dict[Any, dict[str, Any]]),
+        (
+            {
+                ("shop1", 1): Order("Alice", 100.0),
+                ("shop2", 2): Order("Bob", 200.0),
+            },
+            dict[Any, Order],
+        ),
+        (
+            {
+                OrderKey("shop1", 1): {"customer": "Alice", "total": 100.0},
+                OrderKey("shop2", 2): {"customer": "Bob", "total": 200.0},
+            },
+            dict[OrderKey, Any],
+        ),
+    )


 def test_roundtrip_ltable_with_nested_structs() -> None:
cocoindex/tests/test_transform_flow.py
@@ -0,0 +1,103 @@
+import typing
+from dataclasses import dataclass
+from typing import Any
+
+import pytest
+
+import cocoindex
+
+
+@dataclass
+class Child:
+    value: int
+
+
+@dataclass
+class Parent:
+    children: list[Child]
+
+
+# Fixture to initialize CocoIndex library
+@pytest.fixture(scope="session", autouse=True)
+def init_cocoindex() -> typing.Generator[None, None, None]:
+    cocoindex.init()
+    yield
+
+
+@cocoindex.op.function()
+def add_suffix(text: str) -> str:
+    """Append ' world' to the input text."""
+    return f"{text} world"
+
+
+@cocoindex.transform_flow()
+def simple_transform(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[str]:
+    """Transform flow that applies add_suffix to input text."""
+    return text.transform(add_suffix)
+
+
+@cocoindex.op.function()
+def extract_value(value: int) -> int:
+    """Extracts the value."""
+    return value
+
+
+@cocoindex.transform_flow()
+def for_each_transform(
+    data: cocoindex.DataSlice[Parent],
+) -> cocoindex.DataSlice[Any]:
+    """Transform flow that processes child rows to extract values."""
+    with data["children"].row() as child:
+        child["new_field"] = child["value"].transform(extract_value)
+    return data
+
+
+def test_simple_transform_flow() -> None:
+    """Test the simple transform flow."""
+    input_text = "hello"
+    result = simple_transform.eval(input_text)
+    assert result == "hello world", f"Expected 'hello world', got {result}"
+
+    result = simple_transform.eval("")
+    assert result == " world", f"Expected ' world', got {result}"
+
+
+@pytest.mark.asyncio
+async def test_simple_transform_flow_async() -> None:
+    """Test the simple transform flow asynchronously."""
+    input_text = "async"
+    result = await simple_transform.eval_async(input_text)
+    assert result == "async world", f"Expected 'async world', got {result}"
+
+
+def test_for_each_transform_flow() -> None:
+    """Test the complex transform flow with child rows."""
+    input_data = Parent(children=[Child(1), Child(2), Child(3)])
+    result = for_each_transform.eval(input_data)
+    expected = {
+        "children": [
+            {"value": 1, "new_field": 1},
+            {"value": 2, "new_field": 2},
+            {"value": 3, "new_field": 3},
+        ]
+    }
+    assert result == expected, f"Expected {expected}, got {result}"
+
+    input_data = Parent(children=[])
+    result = for_each_transform.eval(input_data)
+    assert result == {"children": []}, f"Expected {{'children': []}}, got {result}"
+
+
+@pytest.mark.asyncio
+async def test_for_each_transform_flow_async() -> None:
+    """Test the complex transform flow asynchronously."""
+    input_data = Parent(children=[Child(4), Child(5)])
+    result = await for_each_transform.eval_async(input_data)
+    expected = {
+        "children": [
+            {"value": 4, "new_field": 4},
+            {"value": 5, "new_field": 5},
+        ]
+    }
+
+    assert result == expected, f"Expected {expected}, got {result}"
cocoindex/typing.py
CHANGED
@@ -262,7 +262,7 @@ def analyze_type_info(t: Any) -> AnalyzedTypeInfo:

     if kind is not None:
         variant = AnalyzedBasicType(kind=kind)
-    elif base_type is
+    elif base_type is Any or base_type is inspect.Parameter.empty:
         variant = AnalyzedAnyType()
     elif is_struct_type(base_type):
         variant = AnalyzedStructType(struct_type=t)
@@ -270,15 +270,15 @@ def analyze_type_info(t: Any) -> AnalyzedTypeInfo:
         kind = DtypeRegistry.validate_dtype_and_get_kind(t)
         variant = AnalyzedBasicType(kind=kind)
     elif base_type is collections.abc.Sequence or base_type is list:
-        elem_type = type_args[0] if len(type_args) > 0 else
+        elem_type = type_args[0] if len(type_args) > 0 else Any
         variant = AnalyzedListType(elem_type=elem_type, vector_info=vector_info)
     elif base_type is np.ndarray:
         np_number_type = t
         elem_type = extract_ndarray_elem_dtype(np_number_type)
         variant = AnalyzedListType(elem_type=elem_type, vector_info=vector_info)
     elif base_type is collections.abc.Mapping or base_type is dict or t is dict:
-        key_type = type_args[0] if len(type_args) > 0 else
-        elem_type = type_args[1] if len(type_args) > 1 else
+        key_type = type_args[0] if len(type_args) > 0 else Any
+        elem_type = type_args[1] if len(type_args) > 1 else Any
         variant = AnalyzedDictType(key_type=key_type, value_type=elem_type)
     elif base_type in (types.UnionType, typing.Union):
         non_none_types = [arg for arg in type_args if arg not in (None, types.NoneType)]
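The fallback to `Any` matters because bare `dict` and `list` annotations carry no type arguments at runtime. A small illustration of the standard `typing` behavior the code above is handling (illustrative only, not cocoindex code):

```python
import typing

# Parameterized annotations expose their arguments ...
assert typing.get_args(dict[str, int]) == (str, int)
assert typing.get_args(list[float]) == (float,)

# ... but bare `dict` / `list` do not, so analyze_type_info now falls back to
# key=Any / value=Any for dicts and elem=Any for lists instead of failing.
assert typing.get_args(dict) == ()
assert typing.get_args(list) == ()
```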
{cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/METADATA
CHANGED
@@ -1,24 +1,28 @@
 Metadata-Version: 2.4
 Name: cocoindex
-Version: 0.1.74
+Version: 0.1.76
 Requires-Dist: click>=8.1.8
 Requires-Dist: rich>=14.0.0
 Requires-Dist: python-dotenv>=1.1.0
 Requires-Dist: watchfiles>=1.1.0
 Requires-Dist: numpy>=1.23.2
 Requires-Dist: pytest ; extra == 'dev'
+Requires-Dist: pytest-asyncio ; extra == 'dev'
 Requires-Dist: ruff ; extra == 'dev'
 Requires-Dist: mypy ; extra == 'dev'
 Requires-Dist: pre-commit ; extra == 'dev'
 Requires-Dist: sentence-transformers>=3.3.1 ; extra == 'embeddings'
+Requires-Dist: colpali-engine ; extra == 'colpali'
 Requires-Dist: sentence-transformers>=3.3.1 ; extra == 'all'
+Requires-Dist: colpali-engine ; extra == 'all'
 Provides-Extra: dev
 Provides-Extra: embeddings
+Provides-Extra: colpali
 Provides-Extra: all
 License-File: LICENSE
 Summary: With CocoIndex, users declare the transformation, CocoIndex creates & maintains an index, and keeps the derived index up to date based on source update, with minimal computation and changes.
 Author-email: CocoIndex <cocoindex.io@gmail.com>
-License: Apache-2.0
+License-Expression: Apache-2.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
 Project-URL: Homepage, https://cocoindex.io/
@@ -210,6 +214,7 @@ It defines an index flow like this:
 | [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend|
 | [Face Recognition](examples/face_recognition) | Recognize faces in images and build embedding index |
 | [Paper Metadata](examples/paper_metadata) | Index papers in PDF files, and build metadata tables for each paper |
+| [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |

 More coming and stay tuned 👀!
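Per the metadata above, `colpali-engine` ships under the new `colpali` extra (and under `all`), alongside `pytest-asyncio` in `dev`. A small hedged sketch for probing the optional dependency at runtime before using the ColPali functions; it assumes only what the metadata and functions.py sections show, namely that the import name is `colpali_engine`:

```python
# Probe the optional ColPali dependency before constructing ColPali specs.
import importlib.util

if importlib.util.find_spec("colpali_engine") is None:
    raise SystemExit(
        "colpali-engine is not installed; try: pip install 'cocoindex[colpali]'"
    )
```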
{cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/RECORD
CHANGED
@@ -1,18 +1,18 @@
-cocoindex-0.1.
-cocoindex-0.1.
-cocoindex-0.1.
-cocoindex-0.1.
-cocoindex/__init__.py,sha256=
-cocoindex/_engine.cpython-313-x86_64-linux-gnu.so,sha256=
+cocoindex-0.1.76.dist-info/METADATA,sha256=w_VMnPWkx5iMgpwgAWTB3KRxSfcGaU1sVES6-jXiAjQ,11655
+cocoindex-0.1.76.dist-info/WHEEL,sha256=9Ee4MwqZpMDLH1_kZE8rvruLKRVRs9cmbXRSBB0h-_M,108
+cocoindex-0.1.76.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
+cocoindex-0.1.76.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+cocoindex/__init__.py,sha256=sLpSVO5Cotgn_82lawxvXnaqfa-qj33rytWBAe2MTtU,2201
+cocoindex/_engine.cpython-313-x86_64-linux-gnu.so,sha256=sIEYde0TfFKjUS1vPsht_7DPZjbvlApMxb_U2CbiIJw,71114160
 cocoindex/auth_registry.py,sha256=PE1-kVkcyC1G2C_V7b1kvYzeq73OFQehWKQP7ln7fJ8,1478
 cocoindex/cli.py,sha256=-gp639JSyQN6YjnhGqCakIzYoSSqXxQMbxbkcYGP0QY,22359
-cocoindex/convert.py,sha256=
-cocoindex/flow.py,sha256=
-cocoindex/functions.py,sha256=
+cocoindex/convert.py,sha256=HodeDl1HVX8nnBH02lQKarw5i3xmkjB0nGj-DXt7Ifc,18284
+cocoindex/flow.py,sha256=egKbBG2X9DjAqmcATcndyRhe9zMZHRd-YxKCpt9BsUg,36551
+cocoindex/functions.py,sha256=34sZWoS0zGnaKyooIODQgc6QEPZKiJoWhfb8jKIWwps,9528
 cocoindex/index.py,sha256=j93B9jEvvLXHtpzKWL88SY6wCGEoPgpsQhEGHlyYGFg,540
 cocoindex/lib.py,sha256=f--9dAYd84CZosbDZqNW0oGbBLsY3dXiUTR1VrfQ_QY,817
 cocoindex/llm.py,sha256=WxmWUbNcf9HOCM5xkbDeFs9lF67M3mr810B7deDDc-8,673
-cocoindex/op.py,sha256=
+cocoindex/op.py,sha256=oiG1rjxz6ad1jGS7DMya4NStrA_6LV3RbcVSR75XUl0,21516
 cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cocoindex/runtime.py,sha256=povilB3HH3y1JF-yxKwU-pD8n2WnAqyQxIgvXXHNc60,1080
 cocoindex/setting.py,sha256=TwhQ6pEeZmvc8ZXlnT9d8Wn8Vz_u7Z5LJUkGsKmKSno,4859
@@ -20,11 +20,12 @@ cocoindex/setup.py,sha256=7uIHKN4FOCuoidPXcKyGTrkqpkl9luL49-6UcnMxYzw,3068
 cocoindex/sources.py,sha256=69COA4qbZDipzGYfXv-WJSmicFkA509xIShRGDh6A0A,2083
 cocoindex/targets.py,sha256=Nfh_tpFd1goTnS_cxBjIs4j9zl3Z4Z1JomAQ1dl3Sic,2796
 cocoindex/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cocoindex/tests/test_convert.py,sha256=
+cocoindex/tests/test_convert.py,sha256=l7LqD7duV9-xkYTaKOsEPdqw7v14dUzE40f4VVLlBCQ,49423
 cocoindex/tests/test_optional_database.py,sha256=snAmkNa6wtOSaxoZE1HgjvL5v_ylitt3Jt_9df4Cgdc,8506
+cocoindex/tests/test_transform_flow.py,sha256=VvT5b895MH5kwT-h4OpdDTl545SU4nxeIm7E_QANmAk,2894
 cocoindex/tests/test_typing.py,sha256=9OF3lO2uSpZBefkEJx7WRbnkXjwQtvlQIeeARYQID68,12391
 cocoindex/tests/test_validation.py,sha256=X6AQzVs-hVKIXcrHMEMQnhfUE8at7iXQnPq8nHNhZ2Q,4543
-cocoindex/typing.py,sha256=
+cocoindex/typing.py,sha256=qQ0ANF3iuQDeSqipHgL2SDiiXL2reTMUN0aj4ve_T0w,13359
 cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
 cocoindex/validation.py,sha256=PZnJoby4sLbsmPv9fOjOQXuefjfZ7gmtsiTGU8SH-tc,3090
-cocoindex-0.1.
+cocoindex-0.1.76.dist-info/RECORD,,
{cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/entry_points.txt
File without changes
{cocoindex-0.1.74.dist-info → cocoindex-0.1.76.dist-info}/licenses/LICENSE
File without changes