cocoindex 0.3.4__cp311-abi3-manylinux_2_28_x86_64.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- cocoindex/__init__.py +114 -0
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +44 -0
- cocoindex/cli.py +830 -0
- cocoindex/engine_object.py +214 -0
- cocoindex/engine_value.py +550 -0
- cocoindex/flow.py +1281 -0
- cocoindex/functions/__init__.py +40 -0
- cocoindex/functions/_engine_builtin_specs.py +66 -0
- cocoindex/functions/colpali.py +247 -0
- cocoindex/functions/sbert.py +77 -0
- cocoindex/index.py +50 -0
- cocoindex/lib.py +75 -0
- cocoindex/llm.py +47 -0
- cocoindex/op.py +1047 -0
- cocoindex/py.typed +0 -0
- cocoindex/query_handler.py +57 -0
- cocoindex/runtime.py +78 -0
- cocoindex/setting.py +171 -0
- cocoindex/setup.py +92 -0
- cocoindex/sources/__init__.py +5 -0
- cocoindex/sources/_engine_builtin_specs.py +120 -0
- cocoindex/subprocess_exec.py +277 -0
- cocoindex/targets/__init__.py +5 -0
- cocoindex/targets/_engine_builtin_specs.py +153 -0
- cocoindex/targets/lancedb.py +466 -0
- cocoindex/tests/__init__.py +0 -0
- cocoindex/tests/test_engine_object.py +331 -0
- cocoindex/tests/test_engine_value.py +1724 -0
- cocoindex/tests/test_optional_database.py +249 -0
- cocoindex/tests/test_transform_flow.py +300 -0
- cocoindex/tests/test_typing.py +553 -0
- cocoindex/tests/test_validation.py +134 -0
- cocoindex/typing.py +834 -0
- cocoindex/user_app_loader.py +53 -0
- cocoindex/utils.py +20 -0
- cocoindex/validation.py +104 -0
- cocoindex-0.3.4.dist-info/METADATA +288 -0
- cocoindex-0.3.4.dist-info/RECORD +42 -0
- cocoindex-0.3.4.dist-info/WHEEL +4 -0
- cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
- cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
cocoindex/functions/__init__.py
ADDED
@@ -0,0 +1,40 @@
+"""Functions module for cocoindex.
+
+This module provides various function specifications and executors for data processing,
+including embedding functions, text processing, and multimodal operations.
+"""
+
+# Import all engine builtin function specs
+from ._engine_builtin_specs import *
+
+# Import SentenceTransformer embedding functionality
+from .sbert import (
+    SentenceTransformerEmbed,
+    SentenceTransformerEmbedExecutor,
+)
+
+# Import ColPali multimodal embedding functionality
+from .colpali import (
+    ColPaliEmbedImage,
+    ColPaliEmbedImageExecutor,
+    ColPaliEmbedQuery,
+    ColPaliEmbedQueryExecutor,
+)
+
+__all__ = [
+    # Engine builtin specs
+    "DetectProgrammingLanguage",
+    "EmbedText",
+    "ExtractByLlm",
+    "ParseJson",
+    "SplitBySeparators",
+    "SplitRecursively",
+    # SentenceTransformer
+    "SentenceTransformerEmbed",
+    "SentenceTransformerEmbedExecutor",
+    # ColPali
+    "ColPaliEmbedImage",
+    "ColPaliEmbedImageExecutor",
+    "ColPaliEmbedQuery",
+    "ColPaliEmbedQueryExecutor",
+]
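Everything re-exported above is reachable directly from cocoindex.functions. A minimal sketch of constructing two of these specs (it assumes, as their dataclass-style fields suggest, that FunctionSpec subclasses accept their declared fields as keyword arguments; the model name is illustrative):

from cocoindex import functions

# Builtin spec whose only field has a default, so it constructs bare.
chunker = functions.SplitRecursively()

# Spec with a required field supplied as a keyword.
embedder = functions.SentenceTransformerEmbed(model="all-MiniLM-L6-v2")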
cocoindex/functions/_engine_builtin_specs.py
ADDED
@@ -0,0 +1,66 @@
+"""All builtin function specs."""
+
+import dataclasses
+from typing import Literal
+
+from .. import llm, op
+
+
+class ParseJson(op.FunctionSpec):
+    """Parse a text into a JSON object."""
+
+
+@dataclasses.dataclass
+class CustomLanguageSpec:
+    """Custom language specification."""
+
+    language_name: str
+    separators_regex: list[str]
+    aliases: list[str] = dataclasses.field(default_factory=list)
+
+
+class DetectProgrammingLanguage(op.FunctionSpec):
+    """Detect the programming language of a file."""
+
+
+class SplitRecursively(op.FunctionSpec):
+    """Split a document (as a string) recursively."""
+
+    custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
+
+
+class SplitBySeparators(op.FunctionSpec):
+    """
+    Split text by specified regex separators only.
+    Output schema matches SplitRecursively for drop-in compatibility:
+    KTable rows with fields: location (Range), text (Str), start, end.
+    Args:
+        separators_regex: list[str]  # e.g., [r"\\n\\n+"]
+        keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+        include_empty: bool = False
+        trim: bool = True
+    """
+
+    separators_regex: list[str] = dataclasses.field(default_factory=list)
+    keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
+    include_empty: bool = False
+    trim: bool = True
+
+
+class EmbedText(op.FunctionSpec):
+    """Embed a text into a vector space."""
+
+    api_type: llm.LlmApiType
+    model: str
+    address: str | None = None
+    output_dimension: int | None = None
+    task_type: str | None = None
+    api_config: llm.VertexAiConfig | None = None
+
+
+class ExtractByLlm(op.FunctionSpec):
+    """Extract information from a text using an LLM."""
+
+    llm_spec: llm.LlmSpec
+    output_type: type
+    instruction: str | None = None
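The paragraph-splitting case mentioned in the SplitBySeparators docstring would look like this (a sketch under the same keyword-construction assumption as above; values are illustrative):

from cocoindex.functions import SplitBySeparators

# Split on runs of blank lines, keep each separator attached to the
# preceding chunk, drop empty chunks, and trim whitespace.
spec = SplitBySeparators(
    separators_regex=[r"\n\n+"],
    keep_separator="LEFT",
    include_empty=False,
    trim=True,
)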
cocoindex/functions/colpali.py
ADDED
@@ -0,0 +1,247 @@
+"""ColPali image and query embedding functions for multimodal document retrieval."""
+
+import functools
+from dataclasses import dataclass
+from typing import Any, TYPE_CHECKING, Literal
+import numpy as np
+
+from .. import op
+from ..typing import Vector
+
+if TYPE_CHECKING:
+    import torch
+
+
+@dataclass
+class ColPaliModelInfo:
+    """Shared model information for ColPali embedding functions."""
+
+    model: Any
+    processor: Any
+    device: Any
+    dimension: int
+
+
+@functools.cache
+def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
+    """Load and cache ColPali model and processor with shared device setup."""
+    try:
+        import colpali_engine as ce  # type: ignore[import-untyped]
+        import torch
+    except ImportError as e:
+        raise ImportError(
+            "ColPali support requires the optional 'colpali' dependency. "
+            "Install it with: pip install 'cocoindex[colpali]'"
+        ) from e
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    lower_model_name = model_name.lower()
+
+    # Determine model type from name
+    if lower_model_name.startswith("colpali"):
+        model = ce.ColPali.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16, device_map=device
+        )
+        processor = ce.ColPaliProcessor.from_pretrained(model_name)
+    elif lower_model_name.startswith("colqwen2.5"):
+        model = ce.ColQwen2_5.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16, device_map=device
+        )
+        processor = ce.ColQwen2_5_Processor.from_pretrained(model_name)
+    elif lower_model_name.startswith("colqwen"):
+        model = ce.ColQwen2.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16, device_map=device
+        )
+        processor = ce.ColQwen2Processor.from_pretrained(model_name)
+    else:
+        # Fallback to ColPali for backwards compatibility
+        model = ce.ColPali.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16, device_map=device
+        )
+        processor = ce.ColPaliProcessor.from_pretrained(model_name)
+
+    # Detect dimension
+    dimension = _detect_colpali_dimension(model, processor, device)
+
+    return ColPaliModelInfo(
+        model=model,
+        processor=processor,
+        dimension=dimension,
+        device=device,
+    )
+
+
+def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
+    """Detect ColPali embedding dimension from the actual model config."""
+    # Try to access embedding dimension
+    if hasattr(model.config, "embedding_dim"):
+        dim = model.config.embedding_dim
+    else:
+        # Fallback: infer from output shape with dummy data
+        from PIL import Image
+        import numpy as np
+        import torch
+
+        dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
+        # Use the processor to process the dummy image
+        processed = processor.process_images([dummy_img]).to(device)
+        with torch.no_grad():
+            output = model(**processed)
+        dim = int(output.shape[-1])
+        if isinstance(dim, int):
+            return dim
+        else:
+            raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
+    return dim
+
+
+class ColPaliEmbedImage(op.FunctionSpec):
+    """
+    `ColPaliEmbedImage` embeds images using ColVision multimodal models.
+
+    Supports ALL models available in the colpali-engine library, including:
+    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
+    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
+    - Any future ColVision models supported by colpali-engine
+
+    These models use late interaction between image patch embeddings and text token
+    embeddings for retrieval.
+
+    Args:
+        model: Any ColVision model name supported by colpali-engine
+            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
+            See https://github.com/illuin-tech/colpali for the complete list of supported models.
+
+    Note:
+        This function requires the optional colpali-engine dependency.
+        Install it with: pip install 'cocoindex[colpali]'
+    """
+
+    model: str
+
+
+@op.executor_class(
+    gpu=True,
+    cache=True,
+    batching=True,
+    max_batch_size=32,
+    behavior_version=1,
+)
+class ColPaliEmbedImageExecutor:
+    """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
+
+    spec: ColPaliEmbedImage
+    _model_info: ColPaliModelInfo
+
+    def analyze(self) -> type:
+        # Get shared model and dimension
+        self._model_info = _get_colpali_model_and_processor(self.spec.model)
+
+        # Return multi-vector type: Variable patches x Fixed hidden dimension
+        dimension = self._model_info.dimension
+        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
+
+    def __call__(self, img_bytes_list: list[bytes]) -> Any:
+        try:
+            from PIL import Image
+            import torch
+            import io
+        except ImportError as e:
+            raise ImportError(
+                "Required dependencies (PIL, torch) are missing for ColVision image embedding."
+            ) from e
+
+        model = self._model_info.model
+        processor = self._model_info.processor
+        device = self._model_info.device
+
+        pil_images = [
+            Image.open(io.BytesIO(img_bytes)).convert("RGB")
+            for img_bytes in img_bytes_list
+        ]
+        inputs = processor.process_images(pil_images).to(device)
+        with torch.no_grad():
+            embeddings = model(**inputs)
+
+        # Return multi-vector format: [patches, hidden_dim]
+        if len(embeddings.shape) != 3:
+            raise ValueError(
+                f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
+            )
+
+        # [patches, hidden_dim]
+        return embeddings.cpu().to(torch.float32).numpy()
+
+
+class ColPaliEmbedQuery(op.FunctionSpec):
+    """
+    `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
+
+    Supports ALL models available in the colpali-engine library, including:
+    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
+    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
+    - Any future ColVision models supported by colpali-engine
+
+    This produces query embeddings compatible with ColVision image embeddings
+    for late interaction scoring (MaxSim).
+
+    Args:
+        model: Any ColVision model name supported by colpali-engine
+            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
+            See https://github.com/illuin-tech/colpali for the complete list of supported models.
+
+    Note:
+        This function requires the optional colpali-engine dependency.
+        Install it with: pip install 'cocoindex[colpali]'
+    """
+
+    model: str
+
+
+@op.executor_class(
+    gpu=True,
+    cache=True,
+    behavior_version=1,
+    batching=True,
+    max_batch_size=32,
+)
+class ColPaliEmbedQueryExecutor:
+    """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
+
+    spec: ColPaliEmbedQuery
+    _model_info: ColPaliModelInfo
+
+    def analyze(self) -> type:
+        # Get shared model and dimension
+        self._model_info = _get_colpali_model_and_processor(self.spec.model)
+
+        # Return multi-vector type: Variable tokens x Fixed hidden dimension
+        dimension = self._model_info.dimension
+        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
+
+    def __call__(self, queries: list[str]) -> Any:
+        try:
+            import torch
+        except ImportError as e:
+            raise ImportError(
+                "Required dependencies (torch) are missing for ColVision query embedding."
+            ) from e
+
+        model = self._model_info.model
+        processor = self._model_info.processor
+        device = self._model_info.device
+
+        inputs = processor.process_queries(queries).to(device)
+        with torch.no_grad():
+            embeddings = model(**inputs)
+
+        # Return multi-vector format: [tokens, hidden_dim]
+        if len(embeddings.shape) != 3:
+            raise ValueError(
+                f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
+            )
+
+        # [tokens, hidden_dim]
+        return embeddings.cpu().to(torch.float32).numpy()
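Both docstrings above refer to late interaction (MaxSim) scoring between the query token embeddings and the image patch embeddings these executors return. The scoring itself is not part of this file; the following standalone numpy sketch shows how MaxSim is conventionally computed from the two multi-vector outputs:

import numpy as np

def maxsim_score(query_emb: np.ndarray, image_emb: np.ndarray) -> float:
    """MaxSim: for each query token, take the similarity of its best-matching
    image patch, then sum over query tokens.

    query_emb: [tokens, hidden_dim]; image_emb: [patches, hidden_dim].
    Assumes rows are L2-normalized so the dot product is cosine similarity.
    """
    sim = query_emb @ image_emb.T        # [tokens, patches]
    return float(sim.max(axis=1).sum())  # best patch per token, summed

# Illustrative stand-in embeddings with hidden_dim = 128:
rng = np.random.default_rng(0)
q = rng.normal(size=(12, 128))
q /= np.linalg.norm(q, axis=1, keepdims=True)
d = rng.normal(size=(800, 128))
d /= np.linalg.norm(d, axis=1, keepdims=True)
print(maxsim_score(q, d))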
cocoindex/functions/sbert.py
ADDED
@@ -0,0 +1,77 @@
+"""SentenceTransformer embedding functionality."""
+
+from typing import Any, Literal, cast
+
+import numpy as np
+from numpy.typing import NDArray
+
+from .. import op
+from ..typing import Vector
+
+
+class SentenceTransformerEmbed(op.FunctionSpec):
+    """
+    `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
+
+    Args:
+
+        model: The name of the SentenceTransformer model to use.
+        args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True}
+
+    Note:
+        This function requires the optional sentence-transformers dependency.
+        Install it with: pip install 'cocoindex[embeddings]'
+    """
+
+    model: str
+    args: dict[str, Any] | None = None
+
+
+@op.executor_class(
+    gpu=True,
+    cache=True,
+    batching=True,
+    max_batch_size=512,
+    behavior_version=1,
+    arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
+)
+class SentenceTransformerEmbedExecutor:
+    """Executor for SentenceTransformerEmbed."""
+
+    spec: SentenceTransformerEmbed
+    _model: Any | None = None
+
+    def analyze(self) -> type:
+        try:
+            # Only import sentence_transformers locally when it's needed, as its import is very slow.
+            import sentence_transformers  # pylint: disable=import-outside-toplevel
+        except ImportError as e:
+            raise ImportError(
+                "sentence_transformers is required for SentenceTransformerEmbed function. "
+                "Install it with one of these commands:\n"
+                "  pip install 'cocoindex[embeddings]'\n"
+                "  pip install sentence-transformers"
+            ) from e
+
+        args = self.spec.args or {}
+        self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
+        dim = self._model.get_sentence_embedding_dimension()
+        return Vector[np.float32, Literal[dim]]  # type: ignore
+
+    def __call__(self, text: list[str]) -> list[NDArray[np.float32]]:
+        assert self._model is not None
+
+        # Sort the text by length to minimize the number of padding tokens.
+        text_with_idx = [(idx, t) for idx, t in enumerate(text)]
+        text_with_idx.sort(key=lambda x: len(x[1]))
+
+        results: list[NDArray[np.float32]] = self._model.encode(
+            [t for _, t in text_with_idx], convert_to_numpy=True
+        )
+        final_results: list[NDArray[np.float32] | None] = [
+            None for _ in range(len(text))
+        ]
+        for (idx, _), result in zip(text_with_idx, results):
+            final_results[idx] = result
+
+        return cast(list[NDArray[np.float32]], final_results)
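The length-sorted encode in __call__ is a padding optimization: batching similar-length texts together reduces the padding tokens the tokenizer must add, and the index bookkeeping restores caller order afterwards. The same pattern in isolation (illustrative, with a toy stand-in for the encoder):

def encode_in_original_order(texts, encode_batch):
    """Feed the encoder length-sorted inputs, then restore caller order.

    encode_batch is any batch function whose i-th output corresponds
    to its i-th input.
    """
    order = sorted(range(len(texts)), key=lambda i: len(texts[i]))
    encoded = encode_batch([texts[i] for i in order])
    out = [None] * len(texts)
    for pos, i in enumerate(order):
        out[i] = encoded[pos]
    return out

print(encode_in_original_order(["bb", "a", "ccc"], lambda xs: [x.upper() for x in xs]))
# -> ['BB', 'A', 'CCC']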
cocoindex/index.py
ADDED
@@ -0,0 +1,50 @@
+from enum import Enum
+from dataclasses import dataclass
+from typing import Sequence, Union
+
+
+class VectorSimilarityMetric(Enum):
+    COSINE_SIMILARITY = "CosineSimilarity"
+    L2_DISTANCE = "L2Distance"
+    INNER_PRODUCT = "InnerProduct"
+
+
+@dataclass
+class HnswVectorIndexMethod:
+    """HNSW vector index parameters."""
+
+    kind: str = "Hnsw"
+    m: int | None = None
+    ef_construction: int | None = None
+
+
+@dataclass
+class IvfFlatVectorIndexMethod:
+    """IVFFlat vector index parameters."""
+
+    kind: str = "IvfFlat"
+    lists: int | None = None
+
+
+VectorIndexMethod = Union[HnswVectorIndexMethod, IvfFlatVectorIndexMethod]
+
+
+@dataclass
+class VectorIndexDef:
+    """
+    Define a vector index on a field.
+    """
+
+    field_name: str
+    metric: VectorSimilarityMetric
+    method: VectorIndexMethod | None = None
+
+
+@dataclass
+class IndexOptions:
+    """
+    Options for an index.
+    """
+
+    primary_key_fields: Sequence[str]
+    vector_indexes: Sequence[VectorIndexDef] = ()
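Wiring these dataclasses together looks like the following sketch (field values are illustrative, not recommendations):

from cocoindex.index import (
    HnswVectorIndexMethod,
    IndexOptions,
    VectorIndexDef,
    VectorSimilarityMetric,
)

# Primary key on "id", plus an HNSW cosine index on the "embedding" field.
options = IndexOptions(
    primary_key_fields=["id"],
    vector_indexes=[
        VectorIndexDef(
            field_name="embedding",
            metric=VectorSimilarityMetric.COSINE_SIMILARITY,
            method=HnswVectorIndexMethod(m=16, ef_construction=64),
        )
    ],
)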
cocoindex/lib.py
ADDED
@@ -0,0 +1,75 @@
+"""
+Library level functions and states.
+"""
+
+import threading
+import warnings
+
+from . import _engine  # type: ignore
+from . import flow, setting
+from .engine_object import dump_engine_object
+from .validation import validate_app_namespace_name
+from typing import Any, Callable, overload
+
+
+def prepare_settings(settings: setting.Settings) -> Any:
+    """Prepare the settings for the engine."""
+    if settings.app_namespace:
+        validate_app_namespace_name(settings.app_namespace)
+    return dump_engine_object(settings)
+
+
+_engine.set_settings_fn(lambda: prepare_settings(setting.Settings.from_env()))
+
+
+_prev_settings_fn: Callable[[], setting.Settings] | None = None
+_prev_settings_fn_lock: threading.Lock = threading.Lock()
+
+
+@overload
+def settings(fn: Callable[[], setting.Settings]) -> Callable[[], setting.Settings]: ...
+@overload
+def settings(
+    fn: None,
+) -> Callable[[Callable[[], setting.Settings]], Callable[[], setting.Settings]]: ...
+def settings(fn: Callable[[], setting.Settings] | None = None) -> Any:
+    """
+    Decorate a function that returns a settings.Settings object.
+    It registers the function as a settings provider.
+    """
+
+    def _inner(fn: Callable[[], setting.Settings]) -> Callable[[], setting.Settings]:
+        global _prev_settings_fn  # pylint: disable=global-statement
+        with _prev_settings_fn_lock:
+            if _prev_settings_fn is not None:
+                warnings.warn(
+                    f"Setting a new settings function will override the previous one {_prev_settings_fn}."
+                )
+            _prev_settings_fn = fn
+            _engine.set_settings_fn(lambda: prepare_settings(fn()))
+        return fn
+
+    if fn is not None:
+        return _inner(fn)
+    else:
+        return _inner
+
+
+def init(settings: setting.Settings | None = None) -> None:
+    """
+    Initialize the cocoindex library.
+
+    If the settings are not provided, they are loaded from the environment variables.
+    """
+    _engine.init(prepare_settings(settings) if settings is not None else None)
+
+
+def start_server(settings: setting.ServerSettings) -> None:
+    """Start the cocoindex server."""
+    flow.ensure_all_flows_built()
+    _engine.start_server(settings.__dict__)
+
+
+def stop() -> None:
+    """Stop the cocoindex library."""
+    _engine.stop()
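A brief sketch of the settings-provider hook from application code (it relies only on names defined in this file plus setting.Settings.from_env(), which the module itself calls above):

from cocoindex import lib, setting

@lib.settings
def my_settings() -> setting.Settings:
    # Registered as the provider; called to produce engine settings.
    return setting.Settings.from_env()

# Assumption: init() with no explicit settings lets the engine resolve
# them through the registered provider.
lib.init()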
cocoindex/llm.py
ADDED
@@ -0,0 +1,47 @@
+from dataclasses import dataclass
+from enum import Enum
+
+
+class LlmApiType(Enum):
+    """The type of LLM API to use."""
+
+    OPENAI = "OpenAi"
+    OLLAMA = "Ollama"
+    GEMINI = "Gemini"
+    VERTEX_AI = "VertexAi"
+    ANTHROPIC = "Anthropic"
+    LITE_LLM = "LiteLlm"
+    OPEN_ROUTER = "OpenRouter"
+    VOYAGE = "Voyage"
+    VLLM = "Vllm"
+    BEDROCK = "Bedrock"
+
+
+@dataclass
+class VertexAiConfig:
+    """A specification for a Vertex AI LLM."""
+
+    kind = "VertexAi"
+
+    project: str
+    region: str | None = None
+
+
+@dataclass
+class OpenAiConfig:
+    """A specification for an OpenAI LLM."""
+
+    kind = "OpenAi"
+
+    org_id: str | None = None
+    project_id: str | None = None
+
+
+@dataclass
+class LlmSpec:
+    """A specification for an LLM."""
+
+    api_type: LlmApiType
+    model: str
+    address: str | None = None
+    api_config: VertexAiConfig | OpenAiConfig | None = None
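These specs plug into the function specs defined earlier (e.g., ExtractByLlm takes an LlmSpec, and EmbedText takes the same api_type/model fields). A sketch with illustrative model names:

from cocoindex.llm import LlmApiType, LlmSpec, VertexAiConfig

# Plain OpenAI-backed spec; address is optional and defaults to None.
openai_spec = LlmSpec(api_type=LlmApiType.OPENAI, model="gpt-4o-mini")

# Vertex AI carries its provider-specific config.
vertex_spec = LlmSpec(
    api_type=LlmApiType.VERTEX_AI,
    model="gemini-1.5-flash",
    api_config=VertexAiConfig(project="my-gcp-project", region="us-central1"),
)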