PyPI - cocoindex - Versions diffs - 0.2.19__cp311-abi3-macosx_10_12_x86_64.whl → 0.2.21__cp311-abi3-macosx_10_12_x86_64.whl - Mend

cocoindex 0.2.19__cp311-abi3-macosx_10_12_x86_64.whl → 0.2.21__cp311-abi3-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cocoindex might be problematic. Click here for more details.

Files changed (13)

cocoindex/_engine.abi3.so +0 -0
cocoindex/cli.py +1 -3
cocoindex/engine_value.py +15 -13
cocoindex/flow.py +4 -4
cocoindex/llm.py +1 -0
cocoindex/op.py +262 -5
cocoindex/typing.py +6 -6
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/METADATA +15 -8
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/RECORD +12 -13
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/licenses/THIRD_PARTY_NOTICES.html +33 -27
cocoindex/functions.py +0 -375
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/WHEEL +0 -0
{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/entry_points.txt +0 -0

cocoindex/_engine.abi3.so CHANGED Viewed

Binary file

cocoindex/cli.py CHANGED Viewed

@@ -84,9 +84,7 @@ def _load_user_app(app_target: str) -> None:
     try:
         load_user_app(app_target)
     except UserAppLoaderError as e:
-        raise click.ClickException(
-            f"Failed to load APP_TARGET '{app_target}': {e}"
-        ) from e
+        raise ValueError(f"Failed to load APP_TARGET '{app_target}'") from e
     add_user_app(app_target)

cocoindex/engine_value.py CHANGED Viewed

@@ -70,6 +70,17 @@ def _is_type_kind_convertible_to(src_type_kind: str, dst_type_kind: str) -> bool
 ANY_TYPE_INFO = analyze_type_info(inspect.Parameter.empty)
+def make_engine_key_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
+    """
+    Create an encoder closure for a key type.
+    """
+    value_encoder = make_engine_value_encoder(type_info)
+    if isinstance(type_info.variant, AnalyzedBasicType):
+        return lambda value: [value_encoder(value)]
+    else:
+        return value_encoder
 def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
     """
     Create an encoder closure for a specific type.
@@ -94,6 +105,9 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], An
         # Otherwise it's a vector, falling into basic type in the engine.
     if isinstance(variant, AnalyzedDictType):
+        key_type_info = analyze_type_info(variant.key_type)
+        key_encoder = make_engine_key_encoder(key_type_info)
         value_type_info = analyze_type_info(variant.value_type)
         if not isinstance(value_type_info.variant, AnalyzedStructType):
             raise ValueError(
@@ -102,22 +116,10 @@ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], An
             )
         value_encoder = make_engine_value_encoder(value_type_info)
-        key_type_info = analyze_type_info(variant.key_type)
-        key_encoder = make_engine_value_encoder(key_type_info)
-        if isinstance(key_type_info.variant, AnalyzedBasicType):
-            def encode_row(k: Any, v: Any) -> Any:
-                return [key_encoder(k)] + value_encoder(v)
-        else:
-            def encode_row(k: Any, v: Any) -> Any:
-                return key_encoder(k) + value_encoder(v)
         def encode_struct_dict(value: Any) -> Any:
             if not value:
                 return []
-            return [encode_row(k, v) for k, v in value.items()]
+            return [key_encoder(k) + value_encoder(v) for k, v in value.items()]
         return encode_struct_dict

cocoindex/flow.py CHANGED Viewed

@@ -459,7 +459,9 @@ class _FlowBuilderState:
     field_name_builder: _NameBuilder
     def __init__(self, full_name: str):
-        self.engine_flow_builder = _engine.FlowBuilder(full_name)
+        self.engine_flow_builder = _engine.FlowBuilder(
+            full_name, execution_context.event_loop
+        )
         self.field_name_builder = _NameBuilder()
     def get_data_slice(self, v: Any) -> _engine.DataSlice:
@@ -931,9 +933,7 @@ def _create_lazy_flow(
             flow_builder_state, flow_builder_state.engine_flow_builder.root_scope()
         )
         fl_def(FlowBuilder(flow_builder_state), root_scope)
-        return flow_builder_state.engine_flow_builder.build_flow(
-            execution_context.event_loop
-        )
+        return flow_builder_state.engine_flow_builder.build_flow()
     return Flow(flow_name, _create_engine_flow)

cocoindex/llm.py CHANGED Viewed

@@ -14,6 +14,7 @@ class LlmApiType(Enum):
     OPEN_ROUTER = "OpenRouter"
     VOYAGE = "Voyage"
     VLLM = "Vllm"
+    BEDROCK = "Bedrock"
 @dataclass

cocoindex/op.py CHANGED Viewed

@@ -9,22 +9,33 @@ from typing import (
     Any,
     Awaitable,
     Callable,
+    Iterator,
     Protocol,
     dataclass_transform,
     Annotated,
+    TypeVar,
+    Generic,
+    Literal,
     get_args,
 )
+from collections.abc import AsyncIterator
 from . import _engine  # type: ignore
 from .subprocess_exec import executor_stub
 from .engine_object import dump_engine_object, load_engine_object
 from .engine_value import (
+    make_engine_key_encoder,
     make_engine_value_encoder,
     make_engine_value_decoder,
     make_engine_key_decoder,
     make_engine_struct_decoder,
 )
 from .typing import (
+    KEY_FIELD_NAME,
+    AnalyzedTypeInfo,
+    StructSchema,
+    StructType,
+    TableType,
     TypeAttr,
     encode_enriched_type_info,
     resolve_forward_ref,
@@ -96,12 +107,12 @@ class Executor(Protocol):
     op_category: OpCategory
-def _get_required_method(cls: type, name: str) -> Callable[..., Any]:
-    method = getattr(cls, name, None)
+def _get_required_method(obj: type, name: str) -> Callable[..., Any]:
+    method = getattr(obj, name, None)
     if method is None:
-        raise ValueError(f"Method {name}() is required for {cls.__name__}")
-    if not inspect.isfunction(method):
-        raise ValueError(f"Method {cls.__name__}.{name}() is not a function")
+        raise ValueError(f"Method {name}() is required for {obj}")
+    if not inspect.isfunction(method) and not inspect.ismethod(method):
+        raise ValueError(f"{obj}.{name}() is not a function; {method}")
     return method
@@ -421,6 +432,252 @@ def function(**args: Any) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
     return _inner
+########################################################
+# Custom source connector
+########################################################
+@dataclasses.dataclass
+class SourceReadOptions:
+    """
+    The options for reading a source row.
+    This is argument for both `list()` and `get_value()` methods.
+    Note that in most cases (unless spelled out otherwise below) it's not a mandatory requirement, but more like a hint to say it's useful under the current context.
+    - include_ordinal: Whether to include the ordinal of the source row.
+      When provides_ordinal() returns True, you must provide `ordinal` in `list()` when `include_ordinal` is True.
+      It's optional for other cases. It's helpful to skip unnecessary reprocessing early, and avoid output from older version of input over-writing the latest one when there's concurrency (especially multiple processes) and source updates frequently.
+    - include_content_version_fp: Whether to include the content version fingerprint of the source row.
+      It's always optional even if this is True.
+      It's helpful to skip unnecessary reprocessing early.
+      You should only consider providing it if you can directly get it without computing the hash on the content.
+    - include_value: Whether to include the value of the source row.
+      You must provide it in `get_value()` when `include_value` is True.
+      It's optional for `list()`.
+      Consider providing it when it's significantly cheaper then calling another `get_value()` for each row.
+      It will save costs of individual `get_value()` calls.
+    """
+    include_ordinal: bool = False
+    include_content_version_fp: bool = False
+    include_value: bool = False
+K = TypeVar("K")
+V = TypeVar("V")
+NON_EXISTENCE: Literal["NON_EXISTENCE"] = "NON_EXISTENCE"
+NO_ORDINAL: Literal["NO_ORDINAL"] = "NO_ORDINAL"
+@dataclasses.dataclass
+class PartialSourceRowData(Generic[V]):
+    """
+    The data of a source row.
+    - value: The value of the source row. NON_EXISTENCE means the row does not exist.
+    - ordinal: The ordinal of the source row. NO_ORDINAL means ordinal is not available for the source.
+    - content_version_fp: The content version fingerprint of the source row.
+    """
+    value: V | Literal["NON_EXISTENCE"] | None = None
+    ordinal: int | Literal["NO_ORDINAL"] | None = None
+    content_version_fp: bytes | None = None
+@dataclasses.dataclass
+class PartialSourceRow(Generic[K, V]):
+    key: K
+    data: PartialSourceRowData[V]
+class _SourceExecutorContext:
+    _executor: Any
+    _key_encoder: Callable[[Any], Any]
+    _key_decoder: Callable[[Any], Any]
+    _value_encoder: Callable[[Any], Any]
+    _list_fn: Callable[
+        [SourceReadOptions],
+        AsyncIterator[PartialSourceRow[Any, Any]]
+        | Iterator[PartialSourceRow[Any, Any]],
+    ]
+    _orig_get_value_fn: Callable[..., Any]
+    _get_value_fn: Callable[..., Awaitable[PartialSourceRowData[Any]]]
+    _provides_ordinal_fn: Callable[[], bool] | None
+    def __init__(
+        self,
+        executor: Any,
+        key_type_info: AnalyzedTypeInfo,
+        key_decoder: Callable[[Any], Any],
+        value_type_info: AnalyzedTypeInfo,
+    ):
+        self._executor = executor
+        self._key_encoder = make_engine_key_encoder(key_type_info)
+        self._key_decoder = key_decoder
+        self._value_encoder = make_engine_value_encoder(value_type_info)
+        self._list_fn = _get_required_method(executor, "list")
+        self._orig_get_value_fn = _get_required_method(executor, "get_value")
+        self._get_value_fn = to_async_call(self._orig_get_value_fn)
+        self._provides_ordinal_fn = getattr(executor, "provides_ordinal", None)
+    def provides_ordinal(self) -> bool:
+        if self._provides_ordinal_fn is not None:
+            result = self._provides_ordinal_fn()
+            return bool(result)
+        else:
+            return False
+    async def list_async(
+        self, options: dict[str, Any]
+    ) -> AsyncIterator[tuple[Any, dict[str, Any]]]:
+        """
+        Return an async iterator that yields individual rows one by one.
+        Each yielded item is a tuple of (key, data).
+        """
+        read_options = load_engine_object(SourceReadOptions, options)
+        args = _build_args(self._list_fn, 0, options=read_options)
+        list_result = self._list_fn(*args)
+        # Handle both sync and async iterators
+        if hasattr(list_result, "__aiter__"):
+            async for partial_row in list_result:
+                yield (
+                    self._key_encoder(partial_row.key),
+                    self._encode_source_row_data(partial_row.data),
+                )
+        else:
+            for partial_row in list_result:
+                yield (
+                    self._key_encoder(partial_row.key),
+                    self._encode_source_row_data(partial_row.data),
+                )
+    async def get_value_async(
+        self,
+        raw_key: Any,
+        options: dict[str, Any],
+    ) -> dict[str, Any]:
+        key = self._key_decoder(raw_key)
+        read_options = load_engine_object(SourceReadOptions, options)
+        args = _build_args(self._orig_get_value_fn, 1, key=key, options=read_options)
+        row_data = await self._get_value_fn(*args)
+        return self._encode_source_row_data(row_data)
+    def _encode_source_row_data(
+        self, row_data: PartialSourceRowData[Any]
+    ) -> dict[str, Any]:
+        """Convert Python PartialSourceRowData to the format expected by Rust."""
+        return {
+            "ordinal": row_data.ordinal,
+            "content_version_fp": row_data.content_version_fp,
+            "value": (
+                NON_EXISTENCE
+                if row_data.value == NON_EXISTENCE
+                else self._value_encoder(row_data.value)
+            ),
+        }
+class _SourceConnector:
+    """
+    The connector class passed to the engine.
+    """
+    _spec_cls: type[Any]
+    _key_type_info: AnalyzedTypeInfo
+    _key_decoder: Callable[[Any], Any]
+    _value_type_info: AnalyzedTypeInfo
+    _table_type: EnrichedValueType
+    _connector_cls: type[Any]
+    _create_fn: Callable[[Any], Awaitable[Any]]
+    def __init__(
+        self,
+        spec_cls: type[Any],
+        key_type: Any,
+        value_type: Any,
+        connector_cls: type[Any],
+    ):
+        self._spec_cls = spec_cls
+        self._key_type_info = analyze_type_info(key_type)
+        self._value_type_info = analyze_type_info(value_type)
+        self._connector_cls = connector_cls
+        # TODO: We can save the intermediate step after #1083 is fixed.
+        encoded_engine_key_type = encode_enriched_type_info(self._key_type_info)
+        engine_key_type = EnrichedValueType.decode(encoded_engine_key_type)
+        # TODO: We can save the intermediate step after #1083 is fixed.
+        encoded_engine_value_type = encode_enriched_type_info(self._value_type_info)
+        engine_value_type = EnrichedValueType.decode(encoded_engine_value_type)
+        if not isinstance(engine_value_type.type, StructType):
+            raise ValueError(f"Expected a StructType, got {engine_value_type.type}")
+        if isinstance(engine_key_type.type, StructType):
+            key_fields_schema = engine_key_type.type.fields
+        else:
+            key_fields_schema = [
+                FieldSchema(name=KEY_FIELD_NAME, value_type=engine_key_type)
+            ]
+        self._key_decoder = make_engine_key_decoder(
+            [], key_fields_schema, self._key_type_info
+        )
+        self._table_type = EnrichedValueType(
+            type=TableType(
+                kind="KTable",
+                row=StructSchema(
+                    fields=key_fields_schema + engine_value_type.type.fields
+                ),
+                num_key_parts=len(key_fields_schema),
+            ),
+        )
+        self._create_fn = to_async_call(_get_required_method(connector_cls, "create"))
+    async def create_executor(self, raw_spec: dict[str, Any]) -> _SourceExecutorContext:
+        spec = load_engine_object(self._spec_cls, raw_spec)
+        executor = await self._create_fn(spec)
+        return _SourceExecutorContext(
+            executor, self._key_type_info, self._key_decoder, self._value_type_info
+        )
+    def get_table_type(self) -> Any:
+        return dump_engine_object(self._table_type)
+def source_connector(
+    *,
+    spec_cls: type[Any],
+    key_type: Any = Any,
+    value_type: Any = Any,
+) -> Callable[[type], type]:
+    """
+    Decorate a class to provide a source connector for an op.
+    """
+    # Validate the spec_cls is a SourceSpec.
+    if not issubclass(spec_cls, SourceSpec):
+        raise ValueError(f"Expect a SourceSpec, got {spec_cls}")
+    # Register the source connector.
+    def _inner(connector_cls: type) -> type:
+        connector = _SourceConnector(spec_cls, key_type, value_type, connector_cls)
+        _engine.register_source_connector(spec_cls.__name__, connector)
+        return connector_cls
+    return _inner
 ########################################################
 # Custom target connector
 ########################################################

cocoindex/typing.py CHANGED Viewed

@@ -475,16 +475,16 @@ def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
         }
-def encode_enriched_type_info(enriched_type_info: AnalyzedTypeInfo) -> dict[str, Any]:
+def encode_enriched_type_info(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
     """
-    Encode an enriched type info to a CocoIndex engine's type representation
+    Encode an `AnalyzedTypeInfo` to a CocoIndex engine's `EnrichedValueType` representation
     """
-    encoded: dict[str, Any] = {"type": _encode_type(enriched_type_info)}
+    encoded: dict[str, Any] = {"type": _encode_type(type_info)}
-    if enriched_type_info.attrs is not None:
-        encoded["attrs"] = enriched_type_info.attrs
+    if type_info.attrs is not None:
+        encoded["attrs"] = type_info.attrs
-    if enriched_type_info.nullable:
+    if type_info.nullable:
         encoded["nullable"] = True
     return encoded

{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cocoindex
-Version: 0.2.19
+Version: 0.2.21
 Classifier: Development Status :: 3 - Alpha
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
@@ -75,7 +75,6 @@ Project-URL: Homepage, https://cocoindex.io/
     <a href="https://trendshift.io/repositories/13939" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13939" alt="cocoindex-io%2Fcocoindex | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </div>
 Ultra performant data transformation framework for AI, with core engine written in Rust. Support incremental processing and data lineage out-of-box.  Exceptional developer velocity. Production-ready at day 0.
 ⭐ Drop a star to help us grow!
@@ -113,9 +112,8 @@ CocoIndex makes it effortless to transform data with AI, and keep source data an
 </br>
 ## Exceptional velocity
 Just declare transformation in dataflow with ~100 lines of python
 ```python
@@ -139,6 +137,7 @@ CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_
 **Particularly**, developers don't explicitly mutate data by creating, updating and deleting. They just need to define transformation/formula for a set of source data.
 ## Plug-and-Play Building Blocks
 Native builtins for different source, targets and transformations. Standardize interface, make it 1-line code switch between different components - as easy as assembling building blocks.
 <p align="center">
@@ -146,6 +145,7 @@ Native builtins for different source, targets and transformations. Standardize i
 </p>
 ## Data Freshness
 CocoIndex keep source data and target in sync effortlessly.
 <p align="center">
@@ -153,11 +153,14 @@ CocoIndex keep source data and target in sync effortlessly.
 </p>
 It has out-of-box support for incremental indexing:
 - minimal recomputation on source or logic change.
 - (re-)processing necessary portions; reuse cache when possible
-## Quick Start:
+## Quick Start
 If you're new to CocoIndex, we recommend checking out
 - 📖 [Documentation](https://cocoindex.io/docs)
 - ⚡  [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart)
 - 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
@@ -172,7 +175,6 @@ pip install -U cocoindex
 2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. CocoIndex uses it for incremental processing.
 ## Define data flow
 Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow. An example flow looks like:
@@ -228,6 +230,7 @@ It defines an index flow like this:
 | [Text Embedding](examples/text_embedding) | Index text documents with embeddings for semantic search |
 | [Code Embedding](examples/code_embedding) | Index code embeddings for semantic search |
 | [PDF Embedding](examples/pdf_embedding) | Parse PDF and index text embeddings for semantic search |
+| [PDF Elements Embedding](examples/pdf_elements_embedding) | Extract text and images from PDFs; embed text with SentenceTransformers and images with CLIP; store in Qdrant for multimodal search |
 | [Manuals LLM Extraction](examples/manuals_llm_extraction) | Extract structured information from a manual using LLM |
 | [Amazon S3 Embedding](examples/amazon_s3_embedding) | Index text documents from Amazon S3 |
 | [Azure Blob Storage Embedding](examples/azure_blob_embedding) | Index text documents from Azure Blob Storage |
@@ -244,16 +247,18 @@ It defines an index flow like this:
 | [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
 | [Patient intake form extraction](examples/patient_intake_extraction) | Use LLM to extract structured data from patient intake forms with different formats |
 More coming and stay tuned 👀!
 ## 📖 Documentation
 For detailed documentation, visit [CocoIndex Documentation](https://cocoindex.io/docs), including a [Quickstart guide](https://cocoindex.io/docs/getting_started/quickstart).
 ## 🤝 Contributing
 We love contributions from our community ❤️. For details on contributing or running the project for development, check out our [contributing guide](https://cocoindex.io/docs/about/contributing).
 ## 👥 Community
 Welcome with a huge coconut hug 🥥⋆｡˚🤗. We are super excited for community contributions of all kinds - whether it's code improvements, documentation updates, issue reports, feature requests, and discussions in our Discord.
 Join our community here:
@@ -263,9 +268,11 @@ Join our community here:
 - ▶️ [Subscribe to our YouTube channel](https://www.youtube.com/@cocoindex-io)
 - 📜 [Read our blog posts](https://cocoindex.io/blogs/)
-## Support us:
+## Support us
 We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ at GitHub repo [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
 ## License
 CocoIndex is Apache 2.0 licensed.

{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/RECORD RENAMED Viewed

@@ -1,23 +1,22 @@
-cocoindex-0.2.19.dist-info/METADATA,sha256=z-JkcBlnCXXhcn7yDzAi2OFeeyUKmozPm-p12261vU8,13444
-cocoindex-0.2.19.dist-info/WHEEL,sha256=eDlp9unULyyDxD2Zd14qZwSC_Y-kO5nuRBdlMsoCXEY,105
-cocoindex-0.2.19.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
-cocoindex-0.2.19.dist-info/licenses/THIRD_PARTY_NOTICES.html,sha256=FpJNEsnvjyAB1qtTee_XMRM_2Thi2mbI0mfm-AVM9Ag,719620
+cocoindex-0.2.21.dist-info/METADATA,sha256=dmcpI0cptqzSRSMvU4SdifiOithkSaYcgIPZCEBp2Mk,13644
+cocoindex-0.2.21.dist-info/WHEEL,sha256=eDlp9unULyyDxD2Zd14qZwSC_Y-kO5nuRBdlMsoCXEY,105
+cocoindex-0.2.21.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
+cocoindex-0.2.21.dist-info/licenses/THIRD_PARTY_NOTICES.html,sha256=_rafrUNfyvjjCzp4oqD9ti424cq-OkzpGbOW9RFmwls,719655
 cocoindex/__init__.py,sha256=6qZWVkK4WZ01BIAg3CPh_bRRdA6Clk4d4Q6OnZ2jFa4,2630
-cocoindex/_engine.abi3.so,sha256=g0xBUi6nEZZsrskMfyHPHfLPkEefxznFgBk94CWt6P8,73872996
+cocoindex/_engine.abi3.so,sha256=QE0mlwKln9iI462G41R-VFfRHlyocabZipLD5ji6Mb8,69615752
 cocoindex/auth_registry.py,sha256=g-uLDWLYW5NMbYe7q4Y-sU5dSyrlJXBEciyWtAiP9KE,1340
-cocoindex/cli.py,sha256=19IszBXOzqGn0xOV1SaS-oR9NupTmIm18uzFNET7NTQ,23978
+cocoindex/cli.py,sha256=vk_YtGMPXTuu1U4J_VxzjWfTFv8Fu3tdyaVocpoxb5g,23941
 cocoindex/engine_object.py,sha256=5YTuWoR3WILhyt3PW-d9es3MAas_xD6tZZqvipN-sjg,10050
-cocoindex/engine_value.py,sha256=8M7MbwVG2bfd3kFptGGbQHBAp9pD3TVjrBiBDOAhD5M,23211
-cocoindex/flow.py,sha256=JWPTR2G6TdPJkO5ZlrCcyDyQ8utUS4zZWNR8zsHTeW8,40074
-cocoindex/functions.py,sha256=V4ljBnCprvA25XlCVvNLwK5ergXiEcKU76jkOGC-X3A,12882
+cocoindex/engine_value.py,sha256=WJw8ymYAqF2CCyg9SBiQzx8z9bl7XNVuD6ffgYvRRWQ,23277
+cocoindex/flow.py,sha256=xDz3rOo4RhbboknvC-KnbWq8RBykEO0YsjGSBfXqIEg,40076
 cocoindex/functions/__init__.py,sha256=V2IF4h-Cqq4OD_GN3Oqdry-FArORyRCKmqJ7g5UlJr8,1021
 cocoindex/functions/_engine_builtin_specs.py,sha256=WpCGrjUfJBa8xZP5JiEmA8kLu7fp9Rcs7ynpuJmvSGg,1786
 cocoindex/functions/colpali.py,sha256=oACyG3qG2dquyCJ6bT7FkMkua5rXDLSxnOHcgoz9waU,8865
 cocoindex/functions/sbert.py,sha256=1z5OJT-blXT6tVN5vEvEzvYAzOnzs1RCnu1UbCUP6wM,2162
 cocoindex/index.py,sha256=tz5ilvmOp0BtroGehCQDqWK_pIX9m6ghkhcxsDVU8WE,982
 cocoindex/lib.py,sha256=spfdU4IbzdffHyGdrQPIw_qGo9aX0OAAboqsjj8bTiQ,2290
-cocoindex/llm.py,sha256=Pv_cdnRngTLtuLU9AUmS8izIHhcKVnuBNolC33f9BDI,851
-cocoindex/op.py,sha256=Ycvr6lJf7hcCCjYUqHtXZqzSeDD-FQdP3_jcmZUV_zI,26896
+cocoindex/llm.py,sha256=8ZdJhOmhdb2xEcCxk6rDpnj6hlhCyFBmJdhCNMqAOP4,875
+cocoindex/op.py,sha256=TO-ETk3qXgnNS51NlWuLrOw_TfQ2mw83-_iswqULcQI,36095
 cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cocoindex/query_handler.py,sha256=X-SQT71LHiOOXn6-TJlQcGodJk-iT8p_1TcIMvRLBRI,1344
 cocoindex/runtime.py,sha256=4NxcltaDZvA3RR3Pnt6gH_f99jcWSyMH_1Xi5BjbtwY,1342
@@ -36,8 +35,8 @@ cocoindex/tests/test_optional_database.py,sha256=snAmkNa6wtOSaxoZE1HgjvL5v_ylitt
 cocoindex/tests/test_transform_flow.py,sha256=G69w-n-vnCTo3r9hVIk2lJNAQEkGUA7PZfHsXna3oS0,6030
 cocoindex/tests/test_typing.py,sha256=JoR-oMK-ZWjOGQi0pH5Etg5jp4oL_JSIreGBH247GCg,16291
 cocoindex/tests/test_validation.py,sha256=X6AQzVs-hVKIXcrHMEMQnhfUE8at7iXQnPq8nHNhZ2Q,4543
-cocoindex/typing.py,sha256=so_RusbhBmg_uLoZTY7W_pqU0aIJwFarkTF5NQufl4o,23944
+cocoindex/typing.py,sha256=qQj5uM6XAKHzRJ2BIEs7X-xeOXVcM9p_xz5SVqPVvS8,23914
 cocoindex/user_app_loader.py,sha256=bc3Af-gYRxJ9GpObtpjegZY855oQBCv5FGkrkWV2yGY,1873
 cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
 cocoindex/validation.py,sha256=PZnJoby4sLbsmPv9fOjOQXuefjfZ7gmtsiTGU8SH-tc,3090
-cocoindex-0.2.19.dist-info/RECORD,,
+cocoindex-0.2.21.dist-info/RECORD,,

{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/licenses/THIRD_PARTY_NOTICES.html RENAMED Viewed

@@ -2428,7 +2428,7 @@ Software.
                 <h3 id="Apache-2.0">Apache License 2.0</h3>
                 <h4>Used by:</h4>
                 <ul class="license-used-by">
-                    <li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.19</a></li>
+                    <li><a href=" https://crates.io/crates/cocoindex ">cocoindex 0.2.21</a></li>
                     <li><a href=" https://github.com/awesomized/crc-fast-rust ">crc-fast 1.3.0</a></li>
                     <li><a href=" https://github.com/qdrant/rust-client ">qdrant-client 1.15.0</a></li>
                 </ul>
@@ -10677,6 +10677,38 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.</pre>
+            </li>
+            <li class="license">
+                <h3 id="MIT">MIT License</h3>
+                <h4>Used by:</h4>
+                <ul class="license-used-by">
+                    <li><a href=" https://github.com/tree-sitter/tree-sitter-scala ">tree-sitter-scala 0.24.0</a></li>
+                </ul>
+                <pre class="license-text">(The MIT License)
+Copyright (c) 2014 Nathan Rajlich &lt;nathan@tootallnate.net&gt;
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the &quot;Software&quot;), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED &quot;AS IS&quot;, WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+</pre>
             </li>
             <li class="license">
                 <h3 id="MIT">MIT License</h3>
@@ -12300,32 +12332,6 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
-</pre>
-            </li>
-            <li class="license">
-                <h3 id="MIT">MIT License</h3>
-                <h4>Used by:</h4>
-                <ul class="license-used-by">
-                    <li><a href=" https://github.com/tree-sitter/tree-sitter-scala ">tree-sitter-scala 0.24.0</a></li>
-                </ul>
-                <pre class="license-text">This software is released under the MIT license:
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the &quot;Software&quot;), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
-the Software, and to permit persons to whom the Software is furnished to do so,
-subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED &quot;AS IS&quot;, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 </pre>
             </li>
             <li class="license">

cocoindex/functions.py DELETED Viewed

@@ -1,375 +0,0 @@
-"""All builtin functions."""
-import dataclasses
-import functools
-from typing import Any, Literal
-import numpy as np
-from numpy.typing import NDArray
-from . import llm, op
-from .typing import Vector
-class ParseJson(op.FunctionSpec):
-    """Parse a text into a JSON object."""
-@dataclasses.dataclass
-class CustomLanguageSpec:
-    """Custom language specification."""
-    language_name: str
-    separators_regex: list[str]
-    aliases: list[str] = dataclasses.field(default_factory=list)
-@dataclasses.dataclass
-class ColPaliModelInfo:
-    """Data structure for ColPali model and processor."""
-    model: Any
-    processor: Any
-    dimension: int
-    device: Any
-class SplitRecursively(op.FunctionSpec):
-    """Split a document (in string) recursively."""
-    custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
-class SplitBySeparators(op.FunctionSpec):
-    """
-    Split text by specified regex separators only.
-    Output schema matches SplitRecursively for drop-in compatibility:
-        KTable rows with fields: location (Range), text (Str), start, end.
-    Args:
-        separators_regex: list[str]  # e.g., [r"\\n\\n+"]
-        keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
-        include_empty: bool = False
-        trim: bool = True
-    """
-    separators_regex: list[str] = dataclasses.field(default_factory=list)
-    keep_separator: Literal["NONE", "LEFT", "RIGHT"] = "NONE"
-    include_empty: bool = False
-    trim: bool = True
-class EmbedText(op.FunctionSpec):
-    """Embed a text into a vector space."""
-    api_type: llm.LlmApiType
-    model: str
-    address: str | None = None
-    output_dimension: int | None = None
-    task_type: str | None = None
-    api_config: llm.VertexAiConfig | None = None
-class ExtractByLlm(op.FunctionSpec):
-    """Extract information from a text using a LLM."""
-    llm_spec: llm.LlmSpec
-    output_type: type
-    instruction: str | None = None
-class SentenceTransformerEmbed(op.FunctionSpec):
-    """
-    `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
-    Args:
-        model: The name of the SentenceTransformer model to use.
-        args: Additional arguments to pass to the SentenceTransformer constructor. e.g. {"trust_remote_code": True}
-    Note:
-        This function requires the optional sentence-transformers dependency.
-        Install it with: pip install 'cocoindex[embeddings]'
-    """
-    model: str
-    args: dict[str, Any] | None = None
-@op.executor_class(
-    gpu=True,
-    cache=True,
-    behavior_version=1,
-    arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
-)
-class SentenceTransformerEmbedExecutor:
-    """Executor for SentenceTransformerEmbed."""
-    spec: SentenceTransformerEmbed
-    _model: Any | None = None
-    def analyze(self) -> type:
-        try:
-            # Only import sentence_transformers locally when it's needed, as its import is very slow.
-            import sentence_transformers  # pylint: disable=import-outside-toplevel
-        except ImportError as e:
-            raise ImportError(
-                "sentence_transformers is required for SentenceTransformerEmbed function. "
-                "Install it with one of these commands:\n"
-                "  pip install 'cocoindex[embeddings]'\n"
-                "  pip install sentence-transformers"
-            ) from e
-        args = self.spec.args or {}
-        self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
-        dim = self._model.get_sentence_embedding_dimension()
-        return Vector[np.float32, Literal[dim]]  # type: ignore
-    def __call__(self, text: str) -> NDArray[np.float32]:
-        assert self._model is not None
-        result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
-        return result
-@functools.cache
-def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
-    """Get or load ColPali model and processor, with caching."""
-    try:
-        from colpali_engine.models import (  # type: ignore[import-untyped]
-            ColPali,
-            ColPaliProcessor,
-            ColQwen2,
-            ColQwen2Processor,
-            ColQwen2_5,
-            ColQwen2_5_Processor,
-            ColIdefics3,
-            ColIdefics3Processor,
-        )
-        from colpali_engine.utils.torch_utils import get_torch_device  # type: ignore[import-untyped]
-        import torch
-    except ImportError as e:
-        raise ImportError(
-            "ColVision models are not available. Make sure cocoindex is installed with ColPali support."
-        ) from e
-    device = get_torch_device("auto")
-    # Manual model detection based on model name
-    model_name_lower = model_name.lower()
-    try:
-        if "qwen2.5" in model_name_lower:
-            model = ColQwen2_5.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColQwen2_5_Processor.from_pretrained(model_name)
-        elif "qwen2" in model_name_lower:
-            model = ColQwen2.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColQwen2Processor.from_pretrained(model_name)
-        elif "colsmol" in model_name_lower or "smol" in model_name_lower:
-            # ColSmol models use Idefics3 architecture
-            model = ColIdefics3.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColIdefics3Processor.from_pretrained(model_name)
-        else:
-            # Default to ColPali
-            model = ColPali.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                device_map=device,
-            ).eval()
-            processor = ColPaliProcessor.from_pretrained(model_name)
-    except Exception as e:
-        raise RuntimeError(f"Failed to load model {model_name}: {e}")
-    # Get dimension from the actual model
-    dimension = _detect_colpali_dimension(model, processor, device)
-    return ColPaliModelInfo(
-        model=model,
-        processor=processor,
-        dimension=dimension,
-        device=device,
-    )
-def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
-    """Detect ColPali embedding dimension from the actual model config."""
-    # Try to access embedding dimension
-    if hasattr(model.config, "embedding_dim"):
-        dim = model.config.embedding_dim
-    else:
-        # Fallback: infer from output shape with dummy data
-        from PIL import Image
-        import numpy as np
-        import torch
-        dummy_img = Image.fromarray(np.zeros((224, 224, 3), np.uint8))
-        # Use the processor to process the dummy image
-        processed = processor.process_images([dummy_img]).to(device)
-        with torch.no_grad():
-            output = model(**processed)
-        dim = int(output.shape[-1])
-    if isinstance(dim, int):
-        return dim
-    else:
-        raise ValueError(f"Expected integer dimension, got {type(dim)}: {dim}")
-    return dim
-class ColPaliEmbedImage(op.FunctionSpec):
-    """
-    `ColPaliEmbedImage` embeds images using ColVision multimodal models.
-    Supports ALL models available in the colpali-engine library, including:
-    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
-    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
-    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
-    - Any future ColVision models supported by colpali-engine
-    These models use late interaction between image patch embeddings and text token
-    embeddings for retrieval.
-    Args:
-        model: Any ColVision model name supported by colpali-engine
-               (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
-               See https://github.com/illuin-tech/colpali for the complete list of supported models.
-    Note:
-        This function requires the optional colpali-engine dependency.
-        Install it with: pip install 'cocoindex[colpali]'
-    """
-    model: str
-@op.executor_class(
-    gpu=True,
-    cache=True,
-    behavior_version=1,
-)
-class ColPaliEmbedImageExecutor:
-    """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
-    spec: ColPaliEmbedImage
-    _model_info: ColPaliModelInfo
-    def analyze(self) -> type:
-        # Get shared model and dimension
-        self._model_info = _get_colpali_model_and_processor(self.spec.model)
-        # Return multi-vector type: Variable patches x Fixed hidden dimension
-        dimension = self._model_info.dimension
-        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
-    def __call__(self, img_bytes: bytes) -> Any:
-        try:
-            from PIL import Image
-            import torch
-            import io
-        except ImportError as e:
-            raise ImportError(
-                "Required dependencies (PIL, torch) are missing for ColVision image embedding."
-            ) from e
-        model = self._model_info.model
-        processor = self._model_info.processor
-        device = self._model_info.device
-        pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        inputs = processor.process_images([pil_image]).to(device)
-        with torch.no_grad():
-            embeddings = model(**inputs)
-        # Return multi-vector format: [patches, hidden_dim]
-        if len(embeddings.shape) != 3:
-            raise ValueError(
-                f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}"
-            )
-        # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
-        patch_embeddings = embeddings[0]  # Remove batch dimension
-        return patch_embeddings.cpu().to(torch.float32).numpy()
-class ColPaliEmbedQuery(op.FunctionSpec):
-    """
-    `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
-    Supports ALL models available in the colpali-engine library, including:
-    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
-    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
-    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
-    - Any future ColVision models supported by colpali-engine
-    This produces query embeddings compatible with ColVision image embeddings
-    for late interaction scoring (MaxSim).
-    Args:
-        model: Any ColVision model name supported by colpali-engine
-               (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0")
-               See https://github.com/illuin-tech/colpali for the complete list of supported models.
-    Note:
-        This function requires the optional colpali-engine dependency.
-        Install it with: pip install 'cocoindex[colpali]'
-    """
-    model: str
-@op.executor_class(
-    gpu=True,
-    cache=True,
-    behavior_version=1,
-)
-class ColPaliEmbedQueryExecutor:
-    """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
-    spec: ColPaliEmbedQuery
-    _model_info: ColPaliModelInfo
-    def analyze(self) -> type:
-        # Get shared model and dimension
-        self._model_info = _get_colpali_model_and_processor(self.spec.model)
-        # Return multi-vector type: Variable tokens x Fixed hidden dimension
-        dimension = self._model_info.dimension
-        return Vector[Vector[np.float32, Literal[dimension]]]  # type: ignore
-    def __call__(self, query: str) -> Any:
-        try:
-            import torch
-        except ImportError as e:
-            raise ImportError(
-                "Required dependencies (torch) are missing for ColVision query embedding."
-            ) from e
-        model = self._model_info.model
-        processor = self._model_info.processor
-        device = self._model_info.device
-        inputs = processor.process_queries([query]).to(device)
-        with torch.no_grad():
-            embeddings = model(**inputs)
-        # Return multi-vector format: [tokens, hidden_dim]
-        if len(embeddings.shape) != 3:
-            raise ValueError(
-                f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}"
-            )
-        # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
-        token_embeddings = embeddings[0]  # Remove batch dimension
-        return token_embeddings.cpu().to(torch.float32).numpy()

{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/WHEEL RENAMED Viewed

File without changes

{cocoindex-0.2.19.dist-info → cocoindex-0.2.21.dist-info}/entry_points.txt RENAMED Viewed

File without changes