cocoindex-0.3.4-cp311-abi3-manylinux_2_28_x86_64.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- cocoindex/__init__.py +114 -0
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +44 -0
- cocoindex/cli.py +830 -0
- cocoindex/engine_object.py +214 -0
- cocoindex/engine_value.py +550 -0
- cocoindex/flow.py +1281 -0
- cocoindex/functions/__init__.py +40 -0
- cocoindex/functions/_engine_builtin_specs.py +66 -0
- cocoindex/functions/colpali.py +247 -0
- cocoindex/functions/sbert.py +77 -0
- cocoindex/index.py +50 -0
- cocoindex/lib.py +75 -0
- cocoindex/llm.py +47 -0
- cocoindex/op.py +1047 -0
- cocoindex/py.typed +0 -0
- cocoindex/query_handler.py +57 -0
- cocoindex/runtime.py +78 -0
- cocoindex/setting.py +171 -0
- cocoindex/setup.py +92 -0
- cocoindex/sources/__init__.py +5 -0
- cocoindex/sources/_engine_builtin_specs.py +120 -0
- cocoindex/subprocess_exec.py +277 -0
- cocoindex/targets/__init__.py +5 -0
- cocoindex/targets/_engine_builtin_specs.py +153 -0
- cocoindex/targets/lancedb.py +466 -0
- cocoindex/tests/__init__.py +0 -0
- cocoindex/tests/test_engine_object.py +331 -0
- cocoindex/tests/test_engine_value.py +1724 -0
- cocoindex/tests/test_optional_database.py +249 -0
- cocoindex/tests/test_transform_flow.py +300 -0
- cocoindex/tests/test_typing.py +553 -0
- cocoindex/tests/test_validation.py +134 -0
- cocoindex/typing.py +834 -0
- cocoindex/user_app_loader.py +53 -0
- cocoindex/utils.py +20 -0
- cocoindex/validation.py +104 -0
- cocoindex-0.3.4.dist-info/METADATA +288 -0
- cocoindex-0.3.4.dist-info/RECORD +42 -0
- cocoindex-0.3.4.dist-info/WHEEL +4 -0
- cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
- cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
@@ -0,0 +1,277 @@
+"""
+Lightweight subprocess-backed executor stub.
+
+- Uses a single global ProcessPoolExecutor (max_workers=1), created lazily.
+- In the subprocess, maintains a registry of executor instances keyed by
+  (executor_factory, pickled spec) to enable reuse.
+- Caches analyze() and prepare() results per key to avoid repeated calls
+  even if key collision happens.
+"""
+
+from __future__ import annotations
+
+from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures.process import BrokenProcessPool
+from dataclasses import dataclass, field
+from typing import Any, Callable
+import pickle
+import threading
+import asyncio
+import os
+import time
+from .user_app_loader import load_user_app
+from .runtime import execution_context
+import logging
+import multiprocessing as mp
+
+WATCHDOG_INTERVAL_SECONDS = 10.0
+
+# ---------------------------------------------
+# Main process: single, lazily-created pool
+# ---------------------------------------------
+_pool_lock = threading.Lock()
+_pool: ProcessPoolExecutor | None = None
+_user_apps: list[str] = []
+_logger = logging.getLogger(__name__)
+
+
+def _get_pool() -> ProcessPoolExecutor:
+    global _pool  # pylint: disable=global-statement
+    with _pool_lock:
+        if _pool is None:
+            # Single worker process as requested
+            _pool = ProcessPoolExecutor(
+                max_workers=1,
+                initializer=_subprocess_init,
+                initargs=(_user_apps, os.getpid()),
+                mp_context=mp.get_context("spawn"),
+            )
+        return _pool
+
+
+def add_user_app(app_target: str) -> None:
+    with _pool_lock:
+        _user_apps.append(app_target)
+
+
+def _restart_pool(old_pool: ProcessPoolExecutor | None = None) -> None:
+    """Safely restart the global ProcessPoolExecutor.
+
+    Thread-safe via `_pool_lock`. Shuts down the old pool and re-creates a new
+    one with the same initializer/args.
+    """
+    global _pool
+    with _pool_lock:
+        # If another thread already swapped the pool, skip restart
+        if old_pool is not None and _pool is not old_pool:
+            return
+        _logger.error("Detected dead subprocess pool; restarting and retrying.")
+        prev_pool = _pool
+        _pool = ProcessPoolExecutor(
+            max_workers=1,
+            initializer=_subprocess_init,
+            initargs=(_user_apps, os.getpid()),
+            mp_context=mp.get_context("spawn"),
+        )
+        if prev_pool is not None:
+            # Best-effort shutdown of previous pool; letting exceptions bubble up
+            # is acceptable here and signals irrecoverable executor state.
+            prev_pool.shutdown(cancel_futures=True)
+
+
+async def _submit_with_restart(fn: Callable[..., Any], *args: Any) -> Any:
+    """Submit and await work, restarting the subprocess until it succeeds.
+
+    Retries on BrokenProcessPool or pool-shutdown RuntimeError; re-raises other
+    exceptions.
+    """
+    while True:
+        pool = _get_pool()
+        try:
+            fut = pool.submit(fn, *args)
+            return await asyncio.wrap_future(fut)
+        except BrokenProcessPool:
+            _restart_pool(old_pool=pool)
+            # loop and retry
+
+
+# ---------------------------------------------
+# Subprocess: executor registry and helpers
+# ---------------------------------------------
+
+
+def _start_parent_watchdog(
+    parent_pid: int, interval_seconds: float = WATCHDOG_INTERVAL_SECONDS
+) -> None:
+    """Terminate this process if the parent process exits or PPID changes.
+
+    This runs in a background daemon thread so it never blocks pool work.
+    """
+
+    import psutil  # type: ignore
+
+    if parent_pid is None:
+        parent_pid = os.getppid()
+
+    try:
+        p = psutil.Process(parent_pid)
+        # Cache create_time to defeat PID reuse.
+        created = p.create_time()
+    except psutil.Error:
+        # Parent already gone or not accessible
+        os._exit(1)
+
+    def _watch() -> None:
+        while True:
+            try:
+                # is_running() + same create_time => same process and still alive
+                if not (p.is_running() and p.create_time() == created):
+                    os._exit(1)
+            except psutil.NoSuchProcess:
+                os._exit(1)
+            time.sleep(interval_seconds)
+
+    threading.Thread(target=_watch, name="parent-watchdog", daemon=True).start()
+
+
+def _subprocess_init(user_apps: list[str], parent_pid: int) -> None:
+    import signal
+    import faulthandler
+
+    faulthandler.enable()
+    # Ignore SIGINT in the subprocess on best-effort basis.
+    try:
+        signal.signal(signal.SIGINT, signal.SIG_IGN)
+    except Exception:
+        pass
+
+    _start_parent_watchdog(parent_pid)
+
+    # In case any user app is already loaded in this subprocess (e.g. the
+    # subprocess was forked), avoid loading it again.
+    with _pool_lock:
+        already_loaded_apps = set(_user_apps)
+
+    loaded_apps = []
+    for app_target in user_apps:
+        if app_target not in already_loaded_apps:
+            load_user_app(app_target)
+            loaded_apps.append(app_target)
+
+    with _pool_lock:
+        _user_apps.extend(loaded_apps)
+
+
+class _OnceResult:
+    _result: Any = None
+    _done: bool = False
+
+    def run_once(self, method: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+        if self._done:
+            return self._result
+        self._result = _call_method(method, *args, **kwargs)
+        self._done = True
+        return self._result
+
+
+@dataclass
+class _ExecutorEntry:
+    executor: Any
+    prepare: _OnceResult = field(default_factory=_OnceResult)
+    analyze: _OnceResult = field(default_factory=_OnceResult)
+    ready_to_call: bool = False
+
+
+_SUBPROC_EXECUTORS: dict[bytes, _ExecutorEntry] = {}
+
+
+def _call_method(method: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+    """Run an awaitable/coroutine to completion synchronously, otherwise return as-is."""
+    try:
+        if asyncio.iscoroutinefunction(method):
+            return asyncio.run(method(*args, **kwargs))
+        else:
+            return method(*args, **kwargs)
+    except Exception as e:
+        raise RuntimeError(
+            f"Error calling method `{method.__name__}` from subprocess"
+        ) from e
+
+
+def _get_or_create_entry(key_bytes: bytes) -> _ExecutorEntry:
+    entry = _SUBPROC_EXECUTORS.get(key_bytes)
+    if entry is None:
+        executor_factory, spec = pickle.loads(key_bytes)
+        inst = executor_factory()
+        inst.spec = spec
+        entry = _ExecutorEntry(executor=inst)
+        _SUBPROC_EXECUTORS[key_bytes] = entry
+    return entry
+
+
+def _sp_analyze(key_bytes: bytes) -> Any:
+    entry = _get_or_create_entry(key_bytes)
+    return entry.analyze.run_once(entry.executor.analyze)
+
+
+def _sp_prepare(key_bytes: bytes) -> Any:
+    entry = _get_or_create_entry(key_bytes)
+    return entry.prepare.run_once(entry.executor.prepare)
+
+
+def _sp_call(key_bytes: bytes, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+    entry = _get_or_create_entry(key_bytes)
+    # There's a chance that the subprocess crashes and restarts in the middle.
+    # So we want to always make sure the executor is ready before each call.
+    if not entry.ready_to_call:
+        if analyze_fn := getattr(entry.executor, "analyze", None):
+            entry.analyze.run_once(analyze_fn)
+        if prepare_fn := getattr(entry.executor, "prepare", None):
+            entry.prepare.run_once(prepare_fn)
+        entry.ready_to_call = True
+    return _call_method(entry.executor.__call__, *args, **kwargs)
+
+
+# ---------------------------------------------
+# Public stub
+# ---------------------------------------------
+
+
+class _ExecutorStub:
+    _key_bytes: bytes
+
+    def __init__(self, executor_factory: type[Any], spec: Any) -> None:
+        self._key_bytes = pickle.dumps(
+            (executor_factory, spec), protocol=pickle.HIGHEST_PROTOCOL
+        )
+
+        # Conditionally expose analyze if underlying class has it
+        if hasattr(executor_factory, "analyze"):
+            # Bind as attribute so getattr(..., "analyze", None) works upstream
+            def analyze() -> Any:
+                return execution_context.run(
+                    _submit_with_restart(_sp_analyze, self._key_bytes)
+                )

+            # Attach method
+            setattr(self, "analyze", analyze)
+
+        if hasattr(executor_factory, "prepare"):
+
+            async def prepare() -> Any:
+                return await _submit_with_restart(_sp_prepare, self._key_bytes)
+
+            setattr(self, "prepare", prepare)
+
+    async def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        return await _submit_with_restart(_sp_call, self._key_bytes, args, kwargs)
+
+
+def executor_stub(executor_factory: type[Any], spec: Any) -> Any:
+    """
+    Create a subprocess-backed stub for the given executor class/spec.
+
+    - Lazily initializes a singleton ProcessPoolExecutor (max_workers=1).
+    - Returns a stub object exposing async __call__ and async prepare; analyze is
+      exposed if present on the original class.
+    """
+    return _ExecutorStub(executor_factory, spec)
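Taken together: executor_stub pickles the (executor_factory, spec) pair into a registry key, and each analyze/prepare/__call__ round-trips through the single spawn-based worker, which is restarted transparently if it dies. A minimal usage sketch follows; EmbedSpec and EmbedExecutor are hypothetical stand-ins, not part of the package:

    import asyncio
    from dataclasses import dataclass

    from cocoindex.subprocess_exec import executor_stub


    @dataclass
    class EmbedSpec:  # hypothetical spec; must be picklable
        model: str


    class EmbedExecutor:  # hypothetical executor; instantiated inside the worker
        spec: EmbedSpec

        def prepare(self) -> None:
            # e.g. load the model named by self.spec.model
            pass

        def __call__(self, text: str) -> list[float]:
            return [float(len(text))]  # placeholder "embedding"


    async def main() -> None:
        stub = executor_stub(EmbedExecutor, EmbedSpec(model="MiniLM"))
        await stub.prepare()  # proxied to _sp_prepare; result cached per key
        print(await stub("hello"))  # proxied to _sp_call; re-prepares after a worker crash


    if __name__ == "__main__":  # guard needed: the pool uses the "spawn" start method
        asyncio.run(main())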
@@ -0,0 +1,153 @@
+"""All builtin targets."""
+
+from dataclasses import dataclass
+from typing import Sequence, Literal
+
+from .. import op
+from .. import index
+from ..auth_registry import AuthEntryReference
+from ..setting import DatabaseConnectionSpec
+
+
+@dataclass
+class PostgresColumnOptions:
+    """Options for a Postgres column."""
+
+    # Specify the specific type of the column in Postgres. Can use it to override the default type derived from CocoIndex schema.
+    type: Literal["vector", "halfvec"] | None = None
+
+
+class Postgres(op.TargetSpec):
+    """Target powered by Postgres and pgvector."""
+
+    database: AuthEntryReference[DatabaseConnectionSpec] | None = None
+    table_name: str | None = None
+    schema: str | None = None
+    column_options: dict[str, PostgresColumnOptions] | None = None
+
+
+class PostgresSqlCommand(op.TargetAttachmentSpec):
+    """Attachment to execute specified SQL statements for Postgres targets."""
+
+    name: str
+    setup_sql: str
+    teardown_sql: str | None = None
+
+
+@dataclass
+class QdrantConnection:
+    """Connection spec for Qdrant."""
+
+    grpc_url: str
+    api_key: str | None = None
+
+
+@dataclass
+class Qdrant(op.TargetSpec):
+    """Target powered by Qdrant - https://qdrant.tech/."""
+
+    collection_name: str
+    connection: AuthEntryReference[QdrantConnection] | None = None
+
+
+@dataclass
+class TargetFieldMapping:
+    """Mapping for a graph element (node or relationship) field."""
+
+    source: str
+    # Field name for the node in the Knowledge Graph.
+    # If unspecified, it's the same as `field_name`.
+    target: str | None = None
+
+
+@dataclass
+class NodeFromFields:
+    """Spec for a referenced graph node, usually as part of a relationship."""
+
+    label: str
+    fields: list[TargetFieldMapping]
+
+
+@dataclass
+class ReferencedNode:
+    """Target spec for a graph node."""
+
+    label: str
+    primary_key_fields: Sequence[str]
+    vector_indexes: Sequence[index.VectorIndexDef] = ()
+
+
+@dataclass
+class Nodes:
+    """Spec to map a row to a graph node."""
+
+    kind = "Node"
+
+    label: str
+
+
+@dataclass
+class Relationships:
+    """Spec to map a row to a graph relationship."""
+
+    kind = "Relationship"
+
+    rel_type: str
+    source: NodeFromFields
+    target: NodeFromFields
+
+
+# For backwards compatibility only
+NodeMapping = Nodes
+RelationshipMapping = Relationships
+NodeReferenceMapping = NodeFromFields
+
+
+@dataclass
+class Neo4jConnection:
+    """Connection spec for Neo4j."""
+
+    uri: str
+    user: str
+    password: str
+    db: str | None = None
+
+
+class Neo4j(op.TargetSpec):
+    """Graph storage powered by Neo4j."""
+
+    connection: AuthEntryReference[Neo4jConnection]
+    mapping: Nodes | Relationships
+
+
+class Neo4jDeclaration(op.DeclarationSpec):
+    """Declarations for Neo4j."""
+
+    kind = "Neo4j"
+    connection: AuthEntryReference[Neo4jConnection]
+    nodes_label: str
+    primary_key_fields: Sequence[str]
+    vector_indexes: Sequence[index.VectorIndexDef] = ()
+
+
+@dataclass
+class KuzuConnection:
+    """Connection spec for Kuzu."""
+
+    api_server_url: str
+
+
+class Kuzu(op.TargetSpec):
+    """Graph storage powered by Kuzu."""
+
+    connection: AuthEntryReference[KuzuConnection]
+    mapping: Nodes | Relationships
+
+
+class KuzuDeclaration(op.DeclarationSpec):
+    """Declarations for Kuzu."""
+
+    kind = "Kuzu"
+    connection: AuthEntryReference[KuzuConnection]
+    nodes_label: str
+    primary_key_fields: Sequence[str]