cocoindex-0.3.4-cp311-abi3-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cocoindex/__init__.py +114 -0
  2. cocoindex/_engine.abi3.so +0 -0
  3. cocoindex/auth_registry.py +44 -0
  4. cocoindex/cli.py +830 -0
  5. cocoindex/engine_object.py +214 -0
  6. cocoindex/engine_value.py +550 -0
  7. cocoindex/flow.py +1281 -0
  8. cocoindex/functions/__init__.py +40 -0
  9. cocoindex/functions/_engine_builtin_specs.py +66 -0
  10. cocoindex/functions/colpali.py +247 -0
  11. cocoindex/functions/sbert.py +77 -0
  12. cocoindex/index.py +50 -0
  13. cocoindex/lib.py +75 -0
  14. cocoindex/llm.py +47 -0
  15. cocoindex/op.py +1047 -0
  16. cocoindex/py.typed +0 -0
  17. cocoindex/query_handler.py +57 -0
  18. cocoindex/runtime.py +78 -0
  19. cocoindex/setting.py +171 -0
  20. cocoindex/setup.py +92 -0
  21. cocoindex/sources/__init__.py +5 -0
  22. cocoindex/sources/_engine_builtin_specs.py +120 -0
  23. cocoindex/subprocess_exec.py +277 -0
  24. cocoindex/targets/__init__.py +5 -0
  25. cocoindex/targets/_engine_builtin_specs.py +153 -0
  26. cocoindex/targets/lancedb.py +466 -0
  27. cocoindex/tests/__init__.py +0 -0
  28. cocoindex/tests/test_engine_object.py +331 -0
  29. cocoindex/tests/test_engine_value.py +1724 -0
  30. cocoindex/tests/test_optional_database.py +249 -0
  31. cocoindex/tests/test_transform_flow.py +300 -0
  32. cocoindex/tests/test_typing.py +553 -0
  33. cocoindex/tests/test_validation.py +134 -0
  34. cocoindex/typing.py +834 -0
  35. cocoindex/user_app_loader.py +53 -0
  36. cocoindex/utils.py +20 -0
  37. cocoindex/validation.py +104 -0
  38. cocoindex-0.3.4.dist-info/METADATA +288 -0
  39. cocoindex-0.3.4.dist-info/RECORD +42 -0
  40. cocoindex-0.3.4.dist-info/WHEEL +4 -0
  41. cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
  42. cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
cocoindex/subprocess_exec.py
@@ -0,0 +1,277 @@
+ """
+ Lightweight subprocess-backed executor stub.
+
+ - Uses a single global ProcessPoolExecutor (max_workers=1), created lazily.
+ - In the subprocess, maintains a registry of executor instances keyed by
+   (executor_factory, pickled spec) to enable reuse.
+ - Caches analyze() and prepare() results per key to avoid repeated calls,
+   even when the same key is hit by multiple operations.
+ """
+
+ from __future__ import annotations
+
+ from concurrent.futures import ProcessPoolExecutor
+ from concurrent.futures.process import BrokenProcessPool
+ from dataclasses import dataclass, field
+ from typing import Any, Callable
+ import pickle
+ import threading
+ import asyncio
+ import os
+ import time
+ import logging
+ import multiprocessing as mp
+ from .user_app_loader import load_user_app
+ from .runtime import execution_context
+
+ WATCHDOG_INTERVAL_SECONDS = 10.0
+
+ # ---------------------------------------------
+ # Main process: single, lazily-created pool
+ # ---------------------------------------------
+ _pool_lock = threading.Lock()
+ _pool: ProcessPoolExecutor | None = None
+ _user_apps: list[str] = []
+ _logger = logging.getLogger(__name__)
+
+
+ def _get_pool() -> ProcessPoolExecutor:
+     global _pool  # pylint: disable=global-statement
+     with _pool_lock:
+         if _pool is None:
+             # Single worker process, created lazily on first use.
+             _pool = ProcessPoolExecutor(
+                 max_workers=1,
+                 initializer=_subprocess_init,
+                 initargs=(_user_apps, os.getpid()),
+                 mp_context=mp.get_context("spawn"),
+             )
+         return _pool
+
+
+ def add_user_app(app_target: str) -> None:
+     with _pool_lock:
+         _user_apps.append(app_target)
+
+
+ def _restart_pool(old_pool: ProcessPoolExecutor | None = None) -> None:
+     """Safely restart the global ProcessPoolExecutor.
+
+     Thread-safe via `_pool_lock`. Shuts down the old pool and re-creates a
+     new one with the same initializer/args.
+     """
+     global _pool
+     with _pool_lock:
+         # If another thread already swapped the pool, skip the restart.
+         if old_pool is not None and _pool is not old_pool:
+             return
+         _logger.error("Detected dead subprocess pool; restarting and retrying.")
+         prev_pool = _pool
+         _pool = ProcessPoolExecutor(
+             max_workers=1,
+             initializer=_subprocess_init,
+             initargs=(_user_apps, os.getpid()),
+             mp_context=mp.get_context("spawn"),
+         )
+         if prev_pool is not None:
+             # Best-effort shutdown of the previous pool; letting exceptions
+             # bubble up is acceptable and signals irrecoverable executor state.
+             prev_pool.shutdown(cancel_futures=True)
+
+
+ async def _submit_with_restart(fn: Callable[..., Any], *args: Any) -> Any:
+     """Submit and await work, restarting the subprocess until it succeeds.
+
+     Retries on BrokenProcessPool and leaves all other exceptions to
+     propagate to the caller.
+     """
+     while True:
+         pool = _get_pool()
+         try:
+             fut = pool.submit(fn, *args)
+             return await asyncio.wrap_future(fut)
+         except BrokenProcessPool:
+             _restart_pool(old_pool=pool)
+             # Loop and retry on the fresh pool.
+
+
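The bridge between the synchronous pool and the event loop above is `asyncio.wrap_future`, which turns the `concurrent.futures.Future` returned by `pool.submit` into an awaitable. A minimal, self-contained sketch of that pattern (toy function, independent of this module):

import asyncio
from concurrent.futures import ProcessPoolExecutor

def square(x: int) -> int:
    # Must be module-level so it can be pickled into the worker process.
    return x * x

async def main() -> None:
    with ProcessPoolExecutor(max_workers=1) as pool:
        fut = pool.submit(square, 7)           # concurrent.futures.Future
        print(await asyncio.wrap_future(fut))  # 49, awaited without blocking

if __name__ == "__main__":
    asyncio.run(main())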
+ # ---------------------------------------------
+ # Subprocess: executor registry and helpers
+ # ---------------------------------------------
+
+
+ def _start_parent_watchdog(
+     parent_pid: int | None, interval_seconds: float = WATCHDOG_INTERVAL_SECONDS
+ ) -> None:
+     """Terminate this process if the parent process exits or its PID is reused.
+
+     This runs in a background daemon thread so it never blocks pool work.
+     """
+
+     import psutil  # type: ignore
+
+     if parent_pid is None:
+         parent_pid = os.getppid()
+
+     try:
+         p = psutil.Process(parent_pid)
+         # Cache create_time() to defeat PID reuse.
+         created = p.create_time()
+     except psutil.Error:
+         # Parent already gone or not accessible.
+         os._exit(1)
+
+     def _watch() -> None:
+         while True:
+             try:
+                 # is_running() + same create_time() => same process, still alive.
+                 if not (p.is_running() and p.create_time() == created):
+                     os._exit(1)
+             except psutil.NoSuchProcess:
+                 os._exit(1)
+             time.sleep(interval_seconds)
+
+     threading.Thread(target=_watch, name="parent-watchdog", daemon=True).start()
+
+
+ def _subprocess_init(user_apps: list[str], parent_pid: int) -> None:
+     import signal
+     import faulthandler
+
+     faulthandler.enable()
+     # Ignore SIGINT in the subprocess, on a best-effort basis.
+     try:
+         signal.signal(signal.SIGINT, signal.SIG_IGN)
+     except Exception:
+         pass
+
+     _start_parent_watchdog(parent_pid)
+
+     # A user app may already be loaded in this subprocess (e.g. if it was forked), so skip those.
+     with _pool_lock:
+         already_loaded_apps = set(_user_apps)
+
+     loaded_apps = []
+     for app_target in user_apps:
+         if app_target not in already_loaded_apps:
+             load_user_app(app_target)
+             loaded_apps.append(app_target)
+
+     with _pool_lock:
+         _user_apps.extend(loaded_apps)
+
+
+ class _OnceResult:
+     _result: Any = None
+     _done: bool = False
+
+     def run_once(self, method: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+         if self._done:
+             return self._result
+         self._result = _call_method(method, *args, **kwargs)
+         self._done = True
+         return self._result
+
+
+ @dataclass
+ class _ExecutorEntry:
+     executor: Any
+     prepare: _OnceResult = field(default_factory=_OnceResult)
+     analyze: _OnceResult = field(default_factory=_OnceResult)
+     ready_to_call: bool = False
+
+
+ _SUBPROC_EXECUTORS: dict[bytes, _ExecutorEntry] = {}
+
+
+ def _call_method(method: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+     """Run a coroutine method to completion synchronously; call plain methods directly."""
+     try:
+         if asyncio.iscoroutinefunction(method):
+             return asyncio.run(method(*args, **kwargs))
+         else:
+             return method(*args, **kwargs)
+     except Exception as e:
+         raise RuntimeError(
+             f"Error calling method `{method.__name__}` from subprocess"
+         ) from e
+
+
+ def _get_or_create_entry(key_bytes: bytes) -> _ExecutorEntry:
+     entry = _SUBPROC_EXECUTORS.get(key_bytes)
+     if entry is None:
+         executor_factory, spec = pickle.loads(key_bytes)
+         inst = executor_factory()
+         inst.spec = spec
+         entry = _ExecutorEntry(executor=inst)
+         _SUBPROC_EXECUTORS[key_bytes] = entry
+     return entry
+
+
+ def _sp_analyze(key_bytes: bytes) -> Any:
+     entry = _get_or_create_entry(key_bytes)
+     return entry.analyze.run_once(entry.executor.analyze)
+
+
+ def _sp_prepare(key_bytes: bytes) -> Any:
+     entry = _get_or_create_entry(key_bytes)
+     return entry.prepare.run_once(entry.executor.prepare)
+
+
+ def _sp_call(key_bytes: bytes, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+     entry = _get_or_create_entry(key_bytes)
+     # The subprocess may have crashed and restarted mid-flight, so always
+     # make sure the executor is ready before each call.
+     if not entry.ready_to_call:
+         if analyze_fn := getattr(entry.executor, "analyze", None):
+             entry.analyze.run_once(analyze_fn)
+         if prepare_fn := getattr(entry.executor, "prepare", None):
+             entry.prepare.run_once(prepare_fn)
+         entry.ready_to_call = True
+     return _call_method(entry.executor.__call__, *args, **kwargs)
+
+
+ # ---------------------------------------------
+ # Public stub
+ # ---------------------------------------------
+
+
+ class _ExecutorStub:
+     _key_bytes: bytes
+
+     def __init__(self, executor_factory: type[Any], spec: Any) -> None:
+         self._key_bytes = pickle.dumps(
+             (executor_factory, spec), protocol=pickle.HIGHEST_PROTOCOL
+         )
+
+         # Conditionally expose analyze only if the underlying class has it.
+         if hasattr(executor_factory, "analyze"):
+             # Bind as an instance attribute so getattr(..., "analyze", None)
+             # works upstream.
+             def analyze() -> Any:
+                 return execution_context.run(
+                     _submit_with_restart(_sp_analyze, self._key_bytes)
+                 )
+
+             setattr(self, "analyze", analyze)
+
+         if hasattr(executor_factory, "prepare"):
+
+             async def prepare() -> Any:
+                 return await _submit_with_restart(_sp_prepare, self._key_bytes)
+
+             setattr(self, "prepare", prepare)
+
+     async def __call__(self, *args: Any, **kwargs: Any) -> Any:
+         return await _submit_with_restart(_sp_call, self._key_bytes, args, kwargs)
+
+
+ def executor_stub(executor_factory: type[Any], spec: Any) -> Any:
+     """
+     Create a subprocess-backed stub for the given executor class/spec.
+
+     - Lazily initializes a singleton ProcessPoolExecutor (max_workers=1).
+     - Returns a stub object exposing async __call__ and async prepare;
+       analyze is exposed if present on the original class.
+     """
+     return _ExecutorStub(executor_factory, spec)
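To make the contract concrete, here is a hedged usage sketch. `LenSpec` and `LenExecutor` are hypothetical stand-ins, not part of this package; since the factory/spec pair is pickled into a spawned worker, both must be defined at module level so they are importable there:

import asyncio
from dataclasses import dataclass
from cocoindex.subprocess_exec import executor_stub

@dataclass
class LenSpec:
    suffix: str

class LenExecutor:
    spec: LenSpec  # assigned by the subprocess registry before use

    def prepare(self) -> None:
        # e.g. load a model or open a connection here
        pass

    def __call__(self, text: str) -> str:
        return f"{len(text)}{self.spec.suffix}"

async def main() -> None:
    stub = executor_stub(LenExecutor, LenSpec(suffix="!"))
    await stub.prepare()        # proxied to the worker subprocess
    print(await stub("hello"))  # "5!", computed in the subprocess

if __name__ == "__main__":
    asyncio.run(main())

Note that `analyze`, when present, is exposed as a synchronous method run via `execution_context`, while `prepare` and `__call__` are async.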
cocoindex/targets/__init__.py
@@ -0,0 +1,5 @@
+ """
+ Targets supported by CocoIndex.
+ """
+
+ from ._engine_builtin_specs import *
cocoindex/targets/_engine_builtin_specs.py
@@ -0,0 +1,153 @@
+ """All builtin targets."""
+
+ from dataclasses import dataclass
+ from typing import Sequence, Literal
+
+ from .. import op
+ from .. import index
+ from ..auth_registry import AuthEntryReference
+ from ..setting import DatabaseConnectionSpec
+
+
+ @dataclass
+ class PostgresColumnOptions:
+     """Options for a Postgres column."""
+
+     # The concrete Postgres type for this column; overrides the default type derived from the CocoIndex schema.
+     type: Literal["vector", "halfvec"] | None = None
+
+
+ class Postgres(op.TargetSpec):
+     """Target powered by Postgres and pgvector."""
+
+     database: AuthEntryReference[DatabaseConnectionSpec] | None = None
+     table_name: str | None = None
+     schema: str | None = None
+     column_options: dict[str, PostgresColumnOptions] | None = None
+
+
+ class PostgresSqlCommand(op.TargetAttachmentSpec):
+     """Attachment that executes specified SQL statements for Postgres targets."""
+
+     name: str
+     setup_sql: str
+     teardown_sql: str | None = None
+
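A brief sketch of how these options compose, assuming TargetSpec subclasses accept keyword construction as the dataclass-style field declarations suggest; the table and field names are made up, and wiring the spec into a flow's export step is beyond this file:

target = Postgres(
    table_name="doc_embeddings",
    column_options={
        # Assumed vector field; "halfvec" stores it at half precision in pgvector.
        "embedding": PostgresColumnOptions(type="halfvec"),
    },
)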
+
+ @dataclass
+ class QdrantConnection:
+     """Connection spec for Qdrant."""
+
+     grpc_url: str
+     api_key: str | None = None
+
+
+ @dataclass
+ class Qdrant(op.TargetSpec):
+     """Target powered by Qdrant - https://qdrant.tech/."""
+
+     collection_name: str
+     connection: AuthEntryReference[QdrantConnection] | None = None
+
+
+ @dataclass
+ class TargetFieldMapping:
+     """Mapping for a graph element (node or relationship) field."""
+
+     source: str
+     # Field name for the node in the knowledge graph.
+     # If unspecified, it defaults to the same name as `source`.
+     target: str | None = None
+
+
+ @dataclass
+ class NodeFromFields:
+     """Spec for a referenced graph node, usually as part of a relationship."""
+
+     label: str
+     fields: list[TargetFieldMapping]
+
+
+ @dataclass
+ class ReferencedNode:
+     """Target spec for a graph node."""
+
+     label: str
+     primary_key_fields: Sequence[str]
+     vector_indexes: Sequence[index.VectorIndexDef] = ()
+
+
+ @dataclass
+ class Nodes:
+     """Spec to map a row to a graph node."""
+
+     kind = "Node"
+
+     label: str
+
+
+ @dataclass
+ class Relationships:
+     """Spec to map a row to a graph relationship."""
+
+     kind = "Relationship"
+
+     rel_type: str
+     source: NodeFromFields
+     target: NodeFromFields
+
+
+ # Aliases kept for backwards compatibility only.
+ NodeMapping = Nodes
+ RelationshipMapping = Relationships
+ NodeReferenceMapping = NodeFromFields
+
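The mapping dataclasses above compose as follows. A hedged sketch with made-up labels and field names: each exported row becomes one MENTIONS relationship, and its endpoint nodes are resolved from row fields:

mapping = Relationships(
    rel_type="MENTIONS",
    source=NodeFromFields(
        label="Document",
        fields=[TargetFieldMapping(source="doc_id", target="id")],
    ),
    target=NodeFromFields(
        label="Entity",
        # `target` omitted: the graph field keeps the name "name".
        fields=[TargetFieldMapping(source="name")],
    ),
)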
+
+ @dataclass
+ class Neo4jConnection:
+     """Connection spec for Neo4j."""
+
+     uri: str
+     user: str
+     password: str
+     db: str | None = None
+
+
+ class Neo4j(op.TargetSpec):
+     """Graph storage powered by Neo4j."""
+
+     connection: AuthEntryReference[Neo4jConnection]
+     mapping: Nodes | Relationships
+
+
+ class Neo4jDeclaration(op.DeclarationSpec):
+     """Declarations for Neo4j."""
+
+     kind = "Neo4j"
+     connection: AuthEntryReference[Neo4jConnection]
+     nodes_label: str
+     primary_key_fields: Sequence[str]
+     vector_indexes: Sequence[index.VectorIndexDef] = ()
+
+
+ @dataclass
+ class KuzuConnection:
+     """Connection spec for Kuzu."""
+
+     api_server_url: str
+
+
+ class Kuzu(op.TargetSpec):
+     """Graph storage powered by Kuzu."""
+
+     connection: AuthEntryReference[KuzuConnection]
+     mapping: Nodes | Relationships
+
+
+ class KuzuDeclaration(op.DeclarationSpec):
+     """Declarations for Kuzu."""
+
+     kind = "Kuzu"
+     connection: AuthEntryReference[KuzuConnection]
+     nodes_label: str
+     primary_key_fields: Sequence[str]
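Putting the Neo4j pieces together, a sketch under stated assumptions: `add_auth_entry` is assumed to come from cocoindex/auth_registry.py (listed in this diff but not shown here), and the connection values are placeholders:

from cocoindex.auth_registry import add_auth_entry  # assumed helper

conn = add_auth_entry(
    "neo4j_conn",
    Neo4jConnection(uri="bolt://localhost:7687", user="neo4j", password="..."),
)
target = Neo4j(connection=conn, mapping=mapping)  # `mapping` from the sketch above
decl = Neo4jDeclaration(
    connection=conn,
    nodes_label="Entity",
    primary_key_fields=["name"],
)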