cocoindex 0.1.81-cp312-cp312-macosx_11_0_arm64.whl → 0.1.83-cp312-cp312-macosx_11_0_arm64.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
cocoindex/op.py CHANGED
@@ -17,14 +17,15 @@ from typing import (
 )

 from . import _engine  # type: ignore
+from .subprocess_exec import executor_stub
 from .convert import (
-    encode_engine_value,
+    make_engine_value_encoder,
     make_engine_value_decoder,
     make_engine_struct_decoder,
 )
 from .typing import (
     TypeAttr,
-    encode_enriched_type,
+    encode_enriched_type_info,
     resolve_forward_ref,
     analyze_type_info,
     AnalyzedAnyType,
@@ -85,11 +86,13 @@ class Executor(Protocol):
     op_category: OpCategory


-def _load_spec_from_engine(spec_cls: type, spec: dict[str, Any]) -> Any:
+def _load_spec_from_engine(
+    spec_loader: Callable[..., Any], spec: dict[str, Any]
+) -> Any:
     """
     Load a spec from the engine.
     """
-    return spec_cls(**spec)
+    return spec_loader(**spec)


 def _get_required_method(cls: type, name: str) -> Callable[..., Any]:
@@ -101,26 +104,23 @@ def _get_required_method(cls: type, name: str) -> Callable[..., Any]:
     return method


-class _FunctionExecutorFactory:
-    _spec_cls: type
+class _EngineFunctionExecutorFactory:
+    _spec_loader: Callable[..., Any]
     _executor_cls: type

-    def __init__(self, spec_cls: type, executor_cls: type):
-        self._spec_cls = spec_cls
+    def __init__(self, spec_loader: Callable[..., Any], executor_cls: type):
+        self._spec_loader = spec_loader
         self._executor_cls = executor_cls

     def __call__(
         self, spec: dict[str, Any], *args: Any, **kwargs: Any
     ) -> tuple[dict[str, Any], Executor]:
-        spec = _load_spec_from_engine(self._spec_cls, spec)
+        spec = _load_spec_from_engine(self._spec_loader, spec)
         executor = self._executor_cls(spec)
         result_type = executor.analyze_schema(*args, **kwargs)
         return (result_type, executor)


-_gpu_dispatch_lock = asyncio.Lock()
-
-
 _COCOINDEX_ATTR_PREFIX = "cocoindex.io/"

@@ -166,30 +166,32 @@ def _register_op_factory(
     category: OpCategory,
     expected_args: list[tuple[str, inspect.Parameter]],
     expected_return: Any,
-    executor_cls: type,
-    spec_cls: type,
+    executor_factory: Any,
+    spec_loader: Callable[..., Any],
+    op_kind: str,
     op_args: OpArgs,
-) -> type:
+) -> None:
     """
     Register an op factory.
     """

-    class _Fallback:
-        def enable_cache(self) -> bool:
-            return op_args.cache
-
-        def behavior_version(self) -> int | None:
-            return op_args.behavior_version
-
-    class _WrappedClass(executor_cls, _Fallback):  # type: ignore[misc]
+    class _WrappedExecutor:
+        _executor: Any
         _args_info: list[_ArgInfo]
         _kwargs_info: dict[str, _ArgInfo]
-        _acall: Callable[..., Awaitable[Any]]
+        _result_encoder: Callable[[Any], Any]
+        _acall: Callable[..., Awaitable[Any]] | None = None

         def __init__(self, spec: Any) -> None:
-            super().__init__()
-            self.spec = spec
-            self._acall = _to_async_call(super().__call__)
+            executor: Any
+
+            if op_args.gpu:
+                executor = executor_stub(executor_factory, spec)
+            else:
+                executor = executor_factory()
+                executor.spec = spec
+
+            self._executor = executor

         def analyze_schema(
             self, *args: _engine.OpArgSchema, **kwargs: _engine.OpArgSchema
@@ -293,17 +295,21 @@ def _register_op_factory(
             if len(missing_args) > 0:
                 raise ValueError(f"Missing arguments: {', '.join(missing_args)}")

-            base_analyze_method = getattr(self, "analyze", None)
+            base_analyze_method = getattr(self._executor, "analyze", None)
             if base_analyze_method is not None:
-                result = base_analyze_method(*args, **kwargs)
+                result_type = base_analyze_method()
             else:
-                result = expected_return
+                result_type = expected_return
             if len(attributes) > 0:
-                result = Annotated[result, *attributes]
+                result_type = Annotated[result_type, *attributes]

-            encoded_type = encode_enriched_type(result)
+            analyzed_result_type_info = analyze_type_info(result_type)
+            encoded_type = encode_enriched_type_info(analyzed_result_type_info)
             if potentially_missing_required_arg:
                 encoded_type["nullable"] = True
+
+            self._result_encoder = make_engine_value_encoder(analyzed_result_type_info)
+
             return encoded_type

         async def prepare(self) -> None:
@@ -311,9 +317,10 @@ def _register_op_factory(
             Prepare for execution.
             It's executed after `analyze` and before any `__call__` execution.
             """
-            prepare_method = getattr(super(), "prepare", None)
+            prepare_method = getattr(self._executor, "prepare", None)
             if prepare_method is not None:
                 await _to_async_call(prepare_method)()
+            self._acall = _to_async_call(self._executor.__call__)

         async def __call__(self, *args: Any, **kwargs: Any) -> Any:
             decoded_args = []
@@ -333,33 +340,23 @@ def _register_op_factory(
                     return None
                 decoded_kwargs[kwarg_name] = kwarg_info.decoder(arg)

-            if op_args.gpu:
-                # For GPU executions, data-level parallelism is applied, so we don't want to
-                # execute different tasks in parallel.
-                # Besides, multiprocessing is more appropriate for pytorch.
-                # For now, we use a lock to ensure only one task is executed at a time.
-                # TODO: Implement multi-processing dispatching.
-                async with _gpu_dispatch_lock:
-                    output = await self._acall(*decoded_args, **decoded_kwargs)
-            else:
-                output = await self._acall(*decoded_args, **decoded_kwargs)
-            return encode_engine_value(output)
+            assert self._acall is not None
+            output = await self._acall(*decoded_args, **decoded_kwargs)
+            return self._result_encoder(output)

-    _WrappedClass.__name__ = executor_cls.__name__
-    _WrappedClass.__doc__ = executor_cls.__doc__
-    _WrappedClass.__module__ = executor_cls.__module__
-    _WrappedClass.__qualname__ = executor_cls.__qualname__
-    _WrappedClass.__wrapped__ = executor_cls
+        def enable_cache(self) -> bool:
+            return op_args.cache
+
+        def behavior_version(self) -> int | None:
+            return op_args.behavior_version

     if category == OpCategory.FUNCTION:
         _engine.register_function_factory(
-            spec_cls.__name__, _FunctionExecutorFactory(spec_cls, _WrappedClass)
+            op_kind, _EngineFunctionExecutorFactory(spec_loader, _WrappedExecutor)
         )
     else:
         raise ValueError(f"Unsupported executor type {category}")

-    return _WrappedClass
-


 def executor_class(**args: Any) -> Callable[[type], type]:
     """
@@ -377,52 +374,53 @@ def executor_class(**args: Any) -> Callable[[type], type]:
             raise TypeError("Expect a `spec` field with type hint")
         spec_cls = resolve_forward_ref(type_hints["spec"])
         sig = inspect.signature(cls.__call__)
-        return _register_op_factory(
+        _register_op_factory(
            category=spec_cls._op_category,
            expected_args=list(sig.parameters.items())[1:],  # First argument is `self`
            expected_return=sig.return_annotation,
-            executor_cls=cls,
-            spec_cls=spec_cls,
+            executor_factory=cls,
+            spec_loader=spec_cls,
+            op_kind=spec_cls.__name__,
            op_args=op_args,
        )
+        return cls

     return _inner


-def function(**args: Any) -> Callable[[Callable[..., Any]], FunctionSpec]:
+class EmptyFunctionSpec(FunctionSpec):
+    pass
+
+
+class _SimpleFunctionExecutor:
+    spec: Callable[..., Any]
+
+    def prepare(self) -> None:
+        self.__call__ = staticmethod(self.spec)
+
+
+def function(**args: Any) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
     """
     Decorate a function to provide a function for an op.
     """
     op_args = OpArgs(**args)

-    def _inner(fn: Callable[..., Any]) -> FunctionSpec:
+    def _inner(fn: Callable[..., Any]) -> Callable[..., Any]:
         # Convert snake case to camel case.
-        op_name = "".join(word.capitalize() for word in fn.__name__.split("_"))
+        op_kind = "".join(word.capitalize() for word in fn.__name__.split("_"))
         sig = inspect.signature(fn)
-
-        class _Executor:
-            def __call__(self, *args: Any, **kwargs: Any) -> Any:
-                return fn(*args, **kwargs)
-
-        class _Spec(FunctionSpec):
-            def __call__(self, *args: Any, **kwargs: Any) -> Any:
-                return fn(*args, **kwargs)
-
-        _Spec.__name__ = op_name
-        _Spec.__doc__ = fn.__doc__
-        _Spec.__module__ = fn.__module__
-        _Spec.__qualname__ = fn.__qualname__
-
+        fn.__cocoindex_op_kind__ = op_kind  # type: ignore
         _register_op_factory(
            category=OpCategory.FUNCTION,
            expected_args=list(sig.parameters.items()),
            expected_return=sig.return_annotation,
-            executor_cls=_Executor,
-            spec_cls=_Spec,
+            executor_factory=_SimpleFunctionExecutor,
+            spec_loader=lambda: fn,
+            op_kind=op_kind,
            op_args=op_args,
        )

-        return _Spec()
+        return fn

     return _inner

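With these op.py changes, `@op.function(...)` now returns the decorated function itself rather than a generated `FunctionSpec` instance, the op is registered under the CamelCase form of the function name, and a per-op result encoder replaces the global `encode_engine_value` call. A minimal usage sketch (the function name and body are illustrative, not taken from the package):

import cocoindex

@cocoindex.op.function(cache=True, behavior_version=1)
def extract_extension(filename: str) -> str:
    """Hypothetical transform: return the file extension of a path."""
    return filename.rsplit(".", 1)[-1] if "." in filename else ""

# As of 0.1.83 the decorator hands back the plain function, so it stays
# directly callable outside a flow; the op kind is registered as
# "ExtractExtension" (snake_case name converted to CamelCase).
assert extract_extension("notes.md") == "md"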
cocoindex/sources.py CHANGED
@@ -2,6 +2,7 @@

 from . import op
 from .auth_registry import TransientAuthEntryReference
+from .setting import DatabaseConnectionSpec
 import datetime

@@ -67,3 +68,22 @@ class AzureBlob(op.SourceSpec):

     sas_token: TransientAuthEntryReference[str] | None = None
     account_access_key: TransientAuthEntryReference[str] | None = None
+
+
+class Postgres(op.SourceSpec):
+    """Import data from a PostgreSQL table."""
+
+    _op_category = op.OpCategory.SOURCE
+
+    # Table name to read from (required)
+    table_name: str
+
+    # Database connection reference (optional - uses default if not provided)
+    database: TransientAuthEntryReference[DatabaseConnectionSpec] | None = None
+
+    # Optional: specific columns to include (if None, includes all columns)
+    included_columns: list[str] | None = None
+
+    # Optional: column name to use for ordinal tracking (for incremental updates)
+    # Should be a timestamp, serial, or other incrementing column
+    ordinal_column: str | None = None
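The new `Postgres` source spec plugs into a flow like the existing sources. An illustrative sketch (flow, table, and column names are assumptions, not from the package; omitting `database` falls back to the default connection):

import cocoindex

@cocoindex.flow_def(name="PostgresDemo")
def postgres_demo_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    # ordinal_column should be an incrementing column (timestamp/serial)
    # so incremental updates can track how far they have read.
    data_scope["rows"] = flow_builder.add_source(
        cocoindex.sources.Postgres(
            table_name="documents",
            included_columns=["id", "title", "body"],
            ordinal_column="updated_at",
        )
    )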
cocoindex/subprocess_exec.py ADDED
@@ -0,0 +1,241 @@
+"""
+Lightweight subprocess-backed executor stub.
+
+- Uses a single global ProcessPoolExecutor (max_workers=1), created lazily.
+- In the subprocess, maintains a registry of executor instances keyed by
+  (executor_factory, pickled spec) to enable reuse.
+- Caches analyze() and prepare() results per key to avoid repeated calls
+  even if key collision happens.
+"""
+
+from __future__ import annotations
+
+from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures.process import BrokenProcessPool
+from dataclasses import dataclass, field
+from typing import Any, Callable
+import pickle
+import threading
+import asyncio
+import os
+import time
+from .user_app_loader import load_user_app
+from .runtime import execution_context
+import logging
+
+WATCHDOG_INTERVAL_SECONDS = 10.0
+
+# ---------------------------------------------
+# Main process: single, lazily-created pool
+# ---------------------------------------------
+_pool_lock = threading.Lock()
+_pool: ProcessPoolExecutor | None = None
+_user_apps: list[str] = []
+_logger = logging.getLogger(__name__)
+
+
+def _get_pool() -> ProcessPoolExecutor:
+    global _pool
+    with _pool_lock:
+        if _pool is None:
+            # Single worker process as requested
+            _pool = ProcessPoolExecutor(
+                max_workers=1,
+                initializer=_subprocess_init,
+                initargs=(_user_apps, os.getpid()),
+            )
+        return _pool
+
+
+def add_user_app(app_target: str) -> None:
+    with _pool_lock:
+        _user_apps.append(app_target)
+
+
+def _restart_pool(old_pool: ProcessPoolExecutor | None = None) -> None:
+    """Safely restart the global ProcessPoolExecutor.
+
+    Thread-safe via `_pool_lock`. Shuts down the old pool and re-creates a new
+    one with the same initializer/args.
+    """
+    global _pool
+    with _pool_lock:
+        # If another thread already swapped the pool, skip restart
+        if old_pool is not None and _pool is not old_pool:
+            return
+        _logger.error("Detected dead subprocess pool; restarting and retrying.")
+        prev_pool = _pool
+        _pool = ProcessPoolExecutor(
+            max_workers=1,
+            initializer=_subprocess_init,
+            initargs=(_user_apps, os.getpid()),
+        )
+        if prev_pool is not None:
+            # Best-effort shutdown of previous pool; letting exceptions bubble up
+            # is acceptable here and signals irrecoverable executor state.
+            prev_pool.shutdown(cancel_futures=True)
+
+
+async def _submit_with_restart(fn: Callable[..., Any], *args: Any) -> Any:
+    """Submit and await work, restarting the subprocess until it succeeds.
+
+    Retries on BrokenProcessPool or pool-shutdown RuntimeError; re-raises other
+    exceptions.
+    """
+    while True:
+        pool = _get_pool()
+        try:
+            fut = pool.submit(fn, *args)
+            return await asyncio.wrap_future(fut)
+        except BrokenProcessPool:
+            _restart_pool(old_pool=pool)
+            # loop and retry
+
+
+# ---------------------------------------------
+# Subprocess: executor registry and helpers
+# ---------------------------------------------
+
+
+def _start_parent_watchdog(
+    parent_pid: int, interval_seconds: float = WATCHDOG_INTERVAL_SECONDS
+) -> None:
+    """Terminate this process if the parent process exits or PPID changes.
+
+    This runs in a background daemon thread so it never blocks pool work.
+    """
+
+    def _watch() -> None:
+        while True:
+            # If PPID changed (parent died and we were reparented), exit.
+            if os.getppid() != parent_pid:
+                os._exit(1)
+
+            # Best-effort liveness probe in case PPID was reused.
+            try:
+                os.kill(parent_pid, 0)
+            except OSError:
+                os._exit(1)
+
+            time.sleep(interval_seconds)
+
+    threading.Thread(target=_watch, name="parent-watchdog", daemon=True).start()
+
+
+def _subprocess_init(user_apps: list[str], parent_pid: int) -> None:
+    _start_parent_watchdog(parent_pid)
+    for app_target in user_apps:
+        load_user_app(app_target)
+
+
+class _OnceResult:
+    _result: Any = None
+    _done: bool = False
+
+    def run_once(self, method: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+        if self._done:
+            return self._result
+        self._result = _call_method(method, *args, **kwargs)
+        self._done = True
+        return self._result
+
+
+@dataclass
+class _ExecutorEntry:
+    executor: Any
+    prepare: _OnceResult = field(default_factory=_OnceResult)
+    analyze: _OnceResult = field(default_factory=_OnceResult)
+    ready_to_call: bool = False
+
+
+_SUBPROC_EXECUTORS: dict[bytes, _ExecutorEntry] = {}
+
+
+def _call_method(method: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+    """Run an awaitable/coroutine to completion synchronously, otherwise return as-is."""
+    if asyncio.iscoroutinefunction(method):
+        return asyncio.run(method(*args, **kwargs))
+    else:
+        return method(*args, **kwargs)
+
+
+def _get_or_create_entry(key_bytes: bytes) -> _ExecutorEntry:
+    entry = _SUBPROC_EXECUTORS.get(key_bytes)
+    if entry is None:
+        executor_factory, spec = pickle.loads(key_bytes)
+        inst = executor_factory()
+        inst.spec = spec
+        entry = _ExecutorEntry(executor=inst)
+        _SUBPROC_EXECUTORS[key_bytes] = entry
+    return entry
+
+
+def _sp_analyze(key_bytes: bytes) -> Any:
+    entry = _get_or_create_entry(key_bytes)
+    return entry.analyze.run_once(entry.executor.analyze)
+
+
+def _sp_prepare(key_bytes: bytes) -> Any:
+    entry = _get_or_create_entry(key_bytes)
+    return entry.prepare.run_once(entry.executor.prepare)
+
+
+def _sp_call(key_bytes: bytes, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
+    entry = _get_or_create_entry(key_bytes)
+    # There's a chance that the subprocess crashes and restarts in the middle.
+    # So we want to always make sure the executor is ready before each call.
+    if not entry.ready_to_call:
+        if analyze_fn := getattr(entry.executor, "analyze", None):
+            entry.analyze.run_once(analyze_fn)
+        if prepare_fn := getattr(entry.executor, "prepare", None):
+            entry.prepare.run_once(prepare_fn)
+        entry.ready_to_call = True
+    return _call_method(entry.executor.__call__, *args, **kwargs)
+
+
+# ---------------------------------------------
+# Public stub
+# ---------------------------------------------
+
+
+class _ExecutorStub:
+    _pool: ProcessPoolExecutor
+    _key_bytes: bytes
+
+    def __init__(self, executor_factory: type[Any], spec: Any) -> None:
+        self._pool = _get_pool()
+        self._key_bytes = pickle.dumps(
+            (executor_factory, spec), protocol=pickle.HIGHEST_PROTOCOL
+        )
+
+        # Conditionally expose analyze if underlying class has it
+        if hasattr(executor_factory, "analyze"):
+            # Bind as attribute so getattr(..., "analyze", None) works upstream
+            def analyze() -> Any:
+                return execution_context.run(
+                    _submit_with_restart(_sp_analyze, self._key_bytes)
+                )
+
+            # Attach method
+            setattr(self, "analyze", analyze)
+
+        if hasattr(executor_factory, "prepare"):
+
+            async def prepare() -> Any:
+                return await _submit_with_restart(_sp_prepare, self._key_bytes)
+
+            setattr(self, "prepare", prepare)
+
+    async def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        return await _submit_with_restart(_sp_call, self._key_bytes, args, kwargs)
+
+
+def executor_stub(executor_factory: type[Any], spec: Any) -> Any:
+    """
+    Create a subprocess-backed stub for the given executor class/spec.
+
+    - Lazily initializes a singleton ProcessPoolExecutor (max_workers=1).
+    - Returns a stub object exposing async __call__ and async prepare; analyze is
+      exposed if present on the original class.
+    """
+    return _ExecutorStub(executor_factory, spec)
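Combined with the op.py change, marking an op with `gpu=True` now routes executor construction through `executor_stub`, so the executor lives in a single worker subprocess (max_workers=1) instead of being serialized behind the removed `_gpu_dispatch_lock`, and it is rebuilt automatically if that subprocess dies. A hedged sketch of how this surfaces in user code (the spec and executor below are hypothetical, not from the package):

import cocoindex

class DemoEmbed(cocoindex.op.FunctionSpec):
    """Hypothetical spec for a GPU-backed embedding op."""
    model: str = "some-model"

@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class DemoEmbedExecutor:
    """Hypothetical executor. With gpu=True, cocoindex 0.1.83 instantiates it
    inside the worker subprocess via executor_stub, keyed by the pickled
    (class, spec) pair; prepare() runs there once before the first call."""

    spec: DemoEmbed

    def prepare(self) -> None:
        # In a real op this would load the model named by self.spec.model.
        self._dim = 3

    def __call__(self, text: str) -> list[float]:
        return [float(len(text))] * self._dim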