cocoindex 0.3.4__cp311-abi3-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cocoindex/__init__.py +114 -0
  2. cocoindex/_engine.abi3.so +0 -0
  3. cocoindex/auth_registry.py +44 -0
  4. cocoindex/cli.py +830 -0
  5. cocoindex/engine_object.py +214 -0
  6. cocoindex/engine_value.py +550 -0
  7. cocoindex/flow.py +1281 -0
  8. cocoindex/functions/__init__.py +40 -0
  9. cocoindex/functions/_engine_builtin_specs.py +66 -0
  10. cocoindex/functions/colpali.py +247 -0
  11. cocoindex/functions/sbert.py +77 -0
  12. cocoindex/index.py +50 -0
  13. cocoindex/lib.py +75 -0
  14. cocoindex/llm.py +47 -0
  15. cocoindex/op.py +1047 -0
  16. cocoindex/py.typed +0 -0
  17. cocoindex/query_handler.py +57 -0
  18. cocoindex/runtime.py +78 -0
  19. cocoindex/setting.py +171 -0
  20. cocoindex/setup.py +92 -0
  21. cocoindex/sources/__init__.py +5 -0
  22. cocoindex/sources/_engine_builtin_specs.py +120 -0
  23. cocoindex/subprocess_exec.py +277 -0
  24. cocoindex/targets/__init__.py +5 -0
  25. cocoindex/targets/_engine_builtin_specs.py +153 -0
  26. cocoindex/targets/lancedb.py +466 -0
  27. cocoindex/tests/__init__.py +0 -0
  28. cocoindex/tests/test_engine_object.py +331 -0
  29. cocoindex/tests/test_engine_value.py +1724 -0
  30. cocoindex/tests/test_optional_database.py +249 -0
  31. cocoindex/tests/test_transform_flow.py +300 -0
  32. cocoindex/tests/test_typing.py +553 -0
  33. cocoindex/tests/test_validation.py +134 -0
  34. cocoindex/typing.py +834 -0
  35. cocoindex/user_app_loader.py +53 -0
  36. cocoindex/utils.py +20 -0
  37. cocoindex/validation.py +104 -0
  38. cocoindex-0.3.4.dist-info/METADATA +288 -0
  39. cocoindex-0.3.4.dist-info/RECORD +42 -0
  40. cocoindex-0.3.4.dist-info/WHEEL +4 -0
  41. cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
  42. cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
cocoindex/flow.py ADDED
@@ -0,0 +1,1281 @@
"""
Flow is the main interface for building and running flows.
"""

from __future__ import annotations

import asyncio
import datetime
import functools
import inspect
import re
from dataclasses import dataclass
from enum import Enum
from threading import Lock
from typing import (
    Any,
    Callable,
    Generic,
    Iterable,
    Sequence,
    TypeVar,
    cast,
    get_args,
    get_origin,
)

from rich.text import Text
from rich.tree import Tree

from . import _engine  # type: ignore
from . import index
from . import op
from . import setting
from .engine_object import dump_engine_object
from .engine_value import (
    make_engine_value_decoder,
    make_engine_value_encoder,
)
from .op import FunctionSpec
from .runtime import execution_context, to_async_call
from .setup import SetupChangeBundle
from .typing import analyze_type_info, encode_enriched_type, decode_engine_value_type
from .query_handler import QueryHandlerInfo, QueryHandlerResultFields
from .validation import (
    validate_flow_name,
    validate_full_flow_name,
    validate_target_name,
)

class _NameBuilder:
    _existing_names: set[str]
    _next_name_index: dict[str, int]

    def __init__(self) -> None:
        self._existing_names = set()
        self._next_name_index = {}

    def build_name(self, name: str | None, /, prefix: str) -> str:
        """
        Build a name. If the name is None, generate a name with the given prefix.
        """
        if name is not None:
            self._existing_names.add(name)
            return name

        next_idx = self._next_name_index.get(prefix, 0)
        while True:
            name = f"{prefix}{next_idx}"
            next_idx += 1
            self._next_name_index[prefix] = next_idx
            if name not in self._existing_names:
                self._existing_names.add(name)
                return name


_WORD_BOUNDARY_RE = re.compile("(?<!^)(?=[A-Z])")


def _to_snake_case(name: str) -> str:
    return _WORD_BOUNDARY_RE.sub("_", name).lower()

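# --- Editor's note: illustrative sketch, not part of the released file. ---
# How the helpers above behave: _NameBuilder hands out "<prefix><n>" names,
# skipping any name already taken, and _to_snake_case splits on word
# boundaries. Wrapped in a function so it stays inert at import time.
def _example_name_helpers() -> None:
    b = _NameBuilder()
    assert b.build_name("docs", prefix="source_") == "docs"  # explicit name wins
    assert b.build_name(None, prefix="source_") == "source_0"  # auto-generated
    assert b.build_name(None, prefix="source_") == "source_1"  # counter advances
    assert _to_snake_case("LocalFile") == "local_file"
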
def _create_data_slice(
    flow_builder_state: _FlowBuilderState,
    creator: Callable[[_engine.DataScopeRef | None, str | None], _engine.DataSlice],
    name: str | None = None,
) -> DataSlice[T]:
    if name is None:
        return DataSlice(
            _DataSliceState(
                flow_builder_state,
                lambda target: creator(target[0], target[1])
                if target is not None
                else creator(None, None),
            )
        )
    else:
        return DataSlice(_DataSliceState(flow_builder_state, creator(None, name)))


def _spec_kind(spec: Any) -> str:
    return cast(str, spec.__class__.__name__)


def _transform_helper(
    flow_builder_state: _FlowBuilderState,
    fn_spec: FunctionSpec | Callable[..., Any],
    transform_args: list[tuple[Any, str | None]],
    name: str | None = None,
) -> DataSlice[Any]:
    if isinstance(fn_spec, FunctionSpec):
        kind = _spec_kind(fn_spec)
        spec = fn_spec
    elif callable(fn_spec) and (
        op_kind := getattr(fn_spec, "__cocoindex_op_kind__", None)
    ):
        kind = op_kind
        spec = op.EmptyFunctionSpec()
    else:
        raise ValueError("transform() can only be called on a CocoIndex function")

    def _create_data_slice_inner(
        target_scope: _engine.DataScopeRef | None, name: str | None
    ) -> _engine.DataSlice:
        result = flow_builder_state.engine_flow_builder.transform(
            kind,
            dump_engine_object(spec),
            transform_args,
            target_scope,
            flow_builder_state.field_name_builder.build_name(
                name, prefix=_to_snake_case(_spec_kind(fn_spec)) + "_"
            ),
        )
        return result

    return _create_data_slice(
        flow_builder_state,
        _create_data_slice_inner,
        name,
    )


T = TypeVar("T")
S = TypeVar("S")

class _DataSliceState:
    flow_builder_state: _FlowBuilderState

    _lazy_lock: Lock | None = None  # None means it's not lazy.
    _data_slice: _engine.DataSlice | None = None
    _data_slice_creator: (
        Callable[[tuple[_engine.DataScopeRef, str] | None], _engine.DataSlice] | None
    ) = None

    def __init__(
        self,
        flow_builder_state: _FlowBuilderState,
        data_slice: _engine.DataSlice
        | Callable[[tuple[_engine.DataScopeRef, str] | None], _engine.DataSlice],
    ):
        self.flow_builder_state = flow_builder_state

        if isinstance(data_slice, _engine.DataSlice):
            self._data_slice = data_slice
        else:
            self._lazy_lock = Lock()
            self._data_slice_creator = data_slice

    @property
    def engine_data_slice(self) -> _engine.DataSlice:
        """
        Get the internal DataSlice.
        This can be blocking.
        """
        if self._lazy_lock is None:
            if self._data_slice is None:
                raise ValueError("Data slice is not initialized")
            return self._data_slice
        else:
            if self._data_slice_creator is None:
                raise ValueError("Data slice creator is not initialized")
            with self._lazy_lock:
                if self._data_slice is None:
                    self._data_slice = self._data_slice_creator(None)
                return self._data_slice

    async def engine_data_slice_async(self) -> _engine.DataSlice:
        """
        Get the internal DataSlice.
        Async version; runs the potentially blocking creation in a worker thread.
        """
        return await asyncio.to_thread(lambda: self.engine_data_slice)

    def attach_to_scope(self, scope: _engine.DataScopeRef, field_name: str) -> None:
        """
        Attach the current data slice (if not yet attached) to the given scope.
        """
        if self._lazy_lock is not None:
            with self._lazy_lock:
                if self._data_slice_creator is None:
                    raise ValueError("Data slice creator is not initialized")
                if self._data_slice is None:
                    self._data_slice = self._data_slice_creator((scope, field_name))
                    return
        # TODO: We'll support this by an identity transformer or "aliasing" in the future.
        raise ValueError("DataSlice is already attached to a field")

class DataSlice(Generic[T]):
    """A data slice represents a slice of data in a flow. It's read-only."""

    _state: _DataSliceState

    def __init__(self, state: _DataSliceState):
        self._state = state

    def __str__(self) -> str:
        return str(self._state.engine_data_slice)

    def __repr__(self) -> str:
        return repr(self._state.engine_data_slice)

    def __getitem__(self, field_name: str) -> DataSlice[T]:
        field_slice = self._state.engine_data_slice.field(field_name)
        if field_slice is None:
            raise KeyError(field_name)
        return DataSlice(_DataSliceState(self._state.flow_builder_state, field_slice))

    def row(
        self,
        /,
        *,
        max_inflight_rows: int | None = None,
        max_inflight_bytes: int | None = None,
    ) -> DataScope:
        """
        Return a scope representing each row of the table.
        """
        row_scope = self._state.flow_builder_state.engine_flow_builder.for_each(
            self._state.engine_data_slice,
            execution_options=dump_engine_object(
                _ExecutionOptions(
                    max_inflight_rows=max_inflight_rows,
                    max_inflight_bytes=max_inflight_bytes,
                ),
            ),
        )
        return DataScope(self._state.flow_builder_state, row_scope)

    def for_each(
        self,
        f: Callable[[DataScope], None],
        /,
        *,
        max_inflight_rows: int | None = None,
        max_inflight_bytes: int | None = None,
    ) -> None:
        """
        Apply a function to each row of the collection.
        """
        with self.row(
            max_inflight_rows=max_inflight_rows,
            max_inflight_bytes=max_inflight_bytes,
        ) as scope:
            f(scope)

    def transform(
        self, fn_spec: op.FunctionSpec | Callable[..., Any], *args: Any, **kwargs: Any
    ) -> DataSlice[Any]:
        """
        Apply a function to the data slice.
        """
        transform_args: list[tuple[Any, str | None]] = [
            (self._state.engine_data_slice, None)
        ]
        transform_args += [
            (self._state.flow_builder_state.get_data_slice(v), None) for v in args
        ]
        transform_args += [
            (self._state.flow_builder_state.get_data_slice(v), k)
            for k, v in kwargs.items()
        ]

        return _transform_helper(
            self._state.flow_builder_state, fn_spec, transform_args
        )

    def call(self, func: Callable[..., S], *args: Any, **kwargs: Any) -> S:
        """
        Call a function with the data slice.
        """
        return func(self, *args, **kwargs)


def _data_slice_state(data_slice: DataSlice[T]) -> _DataSliceState:
    return data_slice._state  # pylint: disable=protected-access

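# --- Editor's note: illustrative sketch, not part of the released file. ---
# How DataSlice composes inside a flow body: transform() derives a new
# read-only slice, and assigning it to a scope field attaches it under that
# name. `_clean_text` is a hypothetical custom function, defined here via the
# package's @op.function decorator.
@op.function()
def _clean_text(text: str) -> str:
    """Stands in for any real transform."""
    return text.strip()


def _example_derive_field(doc: DataScope) -> None:
    # The result is lazily attached to the scope under the name "cleaned".
    doc["cleaned"] = doc["content"].transform(_clean_text)
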
class DataScope:
    """
    A data scope in a flow.
    It has multiple fields and collectors, and allows users to add new fields and collectors.
    """

    _flow_builder_state: _FlowBuilderState
    _engine_data_scope: _engine.DataScopeRef

    def __init__(
        self, flow_builder_state: _FlowBuilderState, data_scope: _engine.DataScopeRef
    ):
        self._flow_builder_state = flow_builder_state
        self._engine_data_scope = data_scope

    def __str__(self) -> str:
        return str(self._engine_data_scope)

    def __repr__(self) -> str:
        return repr(self._engine_data_scope)

    def __getitem__(self, field_name: str) -> DataSlice[T]:
        return DataSlice(
            _DataSliceState(
                self._flow_builder_state,
                self._flow_builder_state.engine_flow_builder.scope_field(
                    self._engine_data_scope, field_name
                ),
            )
        )

    def __setitem__(self, field_name: str, value: DataSlice[T]) -> None:
        from .validation import validate_field_name

        validate_field_name(field_name)
        value._state.attach_to_scope(self._engine_data_scope, field_name)

    def __enter__(self) -> DataScope:
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        del self._engine_data_scope

    def add_collector(self, name: str | None = None) -> DataCollector:
        """
        Add a collector to the flow.
        """
        return DataCollector(
            self._flow_builder_state,
            self._engine_data_scope.add_collector(
                self._flow_builder_state.field_name_builder.build_name(
                    name, prefix="_collector_"
                )
            ),
        )

class GeneratedField(Enum):
    """
    A generated field is automatically set by the engine.
    """

    UUID = "Uuid"


class DataCollector:
    """A data collector accumulates rows of data, which can then be exported to a target."""

    _flow_builder_state: _FlowBuilderState
    _engine_data_collector: _engine.DataCollector

    def __init__(
        self,
        flow_builder_state: _FlowBuilderState,
        data_collector: _engine.DataCollector,
    ):
        self._flow_builder_state = flow_builder_state
        self._engine_data_collector = data_collector

    def collect(self, **kwargs: Any) -> None:
        """
        Collect data into the collector.
        """
        regular_kwargs = []
        auto_uuid_field = None
        for k, v in kwargs.items():
            if isinstance(v, GeneratedField):
                if v == GeneratedField.UUID:
                    if auto_uuid_field is not None:
                        raise ValueError("Only one generated UUID field is allowed")
                    auto_uuid_field = k
                else:
                    raise ValueError(f"Unexpected generated field: {v}")
            else:
                regular_kwargs.append((k, self._flow_builder_state.get_data_slice(v)))

        self._flow_builder_state.engine_flow_builder.collect(
            self._engine_data_collector, regular_kwargs, auto_uuid_field
        )

    def export(
        self,
        target_name: str,
        target_spec: op.TargetSpec,
        /,
        *,
        primary_key_fields: Sequence[str],
        attachments: Sequence[op.TargetAttachmentSpec] = (),
        vector_indexes: Sequence[index.VectorIndexDef] = (),
        vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (),
        setup_by_user: bool = False,
    ) -> None:
        """
        Export the collected data to the specified target.

        `vector_index` is for backward compatibility only. Please use `vector_indexes` instead.
        """

        validate_target_name(target_name)
        if not isinstance(target_spec, op.TargetSpec):
            raise ValueError(
                "export() can only be called on a CocoIndex target storage"
            )

        # For backward compatibility only.
        if len(vector_indexes) == 0 and len(vector_index) > 0:
            vector_indexes = [
                index.VectorIndexDef(field_name=field_name, metric=metric)
                for field_name, metric in vector_index
            ]

        index_options = index.IndexOptions(
            primary_key_fields=primary_key_fields,
            vector_indexes=vector_indexes,
        )
        self._flow_builder_state.engine_flow_builder.export(
            target_name,
            _spec_kind(target_spec),
            dump_engine_object(target_spec),
            [
                {"kind": _spec_kind(att), **dump_engine_object(att)}
                for att in attachments
            ],
            dump_engine_object(index_options),
            self._engine_data_collector,
            setup_by_user,
        )

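# --- Editor's note: illustrative sketch, not part of the released file. ---
# Collecting one row per document and exporting the result. The Postgres
# target spec is one of the package's built-in targets; the field names here
# are made up for illustration.
def _example_collect_and_export(scope: DataScope) -> None:
    from . import targets  # built-in target specs

    collector = scope.add_collector()
    with scope["documents"].row() as doc:
        collector.collect(
            id=GeneratedField.UUID,  # the engine generates a UUID per row
            filename=doc["filename"],
            text=doc["content"],
        )
    collector.export(
        "doc_texts",
        targets.Postgres(),
        primary_key_fields=["id"],
    )
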
_flow_name_builder = _NameBuilder()


class _FlowBuilderState:
    """
    A flow builder is used to build a flow.
    """

    engine_flow_builder: _engine.FlowBuilder
    field_name_builder: _NameBuilder

    def __init__(self, full_name: str):
        self.engine_flow_builder = _engine.FlowBuilder(
            full_name, execution_context.event_loop
        )
        self.field_name_builder = _NameBuilder()

    def get_data_slice(self, v: Any) -> _engine.DataSlice:
        """
        Return a data slice that represents the given value.
        """
        if isinstance(v, DataSlice):
            return v._state.engine_data_slice
        return self.engine_flow_builder.constant(encode_enriched_type(type(v)), v)


@dataclass
class _SourceRefreshOptions:
    """
    Options for refreshing a source.
    """

    refresh_interval: datetime.timedelta | None = None


@dataclass
class _ExecutionOptions:
    max_inflight_rows: int | None = None
    max_inflight_bytes: int | None = None

class FlowBuilder:
    """
    A flow builder is used to build a flow.
    """

    _state: _FlowBuilderState

    def __init__(self, state: _FlowBuilderState):
        self._state = state

    def __str__(self) -> str:
        return str(self._state.engine_flow_builder)

    def __repr__(self) -> str:
        return repr(self._state.engine_flow_builder)

    def add_source(
        self,
        spec: op.SourceSpec,
        /,
        *,
        name: str | None = None,
        refresh_interval: datetime.timedelta | None = None,
        max_inflight_rows: int | None = None,
        max_inflight_bytes: int | None = None,
    ) -> DataSlice[T]:
        """
        Import a source to the flow.
        """
        if not isinstance(spec, op.SourceSpec):
            raise ValueError("add_source() can only be called on a CocoIndex source")
        return _create_data_slice(
            self._state,
            lambda target_scope, name: self._state.engine_flow_builder.add_source(
                _spec_kind(spec),
                dump_engine_object(spec),
                target_scope,
                self._state.field_name_builder.build_name(
                    name, prefix=_to_snake_case(_spec_kind(spec)) + "_"
                ),
                refresh_options=dump_engine_object(
                    _SourceRefreshOptions(refresh_interval=refresh_interval)
                ),
                execution_options=dump_engine_object(
                    _ExecutionOptions(
                        max_inflight_rows=max_inflight_rows,
                        max_inflight_bytes=max_inflight_bytes,
                    )
                ),
            ),
            name,
        )

    def transform(
        self, fn_spec: FunctionSpec | Callable[..., Any], *args: Any, **kwargs: Any
    ) -> DataSlice[Any]:
        """
        Apply a function to inputs, returning a DataSlice.
        """
        transform_args: list[tuple[Any, str | None]] = [
            (self._state.get_data_slice(v), None) for v in args
        ]
        transform_args += [
            (self._state.get_data_slice(v), k) for k, v in kwargs.items()
        ]

        if not transform_args:
            raise ValueError("At least one input is required for transformation")

        return _transform_helper(self._state, fn_spec, transform_args)

    def declare(self, spec: op.DeclarationSpec) -> None:
        """
        Add a declaration to the flow.
        """
        self._state.engine_flow_builder.declare(dump_engine_object(spec))

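# --- Editor's note: illustrative sketch, not part of the released file. ---
# An end-to-end flow definition in the style of the project's quickstart.
# LocalFile, SplitRecursively, SentenceTransformerEmbed and Postgres are the
# package's documented built-in specs; the exact parameters below are
# illustrative assumptions. Wrapped in a function so nothing runs at import.
def _example_open_text_embedding_flow() -> Flow:
    from . import functions, sources, targets  # built-in spec modules

    def _build(builder: FlowBuilder, scope: DataScope) -> None:
        scope["documents"] = builder.add_source(
            sources.LocalFile(path="markdown_files")
        )
        embeddings = scope.add_collector()
        with scope["documents"].row() as doc:
            doc["chunks"] = doc["content"].transform(
                functions.SplitRecursively(),
                language="markdown",
                chunk_size=2000,
                chunk_overlap=500,
            )
            with doc["chunks"].row() as chunk:
                chunk["embedding"] = chunk["text"].transform(
                    functions.SentenceTransformerEmbed(
                        model="sentence-transformers/all-MiniLM-L6-v2"
                    )
                )
                embeddings.collect(
                    filename=doc["filename"],
                    location=chunk["location"],
                    text=chunk["text"],
                    embedding=chunk["embedding"],
                )
        embeddings.export(
            "doc_embeddings",
            targets.Postgres(),
            primary_key_fields=["filename", "location"],
            vector_indexes=[
                index.VectorIndexDef(
                    field_name="embedding",
                    metric=index.VectorSimilarityMetric.COSINE_SIMILARITY,
                )
            ],
        )

    return open_flow("TextEmbedding", _build)
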
@dataclass
class FlowLiveUpdaterOptions:
    """
    Options for live updating a flow.

    - live_mode: Whether to perform live update for data sources with change capture mechanisms.
    - reexport_targets: Whether to reexport to targets even if there's no change.
    - print_stats: Whether to print stats during update.
    """

    live_mode: bool = True
    reexport_targets: bool = False
    print_stats: bool = False


@dataclass
class FlowUpdaterStatusUpdates:
    """
    Status updates for a flow updater.
    """

    # Sources that are still active, i.e. they have not stopped processing.
    active_sources: list[str]

    # Sources with updates since last time.
    updated_sources: list[str]

class FlowLiveUpdater:
    """
    A live updater for a flow.
    """

    _flow: Flow
    _options: FlowLiveUpdaterOptions
    _engine_live_updater: _engine.FlowLiveUpdater | None = None

    def __init__(self, fl: Flow, options: FlowLiveUpdaterOptions | None = None):
        self._flow = fl
        self._options = options or FlowLiveUpdaterOptions()

    def __enter__(self) -> FlowLiveUpdater:
        self.start()
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.abort()
        self.wait()

    async def __aenter__(self) -> FlowLiveUpdater:
        await self.start_async()
        return self

    async def __aexit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.abort()
        await self.wait_async()

    def start(self) -> None:
        """
        Start the live updater.
        """
        execution_context.run(self.start_async())

    async def start_async(self) -> None:
        """
        Start the live updater. Async version.
        """
        self._engine_live_updater = await _engine.FlowLiveUpdater.create(
            await self._flow.internal_flow_async(), dump_engine_object(self._options)
        )

    def wait(self) -> None:
        """
        Wait for the live updater to finish.
        """
        execution_context.run(self.wait_async())

    async def wait_async(self) -> None:
        """
        Wait for the live updater to finish. Async version.
        """
        await self._get_engine_live_updater().wait_async()

    def next_status_updates(self) -> FlowUpdaterStatusUpdates:
        """
        Get the next status updates.

        It blocks until there are new status updates, e.g. processing finished
        for a batch of source updates, or the live updater stopped (aborted, or
        no more sources to process).
        """
        return execution_context.run(self.next_status_updates_async())

    async def next_status_updates_async(self) -> FlowUpdaterStatusUpdates:
        """
        Get the next status updates. Async version.
        """
        updates = await self._get_engine_live_updater().next_status_updates_async()
        return FlowUpdaterStatusUpdates(
            active_sources=updates.active_sources,
            updated_sources=updates.updated_sources,
        )

    def abort(self) -> None:
        """
        Abort the live updater.
        """
        self._get_engine_live_updater().abort()

    def update_stats(self) -> _engine.IndexUpdateInfo:
        """
        Get the index update info.
        """
        return self._get_engine_live_updater().index_update_info()

    def _get_engine_live_updater(self) -> _engine.FlowLiveUpdater:
        if self._engine_live_updater is None:
            raise RuntimeError("Live updater is not started")
        return self._engine_live_updater

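# --- Editor's note: illustrative sketch, not part of the released file. ---
# Driving a live updater and reacting to per-source status updates; the loop
# exits once every source has stopped. `my_flow` is any Flow from this module.
def _example_watch(my_flow: Flow) -> None:
    with FlowLiveUpdater(my_flow, FlowLiveUpdaterOptions(print_stats=True)) as updater:
        while True:
            updates = updater.next_status_updates()
            if updates.updated_sources:
                print("refreshed:", updates.updated_sources)
            if not updates.active_sources:
                break  # no active sources left; the updater is done
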
@dataclass
class EvaluateAndDumpOptions:
    """
    Options for evaluating and dumping a flow.
    """

    output_dir: str
    use_cache: bool = True

class Flow:
    """
    A flow describes an indexing pipeline.
    """

    _name: str
    _engine_flow_creator: Callable[[], _engine.Flow]

    _lazy_flow_lock: Lock
    _lazy_query_handler_args: list[tuple[Any, ...]]
    _lazy_engine_flow: _engine.Flow | None = None

    def __init__(self, name: str, engine_flow_creator: Callable[[], _engine.Flow]):
        validate_flow_name(name)
        self._name = name
        self._engine_flow_creator = engine_flow_creator
        self._lazy_flow_lock = Lock()
        self._lazy_query_handler_args = []

    def _render_spec(self, verbose: bool = False) -> Tree:
        """
        Render the flow spec as a styled rich Tree with hierarchical structure.
        """
        spec = self._get_spec(verbose=verbose)
        tree = Tree(f"Flow: {self.full_name}", style="cyan")

        def build_tree(label: str, lines: list[Any]) -> Tree:
            node = Tree(label=label if lines else label + " None", style="cyan")
            for line in lines:
                child_node = node.add(Text(line.content, style="yellow"))
                child_node.children = build_tree("", line.children).children
            return node

        for section, lines in spec.sections:
            section_node = build_tree(f"{section}:", lines)
            tree.children.append(section_node)
        return tree

    def _get_spec(self, verbose: bool = False) -> _engine.RenderedSpec:
        return self.internal_flow().get_spec(
            output_mode="verbose" if verbose else "concise"
        )

    def _get_schema(self) -> list[tuple[str, str, str]]:
        return cast(list[tuple[str, str, str]], self.internal_flow().get_schema())

    def __str__(self) -> str:
        return str(self._get_spec())

    def __repr__(self) -> str:
        return repr(self.internal_flow())

    @property
    def name(self) -> str:
        """
        Get the name of the flow.
        """
        return self._name

    @property
    def full_name(self) -> str:
        """
        Get the full name of the flow.
        """
        return get_flow_full_name(self._name)

    def update(self, /, *, reexport_targets: bool = False) -> _engine.IndexUpdateInfo:
        """
        Update the index defined by the flow.
        Once the function returns, the index is fresh up to the moment when the function is called.
        """
        return execution_context.run(
            self.update_async(reexport_targets=reexport_targets)
        )

    async def update_async(
        self, /, *, reexport_targets: bool = False
    ) -> _engine.IndexUpdateInfo:
        """
        Update the index defined by the flow. Async version.
        Once the function returns, the index is fresh up to the moment when the function is called.
        """
        async with FlowLiveUpdater(
            self,
            FlowLiveUpdaterOptions(live_mode=False, reexport_targets=reexport_targets),
        ) as updater:
            await updater.wait_async()
            return updater.update_stats()

    def evaluate_and_dump(
        self, options: EvaluateAndDumpOptions
    ) -> _engine.IndexUpdateInfo:
        """
        Evaluate the flow and dump flow outputs to files.
        """
        return self.internal_flow().evaluate_and_dump(dump_engine_object(options))

    def internal_flow(self) -> _engine.Flow:
        """
        Get the engine flow.
        """
        if self._lazy_engine_flow is not None:
            return self._lazy_engine_flow
        return self._internal_flow()

    async def internal_flow_async(self) -> _engine.Flow:
        """
        Get the engine flow. Async version.
        """
        if self._lazy_engine_flow is not None:
            return self._lazy_engine_flow
        return await asyncio.to_thread(self._internal_flow)

    def _internal_flow(self) -> _engine.Flow:
        """
        Build the engine flow if it hasn't been built yet. This can be blocking.
        """
        with self._lazy_flow_lock:
            if self._lazy_engine_flow is not None:
                return self._lazy_engine_flow

            engine_flow = self._engine_flow_creator()
            self._lazy_engine_flow = engine_flow
            for args in self._lazy_query_handler_args:
                engine_flow.add_query_handler(*args)
            self._lazy_query_handler_args = []

            return engine_flow

    def setup(self, report_to_stdout: bool = False) -> None:
        """
        Set up persistent backends of the flow.
        """
        execution_context.run(self.setup_async(report_to_stdout=report_to_stdout))

    async def setup_async(self, report_to_stdout: bool = False) -> None:
        """
        Set up persistent backends of the flow. Async version.
        """
        bundle = await make_setup_bundle_async([self])
        await bundle.describe_and_apply_async(report_to_stdout=report_to_stdout)

    def drop(self, report_to_stdout: bool = False) -> None:
        """
        Drop persistent backends of the flow.

        The current instance is still valid after it's called.
        For example, you can still call `setup()` after it, to set up the persistent backends again.

        Call `close()` if you want to remove the flow from the current process.
        """
        execution_context.run(self.drop_async(report_to_stdout=report_to_stdout))

    async def drop_async(self, report_to_stdout: bool = False) -> None:
        """
        Drop persistent backends of the flow. Async version.
        """
        bundle = await make_drop_bundle_async([self])
        await bundle.describe_and_apply_async(report_to_stdout=report_to_stdout)

    def close(self) -> None:
        """
        Close the flow. It will remove the flow from the current process to free up resources.
        After it's called, methods of the flow should no longer be called.

        This will NOT touch the persistent backends of the flow.
        """
        _engine.remove_flow_context(self.full_name)
        self._lazy_engine_flow = None
        with _flows_lock:
            del _flows[self.name]

    def add_query_handler(
        self,
        name: str,
        handler: Callable[[str], Any],
        /,
        *,
        result_fields: QueryHandlerResultFields | None = None,
    ) -> None:
        """
        Add a query handler to the flow.
        """
        async_handler = to_async_call(handler)

        async def _handler(query: str) -> dict[str, Any]:
            handler_result = await async_handler(query)
            return {
                "results": [
                    [(k, dump_engine_object(v)) for (k, v) in result.items()]
                    for result in handler_result.results
                ],
                "query_info": dump_engine_object(handler_result.query_info),
            }

        handler_info = dump_engine_object(QueryHandlerInfo(result_fields=result_fields))
        with self._lazy_flow_lock:
            if self._lazy_engine_flow is not None:
                self._lazy_engine_flow.add_query_handler(name, _handler, handler_info)
            else:
                self._lazy_query_handler_args.append((name, _handler, handler_info))

    def query_handler(
        self,
        name: str | None = None,
        result_fields: QueryHandlerResultFields | None = None,
    ) -> Callable[[Callable[[str], Any]], Callable[[str], Any]]:
        """
        A decorator to declare a query handler.
        """

        def _inner(handler: Callable[[str], Any]) -> Callable[[str], Any]:
            self.add_query_handler(
                name or handler.__qualname__, handler, result_fields=result_fields
            )
            return handler

        return _inner

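# --- Editor's note: illustrative sketch, not part of the released file. ---
# Registering a query handler on a flow. Per `add_query_handler` above, the
# handler must return an object exposing `.results` (an iterable of dict
# rows) and `.query_info`; the search body itself is a placeholder.
def _example_register_handler(my_flow: Flow) -> None:
    @my_flow.query_handler(name="search")
    def search(query: str) -> Any:
        # Run a vector search against the exported target here and return a
        # result object with `.results` and `.query_info`.
        ...
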
def _create_lazy_flow(
    name: str | None, fl_def: Callable[[FlowBuilder, DataScope], None]
) -> Flow:
    """
    Create a flow without building it yet.
    The flow will be built the first time it's actually needed.
    """
    flow_name = _flow_name_builder.build_name(name, prefix="_flow_")

    def _create_engine_flow() -> _engine.Flow:
        flow_full_name = get_flow_full_name(flow_name)
        validate_full_flow_name(flow_full_name)
        flow_builder_state = _FlowBuilderState(flow_full_name)
        root_scope = DataScope(
            flow_builder_state, flow_builder_state.engine_flow_builder.root_scope()
        )
        fl_def(FlowBuilder(flow_builder_state), root_scope)
        return flow_builder_state.engine_flow_builder.build_flow()

    return Flow(flow_name, _create_engine_flow)

_flows_lock = Lock()
_flows: dict[str, Flow] = {}


def get_flow_full_name(name: str) -> str:
    """
    Get the full name of a flow.
    """
    return f"{setting.get_app_namespace(trailing_delimiter='.')}{name}"


def open_flow(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
    """
    Open a flow with the given name and definition.
    """
    with _flows_lock:
        if name in _flows:
            raise KeyError(f"Flow with name {name} already exists")
        fl = _flows[name] = _create_lazy_flow(name, fl_def)
    return fl


def add_flow_def(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
    """
    DEPRECATED: Use `open_flow()` instead.
    """
    return open_flow(name, fl_def)


def remove_flow(fl: Flow) -> None:
    """
    DEPRECATED: Use `Flow.close()` instead.
    """
    fl.close()


def flow_def(
    name: str | None = None,
) -> Callable[[Callable[[FlowBuilder, DataScope], None]], Flow]:
    """
    A decorator to wrap the flow definition.
    """
    return lambda fl_def: open_flow(name or fl_def.__name__, fl_def)

def flow_names() -> list[str]:
    """
    Get the names of all flows.
    """
    with _flows_lock:
        return list(_flows.keys())


def flows() -> dict[str, Flow]:
    """
    Get all flows.
    """
    with _flows_lock:
        return dict(_flows)


def flow_by_name(name: str) -> Flow:
    """
    Get a flow by name.
    """
    with _flows_lock:
        return _flows[name]


def ensure_all_flows_built() -> None:
    """
    Ensure all flows are built.
    """
    execution_context.run(ensure_all_flows_built_async())


async def ensure_all_flows_built_async() -> None:
    """
    Ensure all flows are built.
    """
    for fl in flows().values():
        await fl.internal_flow_async()

def update_all_flows(
    options: FlowLiveUpdaterOptions,
) -> dict[str, _engine.IndexUpdateInfo]:
    """
    Update all flows.
    """
    return execution_context.run(update_all_flows_async(options))


async def update_all_flows_async(
    options: FlowLiveUpdaterOptions,
) -> dict[str, _engine.IndexUpdateInfo]:
    """
    Update all flows.
    """
    await ensure_all_flows_built_async()

    async def _update_flow(name: str, fl: Flow) -> tuple[str, _engine.IndexUpdateInfo]:
        async with FlowLiveUpdater(fl, options) as updater:
            await updater.wait_async()
            return (name, updater.update_stats())

    fls = flows()
    all_stats = await asyncio.gather(
        *(_update_flow(name, fl) for (name, fl) in fls.items())
    )
    return dict(all_stats)

def _get_data_slice_annotation_type(
    data_slice_type: type[DataSlice[T] | inspect._empty],
) -> type[T] | None:
    type_args = get_args(data_slice_type)
    if data_slice_type is inspect.Parameter.empty or data_slice_type is DataSlice:
        return None
    if get_origin(data_slice_type) != DataSlice or len(type_args) != 1:
        raise ValueError(f"Expect a DataSlice[T] type, but got {data_slice_type}")
    return cast(type[T] | None, type_args[0])


_transform_flow_name_builder = _NameBuilder()


@dataclass
class TransformFlowInfo(Generic[T]):
    engine_flow: _engine.TransientFlow
    result_decoder: Callable[[Any], T]


@dataclass
class FlowArgInfo:
    name: str
    type_hint: Any
    encoder: Callable[[Any], Any]

class TransformFlow(Generic[T]):
    """
    A transient transformation flow that transforms in-memory data.
    """

    _flow_fn: Callable[..., DataSlice[T]]
    _flow_name: str
    _args_info: list[FlowArgInfo]

    _lazy_lock: asyncio.Lock
    _lazy_flow_info: TransformFlowInfo[T] | None = None

    def __init__(
        self,
        flow_fn: Callable[..., DataSlice[T]],
        /,
        name: str | None = None,
    ):
        self._flow_fn = flow_fn
        self._flow_name = _transform_flow_name_builder.build_name(
            name, prefix="_transform_flow_"
        )
        self._lazy_lock = asyncio.Lock()

        sig = inspect.signature(flow_fn)
        args_info = []
        for param_name, param in sig.parameters.items():
            if param.kind not in (
                inspect.Parameter.POSITIONAL_OR_KEYWORD,
                inspect.Parameter.KEYWORD_ONLY,
            ):
                raise ValueError(
                    f"Parameter `{param_name}` is not a parameter that can be passed by name"
                )
            value_type_annotation: type | None = _get_data_slice_annotation_type(
                param.annotation
            )
            if value_type_annotation is None:
                raise ValueError(
                    f"Parameter `{param_name}` for {flow_fn} has no value type annotation. "
                    "Please use `cocoindex.DataSlice[T]` where T is the type of the value."
                )
            encoder = make_engine_value_encoder(
                analyze_type_info(value_type_annotation)
            )
            args_info.append(FlowArgInfo(param_name, value_type_annotation, encoder))
        self._args_info = args_info

    def __call__(self, *args: Any, **kwargs: Any) -> DataSlice[T]:
        return self._flow_fn(*args, **kwargs)

    @property
    def _flow_info(self) -> TransformFlowInfo[T]:
        if self._lazy_flow_info is not None:
            return self._lazy_flow_info
        return execution_context.run(self._flow_info_async())

    async def _flow_info_async(self) -> TransformFlowInfo[T]:
        if self._lazy_flow_info is not None:
            return self._lazy_flow_info
        async with self._lazy_lock:
            if self._lazy_flow_info is None:
                self._lazy_flow_info = await self._build_flow_info_async()
            return self._lazy_flow_info

    async def _build_flow_info_async(self) -> TransformFlowInfo[T]:
        flow_builder_state = _FlowBuilderState(self._flow_name)
        kwargs: dict[str, DataSlice[T]] = {}
        for arg_info in self._args_info:
            encoded_type = encode_enriched_type(arg_info.type_hint)
            if encoded_type is None:
                raise ValueError(f"Parameter `{arg_info.name}` has no type annotation")
            engine_ds = flow_builder_state.engine_flow_builder.add_direct_input(
                arg_info.name, encoded_type
            )
            kwargs[arg_info.name] = DataSlice(
                _DataSliceState(flow_builder_state, engine_ds)
            )

        output = await asyncio.to_thread(lambda: self._flow_fn(**kwargs))
        output_data_slice = await _data_slice_state(output).engine_data_slice_async()

        flow_builder_state.engine_flow_builder.set_direct_output(output_data_slice)
        engine_flow = (
            await flow_builder_state.engine_flow_builder.build_transient_flow_async(
                execution_context.event_loop
            )
        )
        engine_return_type = output_data_slice.data_type().schema()
        python_return_type: type[T] | None = _get_data_slice_annotation_type(
            inspect.signature(self._flow_fn).return_annotation
        )
        result_decoder = make_engine_value_decoder(
            [],
            decode_engine_value_type(engine_return_type["type"]),
            analyze_type_info(python_return_type),
        )

        return TransformFlowInfo(engine_flow, result_decoder)

    def __str__(self) -> str:
        return str(self._flow_info.engine_flow)

    def __repr__(self) -> str:
        return repr(self._flow_info.engine_flow)

    def internal_flow(self) -> _engine.TransientFlow:
        """
        Get the internal flow.
        """
        return self._flow_info.engine_flow

    def eval(self, *args: Any, **kwargs: Any) -> T:
        """
        Evaluate the transform flow.
        """
        return execution_context.run(self.eval_async(*args, **kwargs))

    async def eval_async(self, *args: Any, **kwargs: Any) -> T:
        """
        Evaluate the transform flow. Async version.
        """
        flow_info = await self._flow_info_async()
        params = []
        for i, arg_info in enumerate(self._args_info):
            if i < len(args):
                arg = args[i]
            elif arg_info.name in kwargs:
                arg = kwargs[arg_info.name]
            else:
                raise ValueError(f"Parameter {arg_info.name} is not provided")
            params.append(arg_info.encoder(arg))
        engine_result = await flow_info.engine_flow.evaluate_async(params)
        return flow_info.result_decoder(engine_result)

def transform_flow() -> Callable[[Callable[..., DataSlice[T]]], TransformFlow[T]]:
    """
    A decorator to wrap the transform function.
    """

    def _transform_flow_wrapper(fn: Callable[..., DataSlice[T]]) -> TransformFlow[T]:
        _transform_flow = TransformFlow(fn)
        functools.update_wrapper(_transform_flow, fn)
        return _transform_flow

    return _transform_flow_wrapper

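# --- Editor's note: illustrative sketch, not part of the released file. ---
# A transform flow evaluated eagerly on in-memory data, following the
# documented pattern. SentenceTransformerEmbed is the package's built-in
# embedding function; the model name is an illustrative choice.
def _example_transform_flow() -> None:
    from .functions import SentenceTransformerEmbed

    @transform_flow()
    def embed_text(text: DataSlice[str]) -> DataSlice[list[float]]:
        return text.transform(
            SentenceTransformerEmbed(model="sentence-transformers/all-MiniLM-L6-v2")
        )

    vec = embed_text.eval("hello world")  # runs the transient flow on one value
    print(len(vec))
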
async def make_setup_bundle_async(flow_iter: Iterable[Flow]) -> SetupChangeBundle:
    """
    Make a bundle to set up the given flows.
    """
    full_names = []
    for fl in flow_iter:
        await fl.internal_flow_async()
        full_names.append(fl.full_name)
    return SetupChangeBundle(_engine.make_setup_bundle(full_names))


def make_setup_bundle(flow_iter: Iterable[Flow]) -> SetupChangeBundle:
    """
    Make a bundle to set up the given flows.
    """
    return execution_context.run(make_setup_bundle_async(flow_iter))


async def make_drop_bundle_async(flow_iter: Iterable[Flow]) -> SetupChangeBundle:
    """
    Make a bundle to drop the given flows.
    """
    full_names = []
    for fl in flow_iter:
        await fl.internal_flow_async()
        full_names.append(fl.full_name)
    return SetupChangeBundle(_engine.make_drop_bundle(full_names))


def make_drop_bundle(flow_iter: Iterable[Flow]) -> SetupChangeBundle:
    """
    Make a bundle to drop the given flows.
    """
    return execution_context.run(make_drop_bundle_async(flow_iter))

def setup_all_flows(report_to_stdout: bool = False) -> None:
    """
    Set up all flows registered in the current process.
    """
    with _flows_lock:
        flow_list = list(_flows.values())
    make_setup_bundle(flow_list).describe_and_apply(report_to_stdout=report_to_stdout)


def drop_all_flows(report_to_stdout: bool = False) -> None:
    """
    Drop all flows registered in the current process.
    """
    with _flows_lock:
        flow_list = list(_flows.values())
    make_drop_bundle(flow_list).describe_and_apply(report_to_stdout=report_to_stdout)
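

# --- Editor's note: illustrative sketch, not part of the released file. ---
# The typical one-shot lifecycle, mirroring the `cocoindex setup` and
# `cocoindex update` CLI commands. Assumes flows were already defined via
# @flow_def and that the library was initialized beforehand (e.g. via the
# package's documented `cocoindex.init()`).
def _example_lifecycle() -> None:
    setup_all_flows(report_to_stdout=True)  # create backing tables/indexes
    stats = update_all_flows(FlowLiveUpdaterOptions(live_mode=False))
    for name, info in stats.items():
        print(name, info)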