cocoindex 0.1.81__cp313-cp313-manylinux_2_28_x86_64.whl → 0.1.83__cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cocoindex/cli.py CHANGED
@@ -4,9 +4,7 @@ import datetime
4
4
  import importlib.util
5
5
  import os
6
6
  import signal
7
- import sys
8
7
  import threading
9
- import types
10
8
  from types import FrameType
11
9
  from typing import Any, Iterable
12
10
 
@@ -20,6 +18,8 @@ from rich.table import Table
20
18
  from . import flow, lib, setting
21
19
  from .setup import flow_names_with_setup
22
20
  from .runtime import execution_context
21
+ from .subprocess_exec import add_user_app
22
+ from .user_app_loader import load_user_app
23
23
 
24
24
  # Create ServerSettings lazily upon first call, as environment variables may be loaded from files, etc.
25
25
  COCOINDEX_HOST = "https://cocoindex.io"
@@ -76,50 +76,9 @@ def _get_app_ref_from_specifier(
76
76
  return app_ref
77
77
 
78
78
 
79
- def _load_user_app(app_target: str) -> types.ModuleType:
80
- """
81
- Loads the user's application, which can be a file path or an installed module name.
82
- Exits on failure.
83
- """
84
- if not app_target:
85
- raise click.ClickException("Application target not provided.")
86
-
87
- looks_like_path = os.sep in app_target or app_target.lower().endswith(".py")
88
-
89
- if looks_like_path:
90
- if not os.path.isfile(app_target):
91
- raise click.ClickException(f"Application file path not found: {app_target}")
92
- app_path = os.path.abspath(app_target)
93
- app_dir = os.path.dirname(app_path)
94
- module_name = os.path.splitext(os.path.basename(app_path))[0]
95
-
96
- if app_dir not in sys.path:
97
- sys.path.insert(0, app_dir)
98
- try:
99
- spec = importlib.util.spec_from_file_location(module_name, app_path)
100
- if spec is None:
101
- raise ImportError(f"Could not create spec for file: {app_path}")
102
- module = importlib.util.module_from_spec(spec)
103
- sys.modules[spec.name] = module
104
- if spec.loader is None:
105
- raise ImportError(f"Could not create loader for file: {app_path}")
106
- spec.loader.exec_module(module)
107
- return module
108
- except (ImportError, FileNotFoundError, PermissionError) as e:
109
- raise click.ClickException(f"Failed importing file '{app_path}': {e}")
110
- finally:
111
- if app_dir in sys.path and sys.path[0] == app_dir:
112
- sys.path.pop(0)
113
-
114
- # Try as module
115
- try:
116
- return importlib.import_module(app_target)
117
- except ImportError as e:
118
- raise click.ClickException(f"Failed to load module '{app_target}': {e}")
119
- except Exception as e:
120
- raise click.ClickException(
121
- f"Unexpected error importing module '{app_target}': {e}"
122
- )
79
+ def _load_user_app(app_target: str) -> None:
80
+ load_user_app(app_target)
81
+ add_user_app(app_target)
123
82
 
124
83
 
125
84
  def _initialize_cocoindex_in_process() -> None:
cocoindex/convert.py CHANGED
@@ -9,26 +9,26 @@ import datetime
9
9
  import inspect
10
10
  import warnings
11
11
  from enum import Enum
12
- from typing import Any, Callable, Mapping, get_origin
12
+ from typing import Any, Callable, Mapping, Type, get_origin
13
13
 
14
14
  import numpy as np
15
15
 
16
16
  from .typing import (
17
17
  KEY_FIELD_NAME,
18
18
  TABLE_TYPES,
19
- analyze_type_info,
20
- encode_enriched_type,
21
- is_namedtuple_type,
22
- is_struct_type,
23
- AnalyzedTypeInfo,
24
19
  AnalyzedAnyType,
20
+ AnalyzedBasicType,
25
21
  AnalyzedDictType,
26
22
  AnalyzedListType,
27
- AnalyzedBasicType,
23
+ AnalyzedStructType,
24
+ AnalyzedTypeInfo,
28
25
  AnalyzedUnionType,
29
26
  AnalyzedUnknownType,
30
- AnalyzedStructType,
27
+ analyze_type_info,
28
+ encode_enriched_type,
29
+ is_namedtuple_type,
31
30
  is_numpy_number_type,
31
+ is_struct_type,
32
32
  )
33
33
 
34
34
 
@@ -50,34 +50,6 @@ class ChildFieldPath:
50
50
  self._field_path.pop()
51
51
 
52
52
 
53
- def encode_engine_value(value: Any) -> Any:
54
- """Encode a Python value to an engine value."""
55
- if dataclasses.is_dataclass(value):
56
- return [
57
- encode_engine_value(getattr(value, f.name))
58
- for f in dataclasses.fields(value)
59
- ]
60
- if is_namedtuple_type(type(value)):
61
- return [encode_engine_value(getattr(value, name)) for name in value._fields]
62
- if isinstance(value, np.number):
63
- return value.item()
64
- if isinstance(value, np.ndarray):
65
- return value
66
- if isinstance(value, (list, tuple)):
67
- return [encode_engine_value(v) for v in value]
68
- if isinstance(value, dict):
69
- if not value:
70
- return {}
71
-
72
- first_val = next(iter(value.values()))
73
- if is_struct_type(type(first_val)): # KTable
74
- return [
75
- [encode_engine_value(k)] + encode_engine_value(v)
76
- for k, v in value.items()
77
- ]
78
- return value
79
-
80
-
81
53
  _CONVERTIBLE_KINDS = {
82
54
  ("Float32", "Float64"),
83
55
  ("LocalDateTime", "OffsetDateTime"),
@@ -91,6 +63,118 @@ def _is_type_kind_convertible_to(src_type_kind: str, dst_type_kind: str) -> bool
91
63
  )
92
64
 
93
65
 
66
+ # Pre-computed type info for missing/Any type annotations
67
+ ANY_TYPE_INFO = analyze_type_info(inspect.Parameter.empty)
68
+
69
+
70
+ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
71
+ """
72
+ Create an encoder closure for a specific type.
73
+ """
74
+ variant = type_info.variant
75
+
76
+ if isinstance(variant, AnalyzedUnknownType):
77
+ raise ValueError(f"Type annotation `{type_info.core_type}` is unsupported")
78
+
79
+ if isinstance(variant, AnalyzedListType):
80
+ elem_type_info = (
81
+ analyze_type_info(variant.elem_type) if variant.elem_type else ANY_TYPE_INFO
82
+ )
83
+ if isinstance(elem_type_info.variant, AnalyzedStructType):
84
+ elem_encoder = make_engine_value_encoder(elem_type_info)
85
+
86
+ def encode_struct_list(value: Any) -> Any:
87
+ return None if value is None else [elem_encoder(v) for v in value]
88
+
89
+ return encode_struct_list
90
+
91
+ if isinstance(variant, AnalyzedDictType):
92
+ if not variant.value_type:
93
+ return lambda value: value
94
+
95
+ value_type_info = analyze_type_info(variant.value_type)
96
+ if isinstance(value_type_info.variant, AnalyzedStructType):
97
+
98
+ def encode_struct_dict(value: Any) -> Any:
99
+ if not isinstance(value, dict):
100
+ return value
101
+ if not value:
102
+ return []
103
+
104
+ sample_key, sample_val = next(iter(value.items()))
105
+ key_type, val_type = type(sample_key), type(sample_val)
106
+
107
+ # Handle KTable case
108
+ if value and is_struct_type(val_type):
109
+ key_encoder = (
110
+ make_engine_value_encoder(analyze_type_info(key_type))
111
+ if is_struct_type(key_type)
112
+ else make_engine_value_encoder(ANY_TYPE_INFO)
113
+ )
114
+ value_encoder = make_engine_value_encoder(
115
+ analyze_type_info(val_type)
116
+ )
117
+ return [
118
+ [key_encoder(k)] + value_encoder(v) for k, v in value.items()
119
+ ]
120
+ return {key_encoder(k): value_encoder(v) for k, v in value.items()}
121
+
122
+ return encode_struct_dict
123
+
124
+ if isinstance(variant, AnalyzedStructType):
125
+ struct_type = variant.struct_type
126
+
127
+ if dataclasses.is_dataclass(struct_type):
128
+ fields = dataclasses.fields(struct_type)
129
+ field_encoders = [
130
+ make_engine_value_encoder(analyze_type_info(f.type)) for f in fields
131
+ ]
132
+ field_names = [f.name for f in fields]
133
+
134
+ def encode_dataclass(value: Any) -> Any:
135
+ if not dataclasses.is_dataclass(value):
136
+ return value
137
+ return [
138
+ encoder(getattr(value, name))
139
+ for encoder, name in zip(field_encoders, field_names)
140
+ ]
141
+
142
+ return encode_dataclass
143
+
144
+ elif is_namedtuple_type(struct_type):
145
+ annotations = struct_type.__annotations__
146
+ field_names = list(getattr(struct_type, "_fields", ()))
147
+ field_encoders = [
148
+ make_engine_value_encoder(
149
+ analyze_type_info(annotations[name])
150
+ if name in annotations
151
+ else ANY_TYPE_INFO
152
+ )
153
+ for name in field_names
154
+ ]
155
+
156
+ def encode_namedtuple(value: Any) -> Any:
157
+ if not is_namedtuple_type(type(value)):
158
+ return value
159
+ return [
160
+ encoder(getattr(value, name))
161
+ for encoder, name in zip(field_encoders, field_names)
162
+ ]
163
+
164
+ return encode_namedtuple
165
+
166
+ def encode_basic_value(value: Any) -> Any:
167
+ if isinstance(value, np.number):
168
+ return value.item()
169
+ if isinstance(value, np.ndarray):
170
+ return value
171
+ if isinstance(value, (list, tuple)):
172
+ return [encode_basic_value(v) for v in value]
173
+ return value
174
+
175
+ return encode_basic_value
176
+
177
+
94
178
  def make_engine_value_decoder(
95
179
  field_path: list[str],
96
180
  src_type: dict[str, Any],
cocoindex/flow.py CHANGED
@@ -9,15 +9,6 @@ import datetime
9
9
  import functools
10
10
  import inspect
11
11
  import re
12
-
13
- from .validation import (
14
- validate_flow_name,
15
- NamingError,
16
- validate_full_flow_name,
17
- validate_target_name,
18
- )
19
- from .typing import analyze_type_info
20
-
21
12
  from dataclasses import dataclass
22
13
  from enum import Enum
23
14
  from threading import Lock
@@ -25,13 +16,13 @@ from typing import (
25
16
  Any,
26
17
  Callable,
27
18
  Generic,
19
+ Iterable,
28
20
  NamedTuple,
29
21
  Sequence,
30
22
  TypeVar,
31
23
  cast,
32
24
  get_args,
33
25
  get_origin,
34
- Iterable,
35
26
  )
36
27
 
37
28
  from rich.text import Text
@@ -41,11 +32,20 @@ from . import _engine # type: ignore
41
32
  from . import index
42
33
  from . import op
43
34
  from . import setting
44
- from .convert import dump_engine_object, encode_engine_value, make_engine_value_decoder
35
+ from .convert import (
36
+ dump_engine_object,
37
+ make_engine_value_decoder,
38
+ make_engine_value_encoder,
39
+ )
45
40
  from .op import FunctionSpec
46
41
  from .runtime import execution_context
47
42
  from .setup import SetupChangeBundle
48
- from .typing import encode_enriched_type
43
+ from .typing import analyze_type_info, encode_enriched_type
44
+ from .validation import (
45
+ validate_flow_name,
46
+ validate_full_flow_name,
47
+ validate_target_name,
48
+ )
49
49
 
50
50
 
51
51
  class _NameBuilder:
@@ -105,18 +105,26 @@ def _spec_kind(spec: Any) -> str:
105
105
 
106
106
  def _transform_helper(
107
107
  flow_builder_state: _FlowBuilderState,
108
- fn_spec: FunctionSpec,
108
+ fn_spec: FunctionSpec | Callable[..., Any],
109
109
  transform_args: list[tuple[Any, str | None]],
110
110
  name: str | None = None,
111
111
  ) -> DataSlice[Any]:
112
- if not isinstance(fn_spec, FunctionSpec):
112
+ if isinstance(fn_spec, FunctionSpec):
113
+ kind = _spec_kind(fn_spec)
114
+ spec = fn_spec
115
+ elif callable(fn_spec) and (
116
+ op_kind := getattr(fn_spec, "__cocoindex_op_kind__", None)
117
+ ):
118
+ kind = op_kind
119
+ spec = op.EmptyFunctionSpec()
120
+ else:
113
121
  raise ValueError("transform() can only be called on a CocoIndex function")
114
122
 
115
123
  return _create_data_slice(
116
124
  flow_builder_state,
117
125
  lambda target_scope, name: flow_builder_state.engine_flow_builder.transform(
118
- _spec_kind(fn_spec),
119
- dump_engine_object(fn_spec),
126
+ kind,
127
+ dump_engine_object(spec),
120
128
  transform_args,
121
129
  target_scope,
122
130
  flow_builder_state.field_name_builder.build_name(
@@ -245,7 +253,7 @@ class DataSlice(Generic[T]):
245
253
  f(scope)
246
254
 
247
255
  def transform(
248
- self, fn_spec: op.FunctionSpec, *args: Any, **kwargs: Any
256
+ self, fn_spec: op.FunctionSpec | Callable[..., Any], *args: Any, **kwargs: Any
249
257
  ) -> DataSlice[Any]:
250
258
  """
251
259
  Apply a function to the data slice.
@@ -513,7 +521,7 @@ class FlowBuilder:
513
521
  )
514
522
 
515
523
  def transform(
516
- self, fn_spec: FunctionSpec, *args: Any, **kwargs: Any
524
+ self, fn_spec: FunctionSpec | Callable[..., Any], *args: Any, **kwargs: Any
517
525
  ) -> DataSlice[Any]:
518
526
  """
519
527
  Apply a function to inputs, returning a DataSlice.
@@ -978,6 +986,12 @@ class TransformFlowInfo(NamedTuple):
978
986
  result_decoder: Callable[[Any], T]
979
987
 
980
988
 
989
+ class FlowArgInfo(NamedTuple):
990
+ name: str
991
+ type_hint: Any
992
+ encoder: Callable[[Any], Any]
993
+
994
+
981
995
  class TransformFlow(Generic[T]):
982
996
  """
983
997
  A transient transformation flow that transforms in-memory data.
@@ -985,8 +999,7 @@ class TransformFlow(Generic[T]):
985
999
 
986
1000
  _flow_fn: Callable[..., DataSlice[T]]
987
1001
  _flow_name: str
988
- _flow_arg_types: list[Any]
989
- _param_names: list[str]
1002
+ _args_info: list[FlowArgInfo]
990
1003
 
991
1004
  _lazy_lock: asyncio.Lock
992
1005
  _lazy_flow_info: TransformFlowInfo | None = None
@@ -994,7 +1007,6 @@ class TransformFlow(Generic[T]):
994
1007
  def __init__(
995
1008
  self,
996
1009
  flow_fn: Callable[..., DataSlice[T]],
997
- flow_arg_types: Sequence[Any],
998
1010
  /,
999
1011
  name: str | None = None,
1000
1012
  ):
@@ -1002,9 +1014,32 @@ class TransformFlow(Generic[T]):
1002
1014
  self._flow_name = _transform_flow_name_builder.build_name(
1003
1015
  name, prefix="_transform_flow_"
1004
1016
  )
1005
- self._flow_arg_types = list(flow_arg_types)
1006
1017
  self._lazy_lock = asyncio.Lock()
1007
1018
 
1019
+ sig = inspect.signature(flow_fn)
1020
+ args_info = []
1021
+ for param_name, param in sig.parameters.items():
1022
+ if param.kind not in (
1023
+ inspect.Parameter.POSITIONAL_OR_KEYWORD,
1024
+ inspect.Parameter.KEYWORD_ONLY,
1025
+ ):
1026
+ raise ValueError(
1027
+ f"Parameter `{param_name}` is not a parameter can be passed by name"
1028
+ )
1029
+ value_type_annotation: type | None = _get_data_slice_annotation_type(
1030
+ param.annotation
1031
+ )
1032
+ if value_type_annotation is None:
1033
+ raise ValueError(
1034
+ f"Parameter `{param_name}` for {flow_fn} has no value type annotation. "
1035
+ "Please use `cocoindex.DataSlice[T]` where T is the type of the value."
1036
+ )
1037
+ encoder = make_engine_value_encoder(
1038
+ analyze_type_info(value_type_annotation)
1039
+ )
1040
+ args_info.append(FlowArgInfo(param_name, value_type_annotation, encoder))
1041
+ self._args_info = args_info
1042
+
1008
1043
  def __call__(self, *args: Any, **kwargs: Any) -> DataSlice[T]:
1009
1044
  return self._flow_fn(*args, **kwargs)
1010
1045
 
@@ -1024,31 +1059,15 @@ class TransformFlow(Generic[T]):
1024
1059
 
1025
1060
  async def _build_flow_info_async(self) -> TransformFlowInfo:
1026
1061
  flow_builder_state = _FlowBuilderState(self._flow_name)
1027
- sig = inspect.signature(self._flow_fn)
1028
- if len(sig.parameters) != len(self._flow_arg_types):
1029
- raise ValueError(
1030
- f"Number of parameters in the flow function ({len(sig.parameters)}) "
1031
- f"does not match the number of argument types ({len(self._flow_arg_types)})"
1032
- )
1033
-
1034
1062
  kwargs: dict[str, DataSlice[T]] = {}
1035
- for (param_name, param), param_type in zip(
1036
- sig.parameters.items(), self._flow_arg_types
1037
- ):
1038
- if param.kind not in (
1039
- inspect.Parameter.POSITIONAL_OR_KEYWORD,
1040
- inspect.Parameter.KEYWORD_ONLY,
1041
- ):
1042
- raise ValueError(
1043
- f"Parameter `{param_name}` is not a parameter can be passed by name"
1044
- )
1045
- encoded_type = encode_enriched_type(param_type)
1063
+ for arg_info in self._args_info:
1064
+ encoded_type = encode_enriched_type(arg_info.type_hint)
1046
1065
  if encoded_type is None:
1047
- raise ValueError(f"Parameter `{param_name}` has no type annotation")
1066
+ raise ValueError(f"Parameter `{arg_info.name}` has no type annotation")
1048
1067
  engine_ds = flow_builder_state.engine_flow_builder.add_direct_input(
1049
- param_name, encoded_type
1068
+ arg_info.name, encoded_type
1050
1069
  )
1051
- kwargs[param_name] = DataSlice(
1070
+ kwargs[arg_info.name] = DataSlice(
1052
1071
  _DataSliceState(flow_builder_state, engine_ds)
1053
1072
  )
1054
1073
 
@@ -1061,13 +1080,12 @@ class TransformFlow(Generic[T]):
1061
1080
  execution_context.event_loop
1062
1081
  )
1063
1082
  )
1064
- self._param_names = list(sig.parameters.keys())
1065
1083
 
1066
1084
  engine_return_type = (
1067
1085
  _data_slice_state(output).engine_data_slice.data_type().schema()
1068
1086
  )
1069
1087
  python_return_type: type[T] | None = _get_data_slice_annotation_type(
1070
- sig.return_annotation
1088
+ inspect.signature(self._flow_fn).return_annotation
1071
1089
  )
1072
1090
  result_decoder = make_engine_value_decoder(
1073
1091
  [], engine_return_type["type"], analyze_type_info(python_return_type)
@@ -1099,13 +1117,14 @@ class TransformFlow(Generic[T]):
1099
1117
  """
1100
1118
  flow_info = await self._flow_info_async()
1101
1119
  params = []
1102
- for i, arg in enumerate(self._param_names):
1120
+ for i, arg_info in enumerate(self._args_info):
1103
1121
  if i < len(args):
1104
- params.append(encode_engine_value(args[i]))
1122
+ arg = args[i]
1105
1123
  elif arg in kwargs:
1106
- params.append(encode_engine_value(kwargs[arg]))
1124
+ arg = kwargs[arg]
1107
1125
  else:
1108
1126
  raise ValueError(f"Parameter {arg} is not provided")
1127
+ params.append(arg_info.encoder(arg))
1109
1128
  engine_result = await flow_info.engine_flow.evaluate_async(params)
1110
1129
  return flow_info.result_decoder(engine_result)
1111
1130
 
@@ -1116,27 +1135,7 @@ def transform_flow() -> Callable[[Callable[..., DataSlice[T]]], TransformFlow[T]
1116
1135
  """
1117
1136
 
1118
1137
  def _transform_flow_wrapper(fn: Callable[..., DataSlice[T]]) -> TransformFlow[T]:
1119
- sig = inspect.signature(fn)
1120
- arg_types = []
1121
- for param_name, param in sig.parameters.items():
1122
- if param.kind not in (
1123
- inspect.Parameter.POSITIONAL_OR_KEYWORD,
1124
- inspect.Parameter.KEYWORD_ONLY,
1125
- ):
1126
- raise ValueError(
1127
- f"Parameter `{param_name}` is not a parameter can be passed by name"
1128
- )
1129
- value_type_annotation: type[T] | None = _get_data_slice_annotation_type(
1130
- param.annotation
1131
- )
1132
- if value_type_annotation is None:
1133
- raise ValueError(
1134
- f"Parameter `{param_name}` for {fn} has no value type annotation. "
1135
- "Please use `cocoindex.DataSlice[T]` where T is the type of the value."
1136
- )
1137
- arg_types.append(value_type_annotation)
1138
-
1139
- _transform_flow = TransformFlow(fn, arg_types)
1138
+ _transform_flow = TransformFlow(fn)
1140
1139
  functools.update_wrapper(_transform_flow, fn)
1141
1140
  return _transform_flow
1142
1141
 
cocoindex/functions.py CHANGED
@@ -89,7 +89,7 @@ class SentenceTransformerEmbedExecutor:
89
89
  spec: SentenceTransformerEmbed
90
90
  _model: Any | None = None
91
91
 
92
- def analyze(self, _text: Any) -> type:
92
+ def analyze(self) -> type:
93
93
  try:
94
94
  # Only import sentence_transformers locally when it's needed, as its import is very slow.
95
95
  import sentence_transformers # pylint: disable=import-outside-toplevel
@@ -245,7 +245,7 @@ class ColPaliEmbedImageExecutor:
245
245
  spec: ColPaliEmbedImage
246
246
  _model_info: ColPaliModelInfo
247
247
 
248
- def analyze(self, _img_bytes: Any) -> type:
248
+ def analyze(self) -> type:
249
249
  # Get shared model and dimension
250
250
  self._model_info = _get_colpali_model_and_processor(self.spec.model)
251
251
 
@@ -321,7 +321,7 @@ class ColPaliEmbedQueryExecutor:
321
321
  spec: ColPaliEmbedQuery
322
322
  _model_info: ColPaliModelInfo
323
323
 
324
- def analyze(self, _query: Any) -> type:
324
+ def analyze(self) -> type:
325
325
  # Get shared model and dimension
326
326
  self._model_info = _get_colpali_model_and_processor(self.spec.model)
327
327