cocoindex 0.1.81__cp312-cp312-manylinux_2_28_x86_64.whl → 0.1.82__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cocoindex/cli.py CHANGED
@@ -4,9 +4,7 @@ import datetime
4
4
  import importlib.util
5
5
  import os
6
6
  import signal
7
- import sys
8
7
  import threading
9
- import types
10
8
  from types import FrameType
11
9
  from typing import Any, Iterable
12
10
 
@@ -20,6 +18,8 @@ from rich.table import Table
20
18
  from . import flow, lib, setting
21
19
  from .setup import flow_names_with_setup
22
20
  from .runtime import execution_context
21
+ from .subprocess_exec import add_user_app
22
+ from .user_app_loader import load_user_app
23
23
 
24
24
  # Create ServerSettings lazily upon first call, as environment variables may be loaded from files, etc.
25
25
  COCOINDEX_HOST = "https://cocoindex.io"
@@ -76,50 +76,9 @@ def _get_app_ref_from_specifier(
76
76
  return app_ref
77
77
 
78
78
 
79
- def _load_user_app(app_target: str) -> types.ModuleType:
80
- """
81
- Loads the user's application, which can be a file path or an installed module name.
82
- Exits on failure.
83
- """
84
- if not app_target:
85
- raise click.ClickException("Application target not provided.")
86
-
87
- looks_like_path = os.sep in app_target or app_target.lower().endswith(".py")
88
-
89
- if looks_like_path:
90
- if not os.path.isfile(app_target):
91
- raise click.ClickException(f"Application file path not found: {app_target}")
92
- app_path = os.path.abspath(app_target)
93
- app_dir = os.path.dirname(app_path)
94
- module_name = os.path.splitext(os.path.basename(app_path))[0]
95
-
96
- if app_dir not in sys.path:
97
- sys.path.insert(0, app_dir)
98
- try:
99
- spec = importlib.util.spec_from_file_location(module_name, app_path)
100
- if spec is None:
101
- raise ImportError(f"Could not create spec for file: {app_path}")
102
- module = importlib.util.module_from_spec(spec)
103
- sys.modules[spec.name] = module
104
- if spec.loader is None:
105
- raise ImportError(f"Could not create loader for file: {app_path}")
106
- spec.loader.exec_module(module)
107
- return module
108
- except (ImportError, FileNotFoundError, PermissionError) as e:
109
- raise click.ClickException(f"Failed importing file '{app_path}': {e}")
110
- finally:
111
- if app_dir in sys.path and sys.path[0] == app_dir:
112
- sys.path.pop(0)
113
-
114
- # Try as module
115
- try:
116
- return importlib.import_module(app_target)
117
- except ImportError as e:
118
- raise click.ClickException(f"Failed to load module '{app_target}': {e}")
119
- except Exception as e:
120
- raise click.ClickException(
121
- f"Unexpected error importing module '{app_target}': {e}"
122
- )
79
+ def _load_user_app(app_target: str) -> None:
80
+ load_user_app(app_target)
81
+ add_user_app(app_target)
123
82
 
124
83
 
125
84
  def _initialize_cocoindex_in_process() -> None:
cocoindex/convert.py CHANGED
@@ -9,26 +9,26 @@ import datetime
9
9
  import inspect
10
10
  import warnings
11
11
  from enum import Enum
12
- from typing import Any, Callable, Mapping, get_origin
12
+ from typing import Any, Callable, Mapping, Type, get_origin
13
13
 
14
14
  import numpy as np
15
15
 
16
16
  from .typing import (
17
17
  KEY_FIELD_NAME,
18
18
  TABLE_TYPES,
19
- analyze_type_info,
20
- encode_enriched_type,
21
- is_namedtuple_type,
22
- is_struct_type,
23
- AnalyzedTypeInfo,
24
19
  AnalyzedAnyType,
20
+ AnalyzedBasicType,
25
21
  AnalyzedDictType,
26
22
  AnalyzedListType,
27
- AnalyzedBasicType,
23
+ AnalyzedStructType,
24
+ AnalyzedTypeInfo,
28
25
  AnalyzedUnionType,
29
26
  AnalyzedUnknownType,
30
- AnalyzedStructType,
27
+ analyze_type_info,
28
+ encode_enriched_type,
29
+ is_namedtuple_type,
31
30
  is_numpy_number_type,
31
+ is_struct_type,
32
32
  )
33
33
 
34
34
 
@@ -50,34 +50,6 @@ class ChildFieldPath:
50
50
  self._field_path.pop()
51
51
 
52
52
 
53
- def encode_engine_value(value: Any) -> Any:
54
- """Encode a Python value to an engine value."""
55
- if dataclasses.is_dataclass(value):
56
- return [
57
- encode_engine_value(getattr(value, f.name))
58
- for f in dataclasses.fields(value)
59
- ]
60
- if is_namedtuple_type(type(value)):
61
- return [encode_engine_value(getattr(value, name)) for name in value._fields]
62
- if isinstance(value, np.number):
63
- return value.item()
64
- if isinstance(value, np.ndarray):
65
- return value
66
- if isinstance(value, (list, tuple)):
67
- return [encode_engine_value(v) for v in value]
68
- if isinstance(value, dict):
69
- if not value:
70
- return {}
71
-
72
- first_val = next(iter(value.values()))
73
- if is_struct_type(type(first_val)): # KTable
74
- return [
75
- [encode_engine_value(k)] + encode_engine_value(v)
76
- for k, v in value.items()
77
- ]
78
- return value
79
-
80
-
81
53
  _CONVERTIBLE_KINDS = {
82
54
  ("Float32", "Float64"),
83
55
  ("LocalDateTime", "OffsetDateTime"),
@@ -91,6 +63,118 @@ def _is_type_kind_convertible_to(src_type_kind: str, dst_type_kind: str) -> bool
91
63
  )
92
64
 
93
65
 
66
+ # Pre-computed type info for missing/Any type annotations
67
+ ANY_TYPE_INFO = analyze_type_info(inspect.Parameter.empty)
68
+
69
+
70
+ def make_engine_value_encoder(type_info: AnalyzedTypeInfo) -> Callable[[Any], Any]:
71
+ """
72
+ Create an encoder closure for a specific type.
73
+ """
74
+ variant = type_info.variant
75
+
76
+ if isinstance(variant, AnalyzedUnknownType):
77
+ raise ValueError(f"Type annotation `{type_info.core_type}` is unsupported")
78
+
79
+ if isinstance(variant, AnalyzedListType):
80
+ elem_type_info = (
81
+ analyze_type_info(variant.elem_type) if variant.elem_type else ANY_TYPE_INFO
82
+ )
83
+ if isinstance(elem_type_info.variant, AnalyzedStructType):
84
+ elem_encoder = make_engine_value_encoder(elem_type_info)
85
+
86
+ def encode_struct_list(value: Any) -> Any:
87
+ return None if value is None else [elem_encoder(v) for v in value]
88
+
89
+ return encode_struct_list
90
+
91
+ if isinstance(variant, AnalyzedDictType):
92
+ if not variant.value_type:
93
+ return lambda value: value
94
+
95
+ value_type_info = analyze_type_info(variant.value_type)
96
+ if isinstance(value_type_info.variant, AnalyzedStructType):
97
+
98
+ def encode_struct_dict(value: Any) -> Any:
99
+ if not isinstance(value, dict):
100
+ return value
101
+ if not value:
102
+ return []
103
+
104
+ sample_key, sample_val = next(iter(value.items()))
105
+ key_type, val_type = type(sample_key), type(sample_val)
106
+
107
+ # Handle KTable case
108
+ if value and is_struct_type(val_type):
109
+ key_encoder = (
110
+ make_engine_value_encoder(analyze_type_info(key_type))
111
+ if is_struct_type(key_type)
112
+ else make_engine_value_encoder(ANY_TYPE_INFO)
113
+ )
114
+ value_encoder = make_engine_value_encoder(
115
+ analyze_type_info(val_type)
116
+ )
117
+ return [
118
+ [key_encoder(k)] + value_encoder(v) for k, v in value.items()
119
+ ]
120
+ return {key_encoder(k): value_encoder(v) for k, v in value.items()}
121
+
122
+ return encode_struct_dict
123
+
124
+ if isinstance(variant, AnalyzedStructType):
125
+ struct_type = variant.struct_type
126
+
127
+ if dataclasses.is_dataclass(struct_type):
128
+ fields = dataclasses.fields(struct_type)
129
+ field_encoders = [
130
+ make_engine_value_encoder(analyze_type_info(f.type)) for f in fields
131
+ ]
132
+ field_names = [f.name for f in fields]
133
+
134
+ def encode_dataclass(value: Any) -> Any:
135
+ if not dataclasses.is_dataclass(value):
136
+ return value
137
+ return [
138
+ encoder(getattr(value, name))
139
+ for encoder, name in zip(field_encoders, field_names)
140
+ ]
141
+
142
+ return encode_dataclass
143
+
144
+ elif is_namedtuple_type(struct_type):
145
+ annotations = struct_type.__annotations__
146
+ field_names = list(getattr(struct_type, "_fields", ()))
147
+ field_encoders = [
148
+ make_engine_value_encoder(
149
+ analyze_type_info(annotations[name])
150
+ if name in annotations
151
+ else ANY_TYPE_INFO
152
+ )
153
+ for name in field_names
154
+ ]
155
+
156
+ def encode_namedtuple(value: Any) -> Any:
157
+ if not is_namedtuple_type(type(value)):
158
+ return value
159
+ return [
160
+ encoder(getattr(value, name))
161
+ for encoder, name in zip(field_encoders, field_names)
162
+ ]
163
+
164
+ return encode_namedtuple
165
+
166
+ def encode_basic_value(value: Any) -> Any:
167
+ if isinstance(value, np.number):
168
+ return value.item()
169
+ if isinstance(value, np.ndarray):
170
+ return value
171
+ if isinstance(value, (list, tuple)):
172
+ return [encode_basic_value(v) for v in value]
173
+ return value
174
+
175
+ return encode_basic_value
176
+
177
+
94
178
  def make_engine_value_decoder(
95
179
  field_path: list[str],
96
180
  src_type: dict[str, Any],
cocoindex/flow.py CHANGED
@@ -9,15 +9,6 @@ import datetime
9
9
  import functools
10
10
  import inspect
11
11
  import re
12
-
13
- from .validation import (
14
- validate_flow_name,
15
- NamingError,
16
- validate_full_flow_name,
17
- validate_target_name,
18
- )
19
- from .typing import analyze_type_info
20
-
21
12
  from dataclasses import dataclass
22
13
  from enum import Enum
23
14
  from threading import Lock
@@ -25,13 +16,13 @@ from typing import (
25
16
  Any,
26
17
  Callable,
27
18
  Generic,
19
+ Iterable,
28
20
  NamedTuple,
29
21
  Sequence,
30
22
  TypeVar,
31
23
  cast,
32
24
  get_args,
33
25
  get_origin,
34
- Iterable,
35
26
  )
36
27
 
37
28
  from rich.text import Text
@@ -41,11 +32,20 @@ from . import _engine # type: ignore
41
32
  from . import index
42
33
  from . import op
43
34
  from . import setting
44
- from .convert import dump_engine_object, encode_engine_value, make_engine_value_decoder
35
+ from .convert import (
36
+ dump_engine_object,
37
+ make_engine_value_decoder,
38
+ make_engine_value_encoder,
39
+ )
45
40
  from .op import FunctionSpec
46
41
  from .runtime import execution_context
47
42
  from .setup import SetupChangeBundle
48
- from .typing import encode_enriched_type
43
+ from .typing import analyze_type_info, encode_enriched_type
44
+ from .validation import (
45
+ validate_flow_name,
46
+ validate_full_flow_name,
47
+ validate_target_name,
48
+ )
49
49
 
50
50
 
51
51
  class _NameBuilder:
@@ -978,6 +978,12 @@ class TransformFlowInfo(NamedTuple):
978
978
  result_decoder: Callable[[Any], T]
979
979
 
980
980
 
981
+ class FlowArgInfo(NamedTuple):
982
+ name: str
983
+ type_hint: Any
984
+ encoder: Callable[[Any], Any]
985
+
986
+
981
987
  class TransformFlow(Generic[T]):
982
988
  """
983
989
  A transient transformation flow that transforms in-memory data.
@@ -985,8 +991,7 @@ class TransformFlow(Generic[T]):
985
991
 
986
992
  _flow_fn: Callable[..., DataSlice[T]]
987
993
  _flow_name: str
988
- _flow_arg_types: list[Any]
989
- _param_names: list[str]
994
+ _args_info: list[FlowArgInfo]
990
995
 
991
996
  _lazy_lock: asyncio.Lock
992
997
  _lazy_flow_info: TransformFlowInfo | None = None
@@ -994,7 +999,6 @@ class TransformFlow(Generic[T]):
994
999
  def __init__(
995
1000
  self,
996
1001
  flow_fn: Callable[..., DataSlice[T]],
997
- flow_arg_types: Sequence[Any],
998
1002
  /,
999
1003
  name: str | None = None,
1000
1004
  ):
@@ -1002,9 +1006,32 @@ class TransformFlow(Generic[T]):
1002
1006
  self._flow_name = _transform_flow_name_builder.build_name(
1003
1007
  name, prefix="_transform_flow_"
1004
1008
  )
1005
- self._flow_arg_types = list(flow_arg_types)
1006
1009
  self._lazy_lock = asyncio.Lock()
1007
1010
 
1011
+ sig = inspect.signature(flow_fn)
1012
+ args_info = []
1013
+ for param_name, param in sig.parameters.items():
1014
+ if param.kind not in (
1015
+ inspect.Parameter.POSITIONAL_OR_KEYWORD,
1016
+ inspect.Parameter.KEYWORD_ONLY,
1017
+ ):
1018
+ raise ValueError(
1019
+ f"Parameter `{param_name}` is not a parameter can be passed by name"
1020
+ )
1021
+ value_type_annotation: type | None = _get_data_slice_annotation_type(
1022
+ param.annotation
1023
+ )
1024
+ if value_type_annotation is None:
1025
+ raise ValueError(
1026
+ f"Parameter `{param_name}` for {flow_fn} has no value type annotation. "
1027
+ "Please use `cocoindex.DataSlice[T]` where T is the type of the value."
1028
+ )
1029
+ encoder = make_engine_value_encoder(
1030
+ analyze_type_info(value_type_annotation)
1031
+ )
1032
+ args_info.append(FlowArgInfo(param_name, value_type_annotation, encoder))
1033
+ self._args_info = args_info
1034
+
1008
1035
  def __call__(self, *args: Any, **kwargs: Any) -> DataSlice[T]:
1009
1036
  return self._flow_fn(*args, **kwargs)
1010
1037
 
@@ -1024,31 +1051,15 @@ class TransformFlow(Generic[T]):
1024
1051
 
1025
1052
  async def _build_flow_info_async(self) -> TransformFlowInfo:
1026
1053
  flow_builder_state = _FlowBuilderState(self._flow_name)
1027
- sig = inspect.signature(self._flow_fn)
1028
- if len(sig.parameters) != len(self._flow_arg_types):
1029
- raise ValueError(
1030
- f"Number of parameters in the flow function ({len(sig.parameters)}) "
1031
- f"does not match the number of argument types ({len(self._flow_arg_types)})"
1032
- )
1033
-
1034
1054
  kwargs: dict[str, DataSlice[T]] = {}
1035
- for (param_name, param), param_type in zip(
1036
- sig.parameters.items(), self._flow_arg_types
1037
- ):
1038
- if param.kind not in (
1039
- inspect.Parameter.POSITIONAL_OR_KEYWORD,
1040
- inspect.Parameter.KEYWORD_ONLY,
1041
- ):
1042
- raise ValueError(
1043
- f"Parameter `{param_name}` is not a parameter can be passed by name"
1044
- )
1045
- encoded_type = encode_enriched_type(param_type)
1055
+ for arg_info in self._args_info:
1056
+ encoded_type = encode_enriched_type(arg_info.type_hint)
1046
1057
  if encoded_type is None:
1047
- raise ValueError(f"Parameter `{param_name}` has no type annotation")
1058
+ raise ValueError(f"Parameter `{arg_info.name}` has no type annotation")
1048
1059
  engine_ds = flow_builder_state.engine_flow_builder.add_direct_input(
1049
- param_name, encoded_type
1060
+ arg_info.name, encoded_type
1050
1061
  )
1051
- kwargs[param_name] = DataSlice(
1062
+ kwargs[arg_info.name] = DataSlice(
1052
1063
  _DataSliceState(flow_builder_state, engine_ds)
1053
1064
  )
1054
1065
 
@@ -1061,13 +1072,12 @@ class TransformFlow(Generic[T]):
1061
1072
  execution_context.event_loop
1062
1073
  )
1063
1074
  )
1064
- self._param_names = list(sig.parameters.keys())
1065
1075
 
1066
1076
  engine_return_type = (
1067
1077
  _data_slice_state(output).engine_data_slice.data_type().schema()
1068
1078
  )
1069
1079
  python_return_type: type[T] | None = _get_data_slice_annotation_type(
1070
- sig.return_annotation
1080
+ inspect.signature(self._flow_fn).return_annotation
1071
1081
  )
1072
1082
  result_decoder = make_engine_value_decoder(
1073
1083
  [], engine_return_type["type"], analyze_type_info(python_return_type)
@@ -1099,13 +1109,14 @@ class TransformFlow(Generic[T]):
1099
1109
  """
1100
1110
  flow_info = await self._flow_info_async()
1101
1111
  params = []
1102
- for i, arg in enumerate(self._param_names):
1112
+ for i, arg_info in enumerate(self._args_info):
1103
1113
  if i < len(args):
1104
- params.append(encode_engine_value(args[i]))
1114
+ arg = args[i]
1105
1115
  elif arg in kwargs:
1106
- params.append(encode_engine_value(kwargs[arg]))
1116
+ arg = kwargs[arg]
1107
1117
  else:
1108
1118
  raise ValueError(f"Parameter {arg} is not provided")
1119
+ params.append(arg_info.encoder(arg))
1109
1120
  engine_result = await flow_info.engine_flow.evaluate_async(params)
1110
1121
  return flow_info.result_decoder(engine_result)
1111
1122
 
@@ -1116,27 +1127,7 @@ def transform_flow() -> Callable[[Callable[..., DataSlice[T]]], TransformFlow[T]
1116
1127
  """
1117
1128
 
1118
1129
  def _transform_flow_wrapper(fn: Callable[..., DataSlice[T]]) -> TransformFlow[T]:
1119
- sig = inspect.signature(fn)
1120
- arg_types = []
1121
- for param_name, param in sig.parameters.items():
1122
- if param.kind not in (
1123
- inspect.Parameter.POSITIONAL_OR_KEYWORD,
1124
- inspect.Parameter.KEYWORD_ONLY,
1125
- ):
1126
- raise ValueError(
1127
- f"Parameter `{param_name}` is not a parameter can be passed by name"
1128
- )
1129
- value_type_annotation: type[T] | None = _get_data_slice_annotation_type(
1130
- param.annotation
1131
- )
1132
- if value_type_annotation is None:
1133
- raise ValueError(
1134
- f"Parameter `{param_name}` for {fn} has no value type annotation. "
1135
- "Please use `cocoindex.DataSlice[T]` where T is the type of the value."
1136
- )
1137
- arg_types.append(value_type_annotation)
1138
-
1139
- _transform_flow = TransformFlow(fn, arg_types)
1130
+ _transform_flow = TransformFlow(fn)
1140
1131
  functools.update_wrapper(_transform_flow, fn)
1141
1132
  return _transform_flow
1142
1133
 
cocoindex/functions.py CHANGED
@@ -89,7 +89,7 @@ class SentenceTransformerEmbedExecutor:
89
89
  spec: SentenceTransformerEmbed
90
90
  _model: Any | None = None
91
91
 
92
- def analyze(self, _text: Any) -> type:
92
+ def analyze(self) -> type:
93
93
  try:
94
94
  # Only import sentence_transformers locally when it's needed, as its import is very slow.
95
95
  import sentence_transformers # pylint: disable=import-outside-toplevel
@@ -245,7 +245,7 @@ class ColPaliEmbedImageExecutor:
245
245
  spec: ColPaliEmbedImage
246
246
  _model_info: ColPaliModelInfo
247
247
 
248
- def analyze(self, _img_bytes: Any) -> type:
248
+ def analyze(self) -> type:
249
249
  # Get shared model and dimension
250
250
  self._model_info = _get_colpali_model_and_processor(self.spec.model)
251
251
 
@@ -321,7 +321,7 @@ class ColPaliEmbedQueryExecutor:
321
321
  spec: ColPaliEmbedQuery
322
322
  _model_info: ColPaliModelInfo
323
323
 
324
- def analyze(self, _query: Any) -> type:
324
+ def analyze(self) -> type:
325
325
  # Get shared model and dimension
326
326
  self._model_info = _get_colpali_model_and_processor(self.spec.model)
327
327