cocoindex 0.3.4__cp311-abi3-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +114 -0
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +44 -0
- cocoindex/cli.py +830 -0
- cocoindex/engine_object.py +214 -0
- cocoindex/engine_value.py +550 -0
- cocoindex/flow.py +1281 -0
- cocoindex/functions/__init__.py +40 -0
- cocoindex/functions/_engine_builtin_specs.py +66 -0
- cocoindex/functions/colpali.py +247 -0
- cocoindex/functions/sbert.py +77 -0
- cocoindex/index.py +50 -0
- cocoindex/lib.py +75 -0
- cocoindex/llm.py +47 -0
- cocoindex/op.py +1047 -0
- cocoindex/py.typed +0 -0
- cocoindex/query_handler.py +57 -0
- cocoindex/runtime.py +78 -0
- cocoindex/setting.py +171 -0
- cocoindex/setup.py +92 -0
- cocoindex/sources/__init__.py +5 -0
- cocoindex/sources/_engine_builtin_specs.py +120 -0
- cocoindex/subprocess_exec.py +277 -0
- cocoindex/targets/__init__.py +5 -0
- cocoindex/targets/_engine_builtin_specs.py +153 -0
- cocoindex/targets/lancedb.py +466 -0
- cocoindex/tests/__init__.py +0 -0
- cocoindex/tests/test_engine_object.py +331 -0
- cocoindex/tests/test_engine_value.py +1724 -0
- cocoindex/tests/test_optional_database.py +249 -0
- cocoindex/tests/test_transform_flow.py +300 -0
- cocoindex/tests/test_typing.py +553 -0
- cocoindex/tests/test_validation.py +134 -0
- cocoindex/typing.py +834 -0
- cocoindex/user_app_loader.py +53 -0
- cocoindex/utils.py +20 -0
- cocoindex/validation.py +104 -0
- cocoindex-0.3.4.dist-info/METADATA +288 -0
- cocoindex-0.3.4.dist-info/RECORD +42 -0
- cocoindex-0.3.4.dist-info/WHEEL +4 -0
- cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
- cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
cocoindex/typing.py
ADDED
|
@@ -0,0 +1,834 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import dataclasses
|
|
3
|
+
import datetime
|
|
4
|
+
import inspect
|
|
5
|
+
import types
|
|
6
|
+
import typing
|
|
7
|
+
import uuid
|
|
8
|
+
from typing import (
|
|
9
|
+
TYPE_CHECKING,
|
|
10
|
+
Annotated,
|
|
11
|
+
Any,
|
|
12
|
+
Generic,
|
|
13
|
+
Iterator,
|
|
14
|
+
Literal,
|
|
15
|
+
NamedTuple,
|
|
16
|
+
Protocol,
|
|
17
|
+
TypeVar,
|
|
18
|
+
overload,
|
|
19
|
+
Self,
|
|
20
|
+
get_type_hints,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
from numpy.typing import NDArray
|
|
25
|
+
|
|
26
|
+
# Optional Pydantic support
|
|
27
|
+
try:
|
|
28
|
+
import pydantic
|
|
29
|
+
|
|
30
|
+
PYDANTIC_AVAILABLE = True
|
|
31
|
+
except ImportError:
|
|
32
|
+
pydantic = None # type: ignore[assignment]
|
|
33
|
+
PYDANTIC_AVAILABLE = False
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
if PYDANTIC_AVAILABLE:
|
|
37
|
+
from pydantic import BaseModel
|
|
38
|
+
else:
|
|
39
|
+
BaseModel = object # type: ignore[misc,assignment]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class VectorInfo(NamedTuple):
    """
    Annotation metadata describing a vector type.
    """

    # Fixed number of elements, or None when no dimension was specified.
    dim: int | None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class TypeKind(NamedTuple):
    """
    Annotation metadata that pins a Python type to an explicit engine type kind.
    """

    # Name of the engine kind, e.g. "Int64" or "Json".
    kind: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class TypeAttr:
    """A free-form key/value attribute carried as ``Annotated`` metadata."""

    key: str
    value: Any

    def __init__(self, key: str, value: Any):
        self.key, self.value = key, value
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Metadata objects this module recognizes inside ``Annotated[...]``.
Annotation = TypeKind | TypeAttr | VectorInfo

# Aliases that attach an explicit engine kind to a plain Python type.
Int64 = Annotated[int, TypeKind("Int64")]
Float32 = Annotated[float, TypeKind("Float32")]
Float64 = Annotated[float, TypeKind("Float64")]
Range = Annotated[tuple[int, int], TypeKind("Range")]
Json = Annotated[Any, TypeKind("Json")]
# Both aliases wrap datetime.datetime; only the attached kind differs.
LocalDateTime = Annotated[datetime.datetime, TypeKind("LocalDateTime")]
OffsetDateTime = Annotated[datetime.datetime, TypeKind("OffsetDateTime")]
|
|
68
|
+
|
|
69
|
+
if TYPE_CHECKING:
    T_co = TypeVar("T_co", covariant=True)
    Dim_co = TypeVar("Dim_co", bound=int | None, covariant=True, default=None)

    class Vector(Protocol, Generic[T_co, Dim_co]):
        """Vector[T, Dim] is a special typing alias for an NDArray[T] with optional dimension info"""

        def __getitem__(self, index: int) -> T_co: ...
        def __len__(self) -> int: ...

else:

    class Vector:  # type: ignore[unreachable]
        """A special typing alias for an NDArray[T] with optional dimension info"""

        def __class_getitem__(cls, params):
            """Expand ``Vector[T]`` / ``Vector[T, Literal[N]]`` into an ``Annotated`` type.

            The result is ``Annotated[NDArray[T], VectorInfo]`` when ``T`` is a
            NumPy integer/floating type or ``np.ndarray`` itself, otherwise
            ``Annotated[list[T], VectorInfo]``.
            """
            if isinstance(params, tuple):
                # Element type and dimension, e.g. Vector[np.float32, Literal[3]]
                elem_dtype, dim_spec = params
                # Only a Literal[...] carries a usable dimension; anything else means "unknown".
                if typing.get_origin(dim_spec) is Literal:
                    dim = typing.get_args(dim_spec)[0]
                else:
                    dim = None
            else:
                # Element type only, e.g. Vector[np.float32]
                elem_dtype, dim = params, None
            info = VectorInfo(dim=dim)

            elem_base = analyze_type_info(elem_dtype).base_type
            if is_numpy_number_type(elem_base) or elem_base is np.ndarray:
                return Annotated[NDArray[elem_dtype], info]
            return Annotated[list[elem_dtype], info]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# Engine kind names that denote table types (as opposed to basic or struct types).
TABLE_TYPES: tuple[str, str] = ("KTable", "LTable")
# Field name given to the key column when a KTable key is a basic (non-struct) type.
KEY_FIELD_NAME: str = "_key"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def extract_ndarray_elem_dtype(ndarray_type: Any) -> Any:
    """Return the element dtype of a parameterized ``NDArray[...]`` annotation.

    Args:
        ndarray_type: A parameterized ndarray type whose second type argument
            is a parameterized dtype, e.g. ``NDArray[np.float32]``.

    Returns:
        The first type argument of the dtype specification.

    Raises:
        ValueError: If ``ndarray_type`` does not have exactly two type
            arguments (e.g. a bare ``np.ndarray``), or if its dtype
            specification carries no type argument.
    """
    args = typing.get_args(ndarray_type)
    # A bare/unparameterized ndarray has no args; fail with a readable message
    # instead of an opaque tuple-unpacking error.
    if len(args) != 2:
        raise ValueError(
            f"Invalid NDArray type: {ndarray_type}. "
            "Expected a parameterized type such as NDArray[np.float32]."
        )
    _, dtype_spec = args
    dtype_args = typing.get_args(dtype_spec)
    if not dtype_args:
        raise ValueError(f"Invalid dtype specification: {dtype_spec}")
    return dtype_args[0]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def is_numpy_number_type(t: type) -> bool:
    """Return True when ``t`` is a class derived from ``np.integer`` or ``np.floating``."""
    if not isinstance(t, type):
        return False
    return issubclass(t, (np.integer, np.floating))
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def is_namedtuple_type(t: type) -> bool:
    """Return True when ``t`` is a tuple subclass exposing a ``_fields`` attribute."""
    if not (isinstance(t, type) and issubclass(t, tuple)):
        return False
    return hasattr(t, "_fields")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def is_pydantic_model(t: Any) -> bool:
    """Return True when Pydantic is importable and ``t`` is a ``pydantic.BaseModel`` subclass."""
    if PYDANTIC_AVAILABLE and isinstance(t, type):
        try:
            return issubclass(t, pydantic.BaseModel)
        except TypeError:
            # issubclass can reject some class-like objects; treat those as "not a model".
            return False
    return False
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def is_struct_type(t: Any) -> bool:
    """Return True when ``t`` is a class that is a dataclass, a NamedTuple or a Pydantic model."""
    if not isinstance(t, type):
        return False
    return dataclasses.is_dataclass(t) or is_namedtuple_type(t) or is_pydantic_model(t)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class DtypeRegistry:
    """
    Registry for NumPy dtypes used in CocoIndex.
    Maps NumPy dtypes to their CocoIndex type kind.
    """

    _DTYPE_TO_KIND: dict[Any, str] = {
        np.float32: "Float32",
        np.float64: "Float64",
        np.int64: "Int64",
    }

    @classmethod
    def validate_dtype_and_get_kind(cls, dtype: Any) -> str:
        """
        Validate that the given dtype is supported, and get its CocoIndex kind by dtype.

        Args:
            dtype: A NumPy scalar type such as ``np.float32``.

        Returns:
            The CocoIndex kind name registered for ``dtype``.

        Raises:
            TypeError: If ``dtype`` is ``typing.Any``.
            ValueError: If ``dtype`` is not one of the registered dtypes.
        """
        if dtype is Any:
            raise TypeError(
                "NDArray for Vector must use a concrete numpy dtype, got `Any`."
            )
        kind = cls._DTYPE_TO_KIND.get(dtype)
        if kind is None:
            # List readable dtype names rather than the raw dict_keys(...) repr.
            supported = ", ".join(
                f"np.{supported_dtype.__name__}" for supported_dtype in cls._DTYPE_TO_KIND
            )
            raise ValueError(
                f"Unsupported NumPy dtype in NDArray: {dtype}. "
                f"Supported dtypes: {supported}"
            )
        return kind
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class AnalyzedAnyType(NamedTuple):
    """
    Variant used when the type annotation is missing or is `Any`.
    """
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class AnalyzedBasicType(NamedTuple):
    """
    Variant for types that map to a single engine kind, either inferred from
    the Python type or given explicitly through a `TypeKind` annotation.
    """

    # Engine kind name, e.g. "Str", "Int64", "Json".
    kind: str
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class AnalyzedListType(NamedTuple):
    """
    Any list type, e.g. list[T], Sequence[T], NDArray[T], etc.
    """

    # Element type annotation (not yet analyzed); Any when the list is unparameterized.
    elem_type: Any
    # Vector metadata taken from the annotation, or None when absent.
    vector_info: VectorInfo | None
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class AnalyzedStructFieldInfo(NamedTuple):
    """
    Info about a field in a struct type.
    """

    # Field name as declared on the struct.
    name: str
    # Resolved type hint of the field (Any when no hint is available).
    type_hint: Any
    # Declared default; inspect.Parameter.empty marks "no default".
    default_value: Any
    # Human-readable description, when the struct flavor provides one.
    description: str | None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class AnalyzedStructType(NamedTuple):
    """
    Any struct type, e.g. dataclass, NamedTuple, etc.
    """

    # The dataclass, NamedTuple or Pydantic model class being described.
    struct_type: type

    @property
    def fields(self) -> Iterator[AnalyzedStructFieldInfo]:
        """Lazily yield one ``AnalyzedStructFieldInfo`` per field of ``struct_type``.

        Raises:
            ValueError: If ``struct_type`` is not a dataclass, NamedTuple or
                Pydantic model (raised when the iterator is first advanced).
        """
        struct_cls = self.struct_type
        hints = get_type_hints(struct_cls, include_extras=True)

        if dataclasses.is_dataclass(struct_cls):
            # Dataclass fields are read from the generated constructor signature.
            for name, param in inspect.signature(struct_cls).parameters.items():
                yield AnalyzedStructFieldInfo(
                    name=name,
                    type_hint=hints.get(name, Any),
                    default_value=param.default,
                    description=None,
                )
            return

        if is_namedtuple_type(struct_cls):
            field_names = getattr(struct_cls, "_fields", ())
            field_defaults = getattr(struct_cls, "_field_defaults", {})
            for name in field_names:
                yield AnalyzedStructFieldInfo(
                    name=name,
                    type_hint=hints.get(name, Any),
                    default_value=field_defaults.get(name, inspect.Parameter.empty),
                    description=None,
                )
            return

        if is_pydantic_model(struct_cls):
            for name, field_info in getattr(struct_cls, "model_fields", {}).items():
                # NOTE(review): only an Ellipsis default is mapped to "no default";
                # confirm this matches how the installed pydantic marks required fields.
                if field_info.default is not ...:
                    default = field_info.default
                else:
                    default = inspect.Parameter.empty
                yield AnalyzedStructFieldInfo(
                    name=name,
                    type_hint=hints.get(name, Any),
                    default_value=default,
                    description=field_info.description,
                )
            return

        raise ValueError(f"Unsupported struct type: {self.struct_type}")
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class AnalyzedUnionType(NamedTuple):
    """
    Any union type, e.g. T1 | T2 | ..., etc.
    """

    # Member type annotations of the union (not yet analyzed).
    variant_types: list[Any]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class AnalyzedDictType(NamedTuple):
    """
    Any dict type, e.g. dict[T1, T2], Mapping[T1, T2], etc.
    """

    # Key type annotation; Any when the mapping is unparameterized.
    key_type: Any
    # Value type annotation; Any when the mapping is unparameterized.
    value_type: Any
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class AnalyzedUnknownType(NamedTuple):
    """
    Variant for any type that is not supported by CocoIndex.
    """
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# Union of every variant that `AnalyzedTypeInfo.variant` can hold.
AnalyzedTypeVariant = (
    AnalyzedAnyType
    | AnalyzedBasicType
    | AnalyzedListType
    | AnalyzedStructType
    | AnalyzedUnionType
    | AnalyzedDictType
    | AnalyzedUnknownType
)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@dataclasses.dataclass
class AnalyzedTypeInfo:
    """
    Analyzed info of a Python type.
    """

    # The type without annotations. e.g. int, list[int], dict[str, int]
    core_type: Any
    # The type without annotations and parameters. e.g. int, list, dict
    base_type: Any
    # Classification of the type (basic, list, struct, union, dict, any, unknown).
    variant: AnalyzedTypeVariant
    # Key/value pairs gathered from TypeAttr annotations; None when there were none.
    attrs: dict[str, Any] | None
    # True when the annotation was a union that included None.
    nullable: bool = False
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def analyze_type_info(t: Any) -> AnalyzedTypeInfo:
|
|
302
|
+
"""
|
|
303
|
+
Analyze a Python type annotation and extract CocoIndex-specific type information.
|
|
304
|
+
"""
|
|
305
|
+
|
|
306
|
+
annotations: tuple[Annotation, ...] = ()
|
|
307
|
+
base_type = None
|
|
308
|
+
type_args: tuple[Any, ...] = ()
|
|
309
|
+
nullable = False
|
|
310
|
+
while True:
|
|
311
|
+
base_type = typing.get_origin(t)
|
|
312
|
+
if base_type is Annotated:
|
|
313
|
+
annotations = t.__metadata__
|
|
314
|
+
t = t.__origin__
|
|
315
|
+
else:
|
|
316
|
+
if base_type is None:
|
|
317
|
+
base_type = t
|
|
318
|
+
else:
|
|
319
|
+
type_args = typing.get_args(t)
|
|
320
|
+
break
|
|
321
|
+
core_type = t
|
|
322
|
+
|
|
323
|
+
attrs: dict[str, Any] | None = None
|
|
324
|
+
vector_info: VectorInfo | None = None
|
|
325
|
+
kind: str | None = None
|
|
326
|
+
for attr in annotations:
|
|
327
|
+
if isinstance(attr, TypeAttr):
|
|
328
|
+
if attrs is None:
|
|
329
|
+
attrs = dict()
|
|
330
|
+
attrs[attr.key] = attr.value
|
|
331
|
+
elif isinstance(attr, VectorInfo):
|
|
332
|
+
vector_info = attr
|
|
333
|
+
elif isinstance(attr, TypeKind):
|
|
334
|
+
kind = attr.kind
|
|
335
|
+
|
|
336
|
+
variant: AnalyzedTypeVariant | None = None
|
|
337
|
+
|
|
338
|
+
if kind is not None:
|
|
339
|
+
variant = AnalyzedBasicType(kind=kind)
|
|
340
|
+
elif base_type is Any or base_type is inspect.Parameter.empty:
|
|
341
|
+
variant = AnalyzedAnyType()
|
|
342
|
+
elif is_struct_type(base_type):
|
|
343
|
+
variant = AnalyzedStructType(struct_type=t)
|
|
344
|
+
elif is_numpy_number_type(t):
|
|
345
|
+
kind = DtypeRegistry.validate_dtype_and_get_kind(t)
|
|
346
|
+
variant = AnalyzedBasicType(kind=kind)
|
|
347
|
+
elif base_type is collections.abc.Sequence or base_type is list:
|
|
348
|
+
elem_type = type_args[0] if len(type_args) > 0 else Any
|
|
349
|
+
variant = AnalyzedListType(elem_type=elem_type, vector_info=vector_info)
|
|
350
|
+
elif base_type is np.ndarray:
|
|
351
|
+
np_number_type = t
|
|
352
|
+
elem_type = extract_ndarray_elem_dtype(np_number_type)
|
|
353
|
+
variant = AnalyzedListType(elem_type=elem_type, vector_info=vector_info)
|
|
354
|
+
elif base_type is collections.abc.Mapping or base_type is dict or t is dict:
|
|
355
|
+
key_type = type_args[0] if len(type_args) > 0 else Any
|
|
356
|
+
elem_type = type_args[1] if len(type_args) > 1 else Any
|
|
357
|
+
variant = AnalyzedDictType(key_type=key_type, value_type=elem_type)
|
|
358
|
+
elif base_type in (types.UnionType, typing.Union):
|
|
359
|
+
non_none_types = [arg for arg in type_args if arg not in (None, types.NoneType)]
|
|
360
|
+
if len(non_none_types) == 0:
|
|
361
|
+
return analyze_type_info(None)
|
|
362
|
+
|
|
363
|
+
nullable = len(non_none_types) < len(type_args)
|
|
364
|
+
if len(non_none_types) == 1:
|
|
365
|
+
result = analyze_type_info(non_none_types[0])
|
|
366
|
+
result.nullable = nullable
|
|
367
|
+
return result
|
|
368
|
+
|
|
369
|
+
variant = AnalyzedUnionType(variant_types=non_none_types)
|
|
370
|
+
else:
|
|
371
|
+
if t is bytes:
|
|
372
|
+
kind = "Bytes"
|
|
373
|
+
elif t is str:
|
|
374
|
+
kind = "Str"
|
|
375
|
+
elif t is bool:
|
|
376
|
+
kind = "Bool"
|
|
377
|
+
elif t is int:
|
|
378
|
+
kind = "Int64"
|
|
379
|
+
elif t is float:
|
|
380
|
+
kind = "Float64"
|
|
381
|
+
elif t is uuid.UUID:
|
|
382
|
+
kind = "Uuid"
|
|
383
|
+
elif t is datetime.date:
|
|
384
|
+
kind = "Date"
|
|
385
|
+
elif t is datetime.time:
|
|
386
|
+
kind = "Time"
|
|
387
|
+
elif t is datetime.datetime:
|
|
388
|
+
kind = "OffsetDateTime"
|
|
389
|
+
elif t is datetime.timedelta:
|
|
390
|
+
kind = "TimeDelta"
|
|
391
|
+
|
|
392
|
+
if kind is None:
|
|
393
|
+
variant = AnalyzedUnknownType()
|
|
394
|
+
else:
|
|
395
|
+
variant = AnalyzedBasicType(kind=kind)
|
|
396
|
+
|
|
397
|
+
return AnalyzedTypeInfo(
|
|
398
|
+
core_type=core_type,
|
|
399
|
+
base_type=base_type,
|
|
400
|
+
variant=variant,
|
|
401
|
+
attrs=attrs,
|
|
402
|
+
nullable=nullable,
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _encode_struct_schema(
    struct_info: AnalyzedStructType, key_type: type | None = None
) -> tuple[dict[str, Any], int | None]:
    """Encode a struct (optionally preceded by key fields) as an engine struct schema.

    Args:
        struct_info: The analyzed struct whose fields form the schema.
        key_type: Optional key type. A basic key contributes one field named
            `KEY_FIELD_NAME`; a struct key contributes all of its fields. Key
            fields come before the struct's own fields.

    Returns:
        A pair `(schema, num_key_parts)`. `schema` has a "fields" list and, when
        the struct class has documentation, a "description". `num_key_parts` is
        the number of leading key fields, or None when `key_type` is None.

    Raises:
        ValueError: If `key_type` is neither a basic nor a struct type, or if a
            field's type cannot be encoded (a note naming the field is attached).
    """
    fields = []

    def add_field(
        name: str, analyzed_type: AnalyzedTypeInfo, description: str | None = None
    ) -> None:
        try:
            type_info = encode_enriched_type_info(analyzed_type)
        except ValueError as e:
            e.add_note(
                f"Failed to encode annotation for field - "
                f"{struct_info.struct_type.__name__}.{name}: {analyzed_type.core_type}"
            )
            raise
        type_info["name"] = name
        if description is not None:
            type_info["description"] = description
        fields.append(type_info)

    def add_fields_from_struct(struct_info: AnalyzedStructType) -> None:
        for field in struct_info.fields:
            add_field(field.name, analyze_type_info(field.type_hint), field.description)

    result: dict[str, Any] = {}
    num_key_parts = None
    if key_type is not None:
        key_type_info = analyze_type_info(key_type)
        if isinstance(key_type_info.variant, AnalyzedBasicType):
            add_field(KEY_FIELD_NAME, key_type_info)
            num_key_parts = 1
        elif isinstance(key_type_info.variant, AnalyzedStructType):
            add_fields_from_struct(key_type_info.variant)
            # Only key fields have been added so far, so this counts the key parts.
            num_key_parts = len(fields)
        else:
            raise ValueError(f"Unsupported key type: {key_type}")

    add_fields_from_struct(struct_info)

    result["fields"] = fields
    # Read the documentation of the described struct class; calling getdoc on
    # `struct_info` itself would return the docstring of the AnalyzedStructType
    # wrapper for every struct.
    if doc := inspect.getdoc(struct_info.struct_type):
        result["description"] = doc
    return result, num_key_parts
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
    """Encode an analyzed type as the engine's JSON-style value type.

    Mapping by variant:
      - basic  -> {"kind": <kind>}
      - struct -> struct schema plus "kind": "Struct"
      - list of structs -> {"kind": "LTable", "row": <struct schema>}
      - other list      -> {"kind": "Vector", "element_type", "dimension"}
      - dict with struct values -> {"kind": "KTable", "row", "num_key_parts"}
      - union  -> {"kind": "Union", "types": [...]}

    Raises:
        ValueError: For `Any`/missing annotations, unsupported types, a list of
            structs carrying vector info, or a dict whose value is not a struct.
    """
    variant = type_info.variant

    if isinstance(variant, AnalyzedAnyType):
        raise ValueError("Specific type annotation is expected")

    if isinstance(variant, AnalyzedUnknownType):
        raise ValueError(f"Unsupported type annotation: {type_info.core_type}")

    if isinstance(variant, AnalyzedBasicType):
        return {"kind": variant.kind}

    if isinstance(variant, AnalyzedStructType):
        encoded_type, _ = _encode_struct_schema(variant)
        encoded_type["kind"] = "Struct"
        return encoded_type

    if isinstance(variant, AnalyzedListType):
        elem_type_info = analyze_type_info(variant.elem_type)
        if isinstance(elem_type_info.variant, AnalyzedStructType):
            if variant.vector_info is not None:
                raise ValueError("LTable type must not have a vector info")
            # Encode the row struct exactly once; encoding the element as a
            # generic type first would repeat this work at every nesting level.
            row_type, _ = _encode_struct_schema(elem_type_info.variant)
            return {"kind": "LTable", "row": row_type}

        vector_info = variant.vector_info
        return {
            "kind": "Vector",
            "element_type": _encode_type(elem_type_info),
            "dimension": None if vector_info is None else vector_info.dim,
        }

    if isinstance(variant, AnalyzedDictType):
        value_type_info = analyze_type_info(variant.value_type)
        if not isinstance(value_type_info.variant, AnalyzedStructType):
            raise ValueError(
                f"KTable value must have a Struct type, got {value_type_info.core_type}"
            )
        row_type, num_key_parts = _encode_struct_schema(
            value_type_info.variant,
            variant.key_type,
        )
        return {
            "kind": "KTable",
            "row": row_type,
            "num_key_parts": num_key_parts,
        }

    if isinstance(variant, AnalyzedUnionType):
        return {
            "kind": "Union",
            "types": [
                _encode_type(analyze_type_info(typ)) for typ in variant.variant_types
            ],
        }
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def encode_enriched_type_info(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
    """
    Encode an `AnalyzedTypeInfo` to a CocoIndex engine's `EnrichedValueType` representation

    The result always has a "type" entry; "attrs" is added when attributes are
    present (not None) and "nullable" is added only when the type is nullable.
    """
    encoded: dict[str, Any] = {"type": _encode_type(type_info)}

    attrs = type_info.attrs
    if attrs is not None:
        encoded.update(attrs=attrs)
    if type_info.nullable:
        encoded.update(nullable=True)

    return encoded
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
@overload
|
|
526
|
+
def encode_enriched_type(t: None) -> None: ...
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@overload
|
|
530
|
+
def encode_enriched_type(t: Any) -> dict[str, Any]: ...
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def encode_enriched_type(t: Any) -> dict[str, Any] | None:
|
|
534
|
+
"""
|
|
535
|
+
Convert a Python type to a CocoIndex engine's type representation
|
|
536
|
+
"""
|
|
537
|
+
if t is None:
|
|
538
|
+
return None
|
|
539
|
+
|
|
540
|
+
return encode_enriched_type_info(analyze_type_info(t))
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def resolve_forward_ref(t: Any) -> Any:
    """Evaluate a string annotation into an object; return non-strings unchanged.

    The string is evaluated with names visible from this function's scope.
    """
    # NOTE(review): eval runs arbitrary expressions; presumably callers only
    # pass annotation strings taken from trusted source code. Confirm.
    return eval(t) if isinstance(t, str) else t  # pylint: disable=eval-used
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
# ========================= Engine Schema Types (Python mirror of Rust) =========================
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
@dataclasses.dataclass
|
|
553
|
+
class VectorTypeSchema:
|
|
554
|
+
element_type: "BasicValueType"
|
|
555
|
+
dimension: int | None
|
|
556
|
+
|
|
557
|
+
def __str__(self) -> str:
|
|
558
|
+
dimension_str = f", {self.dimension}" if self.dimension is not None else ""
|
|
559
|
+
return f"Vector[{self.element_type}{dimension_str}]"
|
|
560
|
+
|
|
561
|
+
def __repr__(self) -> str:
|
|
562
|
+
return self.__str__()
|
|
563
|
+
|
|
564
|
+
@staticmethod
|
|
565
|
+
def decode(obj: dict[str, Any]) -> "VectorTypeSchema":
|
|
566
|
+
return VectorTypeSchema(
|
|
567
|
+
element_type=BasicValueType.decode(obj["element_type"]),
|
|
568
|
+
dimension=obj.get("dimension"),
|
|
569
|
+
)
|
|
570
|
+
|
|
571
|
+
def encode(self) -> dict[str, Any]:
|
|
572
|
+
return {
|
|
573
|
+
"element_type": self.element_type.encode(),
|
|
574
|
+
"dimension": self.dimension,
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
@dataclasses.dataclass
class UnionTypeSchema:
    """Member types of a "Union" value type."""

    variants: list["BasicValueType"]

    def __str__(self) -> str:
        joined = " | ".join(map(str, self.variants))
        return f"Union[{joined}]"

    def __repr__(self) -> str:
        return str(self)

    @staticmethod
    def decode(obj: dict[str, Any]) -> "UnionTypeSchema":
        """Build a schema from a dict whose "types" entry lists the encoded members."""
        members = [BasicValueType.decode(encoded) for encoded in obj["types"]]
        return UnionTypeSchema(variants=members)

    def encode(self) -> dict[str, Any]:
        """Return the dict form with the encoded members under "types"."""
        return {"types": [member.encode() for member in self.variants]}
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
@dataclasses.dataclass
|
|
600
|
+
class BasicValueType:
|
|
601
|
+
"""
|
|
602
|
+
Mirror of Rust BasicValueType in JSON form.
|
|
603
|
+
|
|
604
|
+
For Vector and Union kinds, extra fields are populated accordingly.
|
|
605
|
+
"""
|
|
606
|
+
|
|
607
|
+
kind: Literal[
|
|
608
|
+
"Bytes",
|
|
609
|
+
"Str",
|
|
610
|
+
"Bool",
|
|
611
|
+
"Int64",
|
|
612
|
+
"Float32",
|
|
613
|
+
"Float64",
|
|
614
|
+
"Range",
|
|
615
|
+
"Uuid",
|
|
616
|
+
"Date",
|
|
617
|
+
"Time",
|
|
618
|
+
"LocalDateTime",
|
|
619
|
+
"OffsetDateTime",
|
|
620
|
+
"TimeDelta",
|
|
621
|
+
"Json",
|
|
622
|
+
"Vector",
|
|
623
|
+
"Union",
|
|
624
|
+
]
|
|
625
|
+
vector: VectorTypeSchema | None = None
|
|
626
|
+
union: UnionTypeSchema | None = None
|
|
627
|
+
|
|
628
|
+
def __str__(self) -> str:
|
|
629
|
+
if self.kind == "Vector" and self.vector is not None:
|
|
630
|
+
dimension_str = (
|
|
631
|
+
f", {self.vector.dimension}"
|
|
632
|
+
if self.vector.dimension is not None
|
|
633
|
+
else ""
|
|
634
|
+
)
|
|
635
|
+
return f"Vector[{self.vector.element_type}{dimension_str}]"
|
|
636
|
+
elif self.kind == "Union" and self.union is not None:
|
|
637
|
+
types_str = " | ".join(str(t) for t in self.union.variants)
|
|
638
|
+
return f"Union[{types_str}]"
|
|
639
|
+
else:
|
|
640
|
+
return self.kind
|
|
641
|
+
|
|
642
|
+
def __repr__(self) -> str:
|
|
643
|
+
return self.__str__()
|
|
644
|
+
|
|
645
|
+
@staticmethod
|
|
646
|
+
def decode(obj: dict[str, Any]) -> "BasicValueType":
|
|
647
|
+
kind = obj["kind"]
|
|
648
|
+
if kind == "Vector":
|
|
649
|
+
return BasicValueType(
|
|
650
|
+
kind=kind, # type: ignore[arg-type]
|
|
651
|
+
vector=VectorTypeSchema.decode(obj),
|
|
652
|
+
)
|
|
653
|
+
if kind == "Union":
|
|
654
|
+
return BasicValueType(
|
|
655
|
+
kind=kind, # type: ignore[arg-type]
|
|
656
|
+
union=UnionTypeSchema.decode(obj),
|
|
657
|
+
)
|
|
658
|
+
return BasicValueType(kind=kind) # type: ignore[arg-type]
|
|
659
|
+
|
|
660
|
+
def encode(self) -> dict[str, Any]:
|
|
661
|
+
result = {"kind": self.kind}
|
|
662
|
+
if self.kind == "Vector" and self.vector is not None:
|
|
663
|
+
result.update(self.vector.encode())
|
|
664
|
+
elif self.kind == "Union" and self.union is not None:
|
|
665
|
+
result.update(self.union.encode())
|
|
666
|
+
return result
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
@dataclasses.dataclass
|
|
670
|
+
class EnrichedValueType:
|
|
671
|
+
type: "ValueType"
|
|
672
|
+
nullable: bool = False
|
|
673
|
+
attrs: dict[str, Any] | None = None
|
|
674
|
+
|
|
675
|
+
def __str__(self) -> str:
|
|
676
|
+
result = str(self.type)
|
|
677
|
+
if self.nullable:
|
|
678
|
+
result += "?"
|
|
679
|
+
if self.attrs:
|
|
680
|
+
attrs_str = ", ".join(f"{k}: {v}" for k, v in self.attrs.items())
|
|
681
|
+
result += f" [{attrs_str}]"
|
|
682
|
+
return result
|
|
683
|
+
|
|
684
|
+
def __repr__(self) -> str:
|
|
685
|
+
return self.__str__()
|
|
686
|
+
|
|
687
|
+
@staticmethod
|
|
688
|
+
def decode(obj: dict[str, Any]) -> "EnrichedValueType":
|
|
689
|
+
return EnrichedValueType(
|
|
690
|
+
type=decode_engine_value_type(obj["type"]),
|
|
691
|
+
nullable=obj.get("nullable", False),
|
|
692
|
+
attrs=obj.get("attrs"),
|
|
693
|
+
)
|
|
694
|
+
|
|
695
|
+
def encode(self) -> dict[str, Any]:
|
|
696
|
+
result: dict[str, Any] = {"type": self.type.encode()}
|
|
697
|
+
if self.nullable:
|
|
698
|
+
result["nullable"] = True
|
|
699
|
+
if self.attrs is not None:
|
|
700
|
+
result["attrs"] = self.attrs
|
|
701
|
+
return result
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
@dataclasses.dataclass
class FieldSchema:
    """A named field: an enriched value type plus an optional description."""

    name: str
    value_type: EnrichedValueType
    description: str | None = None

    def __str__(self) -> str:
        return f"{self.name}: {self.value_type}"

    def __repr__(self) -> str:
        return str(self)

    @staticmethod
    def decode(obj: dict[str, Any]) -> "FieldSchema":
        """Build a field from a flat dict holding both the name and the value type entries."""
        # The value type entries sit at the same level as "name", so the whole
        # dict is handed to EnrichedValueType.decode.
        value_type = EnrichedValueType.decode(obj)
        return FieldSchema(
            name=obj["name"],
            value_type=value_type,
            description=obj.get("description"),
        )

    def encode(self) -> dict[str, Any]:
        """Return the flat dict form: value type entries, then "name" and optional "description"."""
        encoded = self.value_type.encode()
        encoded["name"] = self.name
        if self.description is None:
            return encoded
        encoded["description"] = self.description
        return encoded
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
@dataclasses.dataclass
class StructSchema:
    """An ordered list of fields with an optional description."""

    fields: list[FieldSchema]
    description: str | None = None

    def __str__(self) -> str:
        rendered = ", ".join(map(str, self.fields))
        return f"Struct({rendered})"

    def __repr__(self) -> str:
        return str(self)

    @classmethod
    def decode(cls, obj: dict[str, Any]) -> Self:
        """Build an instance of ``cls`` from a dict with "fields" and optional "description"."""
        decoded_fields = [FieldSchema.decode(encoded) for encoded in obj["fields"]]
        return cls(fields=decoded_fields, description=obj.get("description"))

    def encode(self) -> dict[str, Any]:
        """Return the dict form; "description" is included only when not None."""
        encoded: dict[str, Any] = {"fields": [f.encode() for f in self.fields]}
        if self.description is not None:
            encoded["description"] = self.description
        return encoded
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
@dataclasses.dataclass
class StructType(StructSchema):
    """A `StructSchema` tagged with ``kind == "Struct"`` so it can be used as a value type."""

    kind: Literal["Struct"] = "Struct"

    def __str__(self) -> str:
        # Same rendering as the parent schema.
        return StructSchema.__str__(self)

    # Declared on the subclass so @dataclass does not generate a field-based __repr__ here.
    def __repr__(self) -> str:
        return str(self)

    def encode(self) -> dict[str, Any]:
        """Return the parent's dict form with the "kind" entry appended."""
        encoded = super().encode()
        encoded.update(kind=self.kind)
        return encoded
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
@dataclasses.dataclass
class TableType:
    """A "KTable" or "LTable" value type with its row schema."""

    kind: Literal["KTable", "LTable"]
    row: StructSchema
    num_key_parts: int | None = None  # Only for KTable

    def __str__(self) -> str:
        if self.kind == "KTable":
            # A missing key-part count is displayed as 1.
            key_parts = 1 if self.num_key_parts is None else self.num_key_parts
            prefix = f"KTable({key_parts})"
        else:  # LTable
            prefix = "LTable"

        return f"{prefix}({self.row})"

    def __repr__(self) -> str:
        return str(self)

    @staticmethod
    def decode(obj: dict[str, Any]) -> "TableType":
        """Build a table type from a dict with "kind", "row" and optional "num_key_parts"."""
        return TableType(
            kind=obj["kind"],  # type: ignore[arg-type]
            row=StructSchema.decode(obj["row"]),
            num_key_parts=obj.get("num_key_parts"),
        )

    def encode(self) -> dict[str, Any]:
        """Return the dict form; "num_key_parts" is included only when not None."""
        encoded: dict[str, Any] = {"kind": self.kind, "row": self.row.encode()}
        if self.num_key_parts is None:
            return encoded
        encoded["num_key_parts"] = self.num_key_parts
        return encoded
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
# Any value type that can be decoded from or encoded to the engine's dict form.
ValueType = BasicValueType | StructType | TableType
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
def decode_engine_field_schemas(objs: list[dict[str, Any]]) -> list[FieldSchema]:
    """Decode each dict in ``objs`` into a `FieldSchema`, preserving order."""
    return list(map(FieldSchema.decode, objs))
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
def decode_engine_value_type(obj: dict[str, Any]) -> ValueType:
    """Decode a value type dict, picking the decoder from its "kind" entry.

    "Struct" decodes to `StructType`, any kind in `TABLE_TYPES` to `TableType`,
    and every other kind to `BasicValueType`.
    """
    kind = obj["kind"]
    if kind == "Struct":
        decoder = StructType.decode
    elif kind in TABLE_TYPES:
        decoder = TableType.decode
    else:
        # Otherwise it's a basic value
        decoder = BasicValueType.decode
    return decoder(obj)
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
def encode_engine_value_type(value_type: ValueType) -> dict[str, Any]:
    """Return the dictionary representation produced by ``value_type.encode()``."""
    encoded = value_type.encode()
    return encoded
|