cocoindex 0.2.3__cp311-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cocoindex/typing.py ADDED
@@ -0,0 +1,473 @@
1
+ import collections
2
+ import dataclasses
3
+ import datetime
4
+ import inspect
5
+ import types
6
+ import typing
7
+ import uuid
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Annotated,
11
+ Any,
12
+ Generic,
13
+ Literal,
14
+ NamedTuple,
15
+ Protocol,
16
+ TypeVar,
17
+ overload,
18
+ )
19
+
20
+ import numpy as np
21
+ from numpy.typing import NDArray
22
+
23
+
24
+ class VectorInfo(NamedTuple):
25
+ dim: int | None
26
+
27
+
28
class TypeKind(NamedTuple):
    """Annotation payload that names the CocoIndex type kind of a type."""

    # Kind name, e.g. "Int64", "Float32", "Json".
    kind: str
30
+
31
+
32
class TypeAttr:
    """A key/value attribute that can be attached to a type as annotation metadata."""

    key: str
    value: Any

    def __init__(self, key: str, value: Any):
        # Store both parts verbatim; no validation or copying is performed.
        self.key, self.value = key, value
39
+
40
+
41
# Any metadata object this module recognizes inside `Annotated[...]`.
Annotation = TypeKind | TypeAttr | VectorInfo

# Numeric aliases: plain Python types tagged with an explicit kind.
Int64 = Annotated[int, TypeKind("Int64")]
Float32 = Annotated[float, TypeKind("Float32")]
Float64 = Annotated[float, TypeKind("Float64")]

# A pair of integers tagged as a "Range".
Range = Annotated[tuple[int, int], TypeKind("Range")]

# Arbitrary value tagged as "Json".
Json = Annotated[Any, TypeKind("Json")]

# Two datetime flavours sharing the same Python type but different kinds.
LocalDateTime = Annotated[datetime.datetime, TypeKind("LocalDateTime")]
OffsetDateTime = Annotated[datetime.datetime, TypeKind("OffsetDateTime")]
50
+
51
if TYPE_CHECKING:
    T_co = TypeVar("T_co", covariant=True)
    Dim_co = TypeVar("Dim_co", bound=int | None, covariant=True, default=None)

    class Vector(Protocol, Generic[T_co, Dim_co]):
        """Static-typing view of Vector[T, Dim]: an indexable, sized container of T."""

        def __getitem__(self, index: int) -> T_co: ...
        def __len__(self) -> int: ...

else:

    class Vector:  # type: ignore[unreachable]
        """
        Runtime form of `Vector[...]`.

        Subscripting returns an `Annotated[...]` alias whose metadata is a
        `VectorInfo`; the underlying container is `NDArray[dtype]` for NumPy
        numeric element types (or ndarray elements) and `list[dtype]` otherwise.
        """

        def __class_getitem__(cls, params):
            if isinstance(params, tuple):
                # Vector[elem, Literal[N]]: element type plus a dimension.
                elem_type, dim_spec = params
                dim = None
                # Only a `Literal[...]` contributes a dimension; anything else
                # leaves it unspecified.
                if typing.get_origin(dim_spec) is Literal:
                    dim = typing.get_args(dim_spec)[0]
            else:
                # Vector[elem]: no dimension given.
                elem_type, dim = params, None
            info = VectorInfo(dim=dim)

            # NDArray for NumPy numeric dtypes / ndarray elements, list otherwise.
            elem_base = analyze_type_info(elem_type).base_type
            if is_numpy_number_type(elem_base) or elem_base is np.ndarray:
                return Annotated[NDArray[elem_type], info]
            return Annotated[list[elem_type], info]
87
+
88
+
89
# Kind names used for table types.
TABLE_TYPES: tuple[str, str] = ("KTable", "LTable")

# Field name given to a table key when the key is a single basic-typed value.
KEY_FIELD_NAME: str = "_key"
91
+
92
+
93
def extract_ndarray_elem_dtype(ndarray_type: Any) -> Any:
    """
    Return the element dtype of a parameterized ndarray type.

    The type is expected to carry exactly two type arguments (as produced by
    `NDArray[T]`); the second one is the dtype specification, whose first
    type argument is returned.

    Raises:
        ValueError: if `ndarray_type` is not parameterized with exactly two
            arguments (e.g. a bare `np.ndarray`), or if the dtype
            specification itself has no type argument.
    """
    args = typing.get_args(ndarray_type)
    # Checked explicitly so that a bare/mis-parameterized ndarray produces an
    # actionable message instead of a tuple-unpacking error.
    if len(args) != 2:
        raise ValueError(
            f"Invalid NDArray type: {ndarray_type}. "
            "Expected a parameterized form such as NDArray[np.float32]."
        )
    _, dtype_spec = args
    dtype_args = typing.get_args(dtype_spec)
    if not dtype_args:
        raise ValueError(f"Invalid dtype specification: {dtype_spec}")
    return dtype_args[0]
100
+
101
+
102
def is_numpy_number_type(t: type) -> bool:
    """Tell whether `t` is a class derived from NumPy's integer or floating scalar types."""
    if not isinstance(t, type):
        # Non-class objects (instances, generic aliases, ...) never qualify.
        return False
    return issubclass(t, (np.integer, np.floating))
104
+
105
+
106
def is_namedtuple_type(t: type) -> bool:
    """Tell whether `t` is a tuple subclass exposing `_fields` (i.e. a named tuple class)."""
    if not (isinstance(t, type) and issubclass(t, tuple)):
        return False
    return hasattr(t, "_fields")
108
+
109
+
110
+ def is_struct_type(t: Any) -> bool:
111
+ return isinstance(t, type) and (
112
+ dataclasses.is_dataclass(t) or is_namedtuple_type(t)
113
+ )
114
+
115
+
116
class DtypeRegistry:
    """
    Lookup table from supported NumPy scalar dtypes to CocoIndex kind names.
    """

    _DTYPE_TO_KIND: dict[Any, str] = {
        np.float32: "Float32",
        np.float64: "Float64",
        np.int64: "Int64",
    }

    @classmethod
    def validate_dtype_and_get_kind(cls, dtype: Any) -> str:
        """
        Return the kind name registered for `dtype`.

        Raises:
            TypeError: if `dtype` is `typing.Any`.
            ValueError: if `dtype` is not one of the registered dtypes.
        """
        if dtype is Any:
            raise TypeError(
                "NDArray for Vector must use a concrete numpy dtype, got `Any`."
            )
        if (kind := cls._DTYPE_TO_KIND.get(dtype)) is not None:
            return kind
        raise ValueError(
            f"Unsupported NumPy dtype in NDArray: {dtype}. "
            f"Supported dtypes: {cls._DTYPE_TO_KIND.keys()}"
        )
144
+
145
+
146
class AnalyzedAnyType(NamedTuple):
    """Variant for a type annotation that is absent or is `Any`."""
150
+
151
+
152
class AnalyzedBasicType(NamedTuple):
    """Variant for a type that maps onto a single basic kind (including Json)."""

    # Kind name, e.g. "Str", "Int64", "Json".
    kind: str
158
+
159
+
160
class AnalyzedListType(NamedTuple):
    """Variant for list-like types such as list[T], Sequence[T] or NDArray[T]."""

    # Element type annotation (not yet analyzed).
    elem_type: Any
    # Vector metadata attached via annotation, if any.
    vector_info: VectorInfo | None
167
+
168
+
169
class AnalyzedStructType(NamedTuple):
    """Variant for struct-like types (dataclasses and named tuples)."""

    # The struct class itself.
    struct_type: type
175
+
176
+
177
class AnalyzedUnionType(NamedTuple):
    """Variant for union types of the form T1 | T2 | ..."""

    # Member type annotations of the union (not yet analyzed).
    variant_types: list[Any]
183
+
184
+
185
class AnalyzedDictType(NamedTuple):
    """Variant for mapping types such as dict[K, V] or Mapping[K, V]."""

    # Key type annotation (not yet analyzed).
    key_type: Any
    # Value type annotation (not yet analyzed).
    value_type: Any
192
+
193
+
194
class AnalyzedUnknownType(NamedTuple):
    """Variant for a type that CocoIndex does not support."""
198
+
199
+
200
# Closed set of outcomes a type analysis can produce.
AnalyzedTypeVariant = (
    AnalyzedAnyType
    | AnalyzedBasicType
    | AnalyzedListType
    | AnalyzedStructType
    | AnalyzedUnionType
    | AnalyzedDictType
    | AnalyzedUnknownType
)
209
+
210
+
211
@dataclasses.dataclass
class AnalyzedTypeInfo:
    """
    Result of analyzing one Python type annotation.
    """

    # Annotation with `Annotated[...]` metadata removed, parameters kept
    # (e.g. int, list[int], dict[str, int]).
    core_type: Any
    # Same as above with type parameters removed too (e.g. int, list, dict).
    base_type: Any
    # Which category the type falls into, plus category-specific details.
    variant: AnalyzedTypeVariant
    # Key/value attributes collected from TypeAttr metadata; None when absent.
    attrs: dict[str, Any] | None
    # True when the annotation was a union that included None.
    nullable: bool = False
224
+
225
+
226
def analyze_type_info(t: Any) -> AnalyzedTypeInfo:
    """
    Break a Python type annotation down into the pieces CocoIndex works with.

    `Annotated[...]` wrappers are peeled off and their metadata interpreted
    (TypeAttr -> attrs, VectorInfo -> vector info, TypeKind -> explicit kind).
    The remaining type is then classified into one of the `Analyzed*Type`
    variants. A union with exactly one non-None member is analyzed as that
    member with `nullable` set accordingly.
    """
    annotations: tuple[Annotation, ...] = ()
    type_args: tuple[Any, ...] = ()

    # Strip `Annotated` layers; the metadata of the last layer seen wins.
    base_type = typing.get_origin(t)
    while base_type is Annotated:
        annotations = t.__metadata__
        t = t.__origin__
        base_type = typing.get_origin(t)
    if base_type is None:
        # Not a parameterized generic: the type is its own base.
        base_type = t
    else:
        type_args = typing.get_args(t)
    core_type = t

    # Interpret the annotation metadata.
    attrs: dict[str, Any] | None = None
    vector_info: VectorInfo | None = None
    kind: str | None = None
    for annotation in annotations:
        if isinstance(annotation, TypeAttr):
            attrs = {} if attrs is None else attrs
            attrs[annotation.key] = annotation.value
        elif isinstance(annotation, VectorInfo):
            vector_info = annotation
        elif isinstance(annotation, TypeKind):
            kind = annotation.kind

    nullable = False
    variant: AnalyzedTypeVariant

    if kind is not None:
        # An explicit TypeKind annotation overrides any structural analysis.
        variant = AnalyzedBasicType(kind=kind)
    elif base_type is Any or base_type is inspect.Parameter.empty:
        variant = AnalyzedAnyType()
    elif is_struct_type(base_type):
        variant = AnalyzedStructType(struct_type=t)
    elif is_numpy_number_type(t):
        variant = AnalyzedBasicType(
            kind=DtypeRegistry.validate_dtype_and_get_kind(t)
        )
    elif base_type is collections.abc.Sequence or base_type is list:
        variant = AnalyzedListType(
            elem_type=type_args[0] if type_args else Any,
            vector_info=vector_info,
        )
    elif base_type is np.ndarray:
        variant = AnalyzedListType(
            elem_type=extract_ndarray_elem_dtype(t),
            vector_info=vector_info,
        )
    elif base_type is collections.abc.Mapping or base_type is dict or t is dict:
        variant = AnalyzedDictType(
            key_type=type_args[0] if len(type_args) > 0 else Any,
            value_type=type_args[1] if len(type_args) > 1 else Any,
        )
    elif base_type in (types.UnionType, typing.Union):
        members = [arg for arg in type_args if arg not in (None, types.NoneType)]
        if not members:
            return analyze_type_info(None)

        nullable = len(members) < len(type_args)
        if len(members) == 1:
            # `T | None`: report T itself, flagged as nullable.
            # NOTE(review): attrs gathered from an outer `Annotated[...]` are
            # not carried into this result - confirm that is intended.
            inner = analyze_type_info(members[0])
            inner.nullable = nullable
            return inner

        variant = AnalyzedUnionType(variant_types=members)
    else:
        # Plain Python types with a fixed kind; matched by identity.
        builtin_kinds = (
            (bytes, "Bytes"),
            (str, "Str"),
            (bool, "Bool"),
            (int, "Int64"),
            (float, "Float64"),
            (uuid.UUID, "Uuid"),
            (datetime.date, "Date"),
            (datetime.time, "Time"),
            (datetime.datetime, "OffsetDateTime"),
            (datetime.timedelta, "TimeDelta"),
        )
        builtin_kind = next(
            (name for candidate, name in builtin_kinds if t is candidate), None
        )
        if builtin_kind is None:
            variant = AnalyzedUnknownType()
        else:
            variant = AnalyzedBasicType(kind=builtin_kind)

    return AnalyzedTypeInfo(
        core_type=core_type,
        base_type=base_type,
        variant=variant,
        attrs=attrs,
        nullable=nullable,
    )
329
+
330
+
331
def _encode_struct_schema(
    struct_type: type, key_type: type | None = None
) -> tuple[dict[str, Any], int | None]:
    """
    Encode a struct type (dataclass or named tuple) as a schema dict.

    When `key_type` is given, key fields are emitted before the struct's own
    fields: a basic-typed key becomes a single field named KEY_FIELD_NAME, a
    struct-typed key contributes each of its fields.

    Returns:
        `(schema, num_key_parts)` where `schema` has a "fields" list and, if
        the struct has a docstring, a "description"; `num_key_parts` is the
        number of leading key fields, or None when no `key_type` was given.

    Raises:
        ValueError: for an unsupported key type, a struct that is neither a
            dataclass nor a named tuple, or a field whose type cannot be
            encoded (a note naming the field is attached).
    """
    fields: list[dict[str, Any]] = []

    def add_field(name: str, analyzed_type: AnalyzedTypeInfo) -> None:
        try:
            encoded = encode_enriched_type_info(analyzed_type)
        except ValueError as e:
            # Attach the field location; the note always names the outer struct.
            e.add_note(
                f"Failed to encode annotation for field - "
                f"{struct_type.__name__}.{name}: {analyzed_type.core_type}"
            )
            raise
        encoded["name"] = name
        fields.append(encoded)

    def add_fields_from_struct(source_type: type) -> None:
        if dataclasses.is_dataclass(source_type):
            members = [(f.name, f.type) for f in dataclasses.fields(source_type)]
        elif is_namedtuple_type(source_type):
            members = list(source_type.__annotations__.items())
        else:
            raise ValueError(f"Unsupported struct type: {source_type}")
        for member_name, member_type in members:
            add_field(member_name, analyze_type_info(member_type))

    num_key_parts = None
    if key_type is not None:
        key_info = analyze_type_info(key_type)
        key_variant = key_info.variant
        if isinstance(key_variant, AnalyzedBasicType):
            add_field(KEY_FIELD_NAME, key_info)
            num_key_parts = 1
        elif isinstance(key_variant, AnalyzedStructType):
            add_fields_from_struct(key_variant.struct_type)
            # Only key fields have been added so far.
            num_key_parts = len(fields)
        else:
            raise ValueError(f"Unsupported key type: {key_type}")

    add_fields_from_struct(struct_type)

    result: dict[str, Any] = {"fields": fields}
    description = inspect.getdoc(struct_type)
    if description:
        result["description"] = description
    return result, num_key_parts
377
+
378
+
379
def _encode_type(type_info: AnalyzedTypeInfo) -> dict[str, Any]:
    """
    Encode the variant of an analyzed type as a dict.

    Produced shapes, by variant:
      * basic  -> {"kind": <kind>}
      * struct -> struct schema plus "kind": "Struct"
      * list   -> {"kind": "LTable", "row": ...} when the element is a struct,
                  otherwise {"kind": "Vector", "element_type": ..., "dimension": ...}
      * dict   -> {"kind": "KTable", "row": ..., "num_key_parts": ...}
      * union  -> {"kind": "Union", "types": [...]}

    Raises:
        ValueError: for Any/unknown types, an LTable carrying vector info, a
            dict whose value type is not a struct, or an unrecognized variant.
    """
    variant = type_info.variant

    if isinstance(variant, AnalyzedAnyType):
        raise ValueError("Specific type annotation is expected")

    if isinstance(variant, AnalyzedUnknownType):
        raise ValueError(f"Unsupported type annotation: {type_info.core_type}")

    if isinstance(variant, AnalyzedBasicType):
        return {"kind": variant.kind}

    if isinstance(variant, AnalyzedStructType):
        encoded_type, _ = _encode_struct_schema(variant.struct_type)
        encoded_type["kind"] = "Struct"
        return encoded_type

    if isinstance(variant, AnalyzedListType):
        elem_type_info = analyze_type_info(variant.elem_type)
        if isinstance(elem_type_info.variant, AnalyzedStructType):
            if variant.vector_info is not None:
                raise ValueError("LTable type must not have a vector info")
            row_type, _ = _encode_struct_schema(elem_type_info.variant.struct_type)
            return {"kind": "LTable", "row": row_type}

        # The element is encoded only on this path: encoding it up front would
        # encode struct rows twice, doubling the work per nesting level.
        vector_info = variant.vector_info
        return {
            "kind": "Vector",
            "element_type": _encode_type(elem_type_info),
            "dimension": vector_info and vector_info.dim,
        }

    if isinstance(variant, AnalyzedDictType):
        value_type_info = analyze_type_info(variant.value_type)
        if not isinstance(value_type_info.variant, AnalyzedStructType):
            raise ValueError(
                f"KTable value must have a Struct type, got {value_type_info.core_type}"
            )
        row_type, num_key_parts = _encode_struct_schema(
            value_type_info.variant.struct_type,
            variant.key_type,
        )
        return {
            "kind": "KTable",
            "row": row_type,
            "num_key_parts": num_key_parts,
        }

    if isinstance(variant, AnalyzedUnionType):
        return {
            "kind": "Union",
            "types": [
                _encode_type(analyze_type_info(typ)) for typ in variant.variant_types
            ],
        }

    # Every known variant is handled above; fail loudly rather than silently
    # returning None for an unexpected value.
    raise ValueError(f"Unsupported type annotation: {type_info.core_type}")
435
+
436
+
437
def encode_enriched_type_info(enriched_type_info: AnalyzedTypeInfo) -> dict[str, Any]:
    """
    Encode an analyzed type, together with its attrs and nullability, as a dict.

    The result always has a "type" entry; "attrs" is included only when attrs
    are present, and "nullable" only when the type is nullable.
    """
    encoded: dict[str, Any] = {"type": _encode_type(enriched_type_info)}

    attrs = enriched_type_info.attrs
    if attrs is not None:
        encoded["attrs"] = attrs
    if enriched_type_info.nullable:
        encoded["nullable"] = True
    return encoded
450
+
451
+
452
+ @overload
453
+ def encode_enriched_type(t: None) -> None: ...
454
+
455
+
456
+ @overload
457
+ def encode_enriched_type(t: Any) -> dict[str, Any]: ...
458
+
459
+
460
+ def encode_enriched_type(t: Any) -> dict[str, Any] | None:
461
+ """
462
+ Convert a Python type to a CocoIndex engine's type representation
463
+ """
464
+ if t is None:
465
+ return None
466
+
467
+ return encode_enriched_type_info(analyze_type_info(t))
468
+
469
+
470
def resolve_forward_ref(t: Any) -> Any:
    """
    Turn a string annotation into the object it names; non-strings are returned unchanged.
    """
    # NOTE(review): eval() runs the string as arbitrary Python in this module's
    # namespace - callers must only pass trusted annotation strings.
    return eval(t) if isinstance(t, str) else t  # pylint: disable=eval-used
@@ -0,0 +1,51 @@
1
+ import os
2
+ import sys
3
+ import importlib
4
+ import click
5
+ import types
6
+
7
+
8
def load_user_app(app_target: str) -> types.ModuleType:
    """
    Load the user's application from a file path or an importable module name.

    A target containing the OS path separator or ending in ".py" is treated
    as a file path; anything else is imported as a module name. While a file
    is being loaded its directory is temporarily placed at the front of
    `sys.path` (unless it is already present).

    Args:
        app_target: File path or module name of the application.

    Returns:
        The loaded module.

    Raises:
        click.ClickException: if the target is empty, the file does not
            exist, or the import fails. For file targets only ImportError,
            FileNotFoundError and PermissionError are converted; other
            exceptions raised by the application itself propagate unchanged.
    """
    # `import importlib` alone does not guarantee that the `util` submodule
    # is loaded and bound as an attribute, so import it explicitly.
    import importlib.util

    if not app_target:
        raise click.ClickException("Application target not provided.")

    looks_like_path = os.sep in app_target or app_target.lower().endswith(".py")

    if looks_like_path:
        if not os.path.isfile(app_target):
            raise click.ClickException(f"Application file path not found: {app_target}")
        app_path = os.path.abspath(app_target)
        app_dir = os.path.dirname(app_path)
        module_name = os.path.splitext(os.path.basename(app_path))[0]

        # Remember whether the entry is ours, so cleanup never removes a
        # sys.path entry that existed before this call.
        inserted = app_dir not in sys.path
        if inserted:
            sys.path.insert(0, app_dir)
        try:
            spec = importlib.util.spec_from_file_location(module_name, app_path)
            if spec is None:
                raise ImportError(f"Could not create spec for file: {app_path}")
            if spec.loader is None:
                raise ImportError(f"Could not create loader for file: {app_path}")
            module = importlib.util.module_from_spec(spec)
            # Register before executing so the module can be found by name
            # while its own code runs.
            sys.modules[spec.name] = module
            try:
                spec.loader.exec_module(module)
            except BaseException:
                # Do not leave a half-initialized module registered.
                if sys.modules.get(spec.name) is module:
                    del sys.modules[spec.name]
                raise
            return module
        except (ImportError, FileNotFoundError, PermissionError) as e:
            raise click.ClickException(
                f"Failed importing file '{app_path}': {e}"
            ) from e
        finally:
            # Remove by value: the application may have pushed other entries
            # in front of ours while it was executing.
            if inserted and app_dir in sys.path:
                sys.path.remove(app_dir)

    # Try as module
    try:
        return importlib.import_module(app_target)
    except ImportError as e:
        raise click.ClickException(
            f"Failed to load module '{app_target}': {e}"
        ) from e
    except Exception as e:
        raise click.ClickException(
            f"Unexpected error importing module '{app_target}': {e}"
        ) from e
cocoindex/utils.py ADDED
@@ -0,0 +1,20 @@
1
+ from .flow import Flow
2
+ from .setting import get_app_namespace
3
+
4
+
5
def get_target_default_name(flow: Flow, target_name: str, delimiter: str = "__") -> str:
    """
    Build the default name for a target of `flow`.

    The name is the app namespace (requested with `delimiter` as its trailing
    delimiter), followed by the flow name, the delimiter and `target_name`.
    Most targets use it as the name of the underlying object (e.g. a table or
    a collection) when none is specified explicitly.
    """
    parts = (
        get_app_namespace(trailing_delimiter=delimiter),
        flow.name,
        delimiter,
        target_name,
    )
    return "".join(parts)
16
+
17
+
18
# Deprecated: Use get_target_default_name instead
get_target_storage_default_name = get_target_default_name
@@ -0,0 +1,104 @@
1
+ """
2
+ Naming validation for CocoIndex identifiers.
3
+
4
+ This module enforces naming conventions for flow names, field names,
5
+ target names, and app namespace names as specified in issue #779.
6
+ """
7
+
8
+ import re
9
+ from typing import Optional
10
+
11
# A letter or underscore followed by letters, digits and underscores.
_IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
# Same as above, additionally allowing dots after the first character.
_IDENTIFIER_WITH_DOTS_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_.]*$")


class NamingError(ValueError):
    """Exception raised for naming convention violations."""


def validate_identifier_name(
    name: str,
    max_length: int = 64,
    allow_dots: bool = False,
    identifier_type: str = "identifier",
) -> Optional[str]:
    """
    Validate identifier names according to CocoIndex naming rules.

    Checks are applied in order: non-empty, length limit, no leading double
    underscore, then the allowed-character pattern. The first failing check
    determines the message.

    Args:
        name: The name to validate
        max_length: Maximum allowed length (default 64)
        allow_dots: Whether to allow dots in the name (for full flow names)
        identifier_type: Type of identifier for error messages

    Returns:
        None if valid, error message string if invalid
    """
    if not name:
        return f"{identifier_type} name cannot be empty"

    if len(name) > max_length:
        return f"{identifier_type} name '{name}' exceeds maximum length of {max_length} characters"

    if name.startswith("__"):
        return f"{identifier_type} name '{name}' cannot start with double underscores (reserved for internal usage)"

    # Define allowed pattern
    if allow_dots:
        pattern = _IDENTIFIER_WITH_DOTS_PATTERN
        allowed_chars = "letters, digits, underscores, and dots"
    else:
        pattern = _IDENTIFIER_PATTERN
        allowed_chars = "letters, digits, and underscores"

    # fullmatch, not match: `$` also matches just before a trailing newline,
    # so match() would accept a name such as "abc\n".
    if not pattern.fullmatch(name):
        return f"{identifier_type} name '{name}' must start with a letter or underscore and contain only {allowed_chars}"

    return None
60
+
61
+
62
def validate_field_name(name: str) -> None:
    """Raise NamingError unless `name` is a valid field name (max 64 chars, no dots)."""
    if error := validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="Field"
    ):
        raise NamingError(error)
69
+
70
+
71
def validate_flow_name(name: str) -> None:
    """Raise NamingError unless `name` is a valid flow name (max 64 chars, no dots)."""
    if error := validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="Flow"
    ):
        raise NamingError(error)
78
+
79
+
80
def validate_full_flow_name(name: str) -> None:
    """Raise NamingError unless `name` is a valid full flow name (max 64 chars, dots allowed for namespacing)."""
    if error := validate_identifier_name(
        name, max_length=64, allow_dots=True, identifier_type="Full flow"
    ):
        raise NamingError(error)
87
+
88
+
89
def validate_app_namespace_name(name: str) -> None:
    """Raise NamingError unless `name` is a valid app namespace name (max 64 chars, no dots)."""
    if error := validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="App namespace"
    ):
        raise NamingError(error)
96
+
97
+
98
def validate_target_name(name: str) -> None:
    """Raise NamingError unless `name` is a valid target name (max 64 chars, no dots)."""
    if error := validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="Target"
    ):
        raise NamingError(error)