cocoindex 0.1.70__cp313-cp313t-manylinux_2_28_aarch64.whl → 0.1.72__cp313-cp313t-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +1 -0
- cocoindex/_engine.cpython-313t-aarch64-linux-gnu.so +0 -0
- cocoindex/convert.py +79 -4
- cocoindex/flow.py +16 -7
- cocoindex/functions.py +8 -7
- cocoindex/op.py +33 -4
- cocoindex/setting.py +3 -0
- cocoindex/tests/test_convert.py +127 -0
- cocoindex/tests/test_validation.py +134 -0
- cocoindex/validation.py +104 -0
- {cocoindex-0.1.70.dist-info → cocoindex-0.1.72.dist-info}/METADATA +12 -11
- {cocoindex-0.1.70.dist-info → cocoindex-0.1.72.dist-info}/RECORD +15 -13
- {cocoindex-0.1.70.dist-info → cocoindex-0.1.72.dist-info}/WHEEL +0 -0
- {cocoindex-0.1.70.dist-info → cocoindex-0.1.72.dist-info}/entry_points.txt +0 -0
- {cocoindex-0.1.70.dist-info → cocoindex-0.1.72.dist-info}/licenses/LICENSE +0 -0
cocoindex/__init__.py
CHANGED
Binary file
|
cocoindex/convert.py
CHANGED
@@ -92,10 +92,14 @@ def make_engine_value_decoder(
|
|
92
92
|
if src_type_kind == "Struct":
|
93
93
|
return _make_engine_struct_to_dict_decoder(field_path, src_type["fields"])
|
94
94
|
if src_type_kind in TABLE_TYPES:
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
95
|
+
if src_type_kind == "LTable":
|
96
|
+
return _make_engine_ltable_to_list_dict_decoder(
|
97
|
+
field_path, src_type["row"]["fields"]
|
98
|
+
)
|
99
|
+
elif src_type_kind == "KTable":
|
100
|
+
return _make_engine_ktable_to_dict_dict_decoder(
|
101
|
+
field_path, src_type["row"]["fields"]
|
102
|
+
)
|
99
103
|
return lambda value: value
|
100
104
|
|
101
105
|
# Handle struct -> dict binding for explicit dict annotations
|
@@ -340,6 +344,77 @@ def _make_engine_struct_to_dict_decoder(
|
|
340
344
|
return decode_to_dict
|
341
345
|
|
342
346
|
|
347
|
+
def _make_engine_ltable_to_list_dict_decoder(
    field_path: list[str],
    src_fields: list[dict[str, Any]],
) -> Callable[[list[Any] | None], list[dict[str, Any]] | None]:
    """Make a decoder from engine LTable values to a list of dicts.

    Args:
        field_path: Path of the field being decoded; forwarded unchanged to
            the per-row struct decoder.
        src_fields: Field schemas of a single row.

    Returns:
        A callable that maps ``None`` to ``None`` and a list of engine rows
        to a list with one dict per row, in the same order.
    """
    # Each row is decoded with the same struct -> dict decoder, built once.
    row_decoder = _make_engine_struct_to_dict_decoder(field_path, src_fields)

    def decode_to_list_dict(values: list[Any] | None) -> list[dict[str, Any]] | None:
        if values is None:
            return None
        result = []
        for i, row_values in enumerate(values):
            decoded_row = row_decoder(row_values)
            # A missing row is rejected rather than silently dropped or kept
            # as None; the index is reported to help locate the bad row.
            if decoded_row is None:
                raise ValueError(
                    f"LTable row at index {i} decoded to None, which is not allowed."
                )
            result.append(decoded_row)
        return result

    return decode_to_list_dict
|
370
|
+
|
371
|
+
|
372
|
+
def _make_engine_ktable_to_dict_dict_decoder(
    field_path: list[str],
    src_fields: list[dict[str, Any]],
) -> Callable[[list[Any] | None], dict[Any, dict[str, Any]] | None]:
    """Make a decoder from engine KTable values to a dict of dicts.

    Args:
        field_path: Path of the field being decoded. It is extended while the
            key decoder is built and restored before this function returns.
        src_fields: Field schemas of a single row; the first one describes
            the key, the rest describe the value.

    Returns:
        A callable that maps ``None`` to ``None`` and a list of engine rows
        to ``{decoded_key: {value_field_name: value, ...}}``.

    Raises:
        ValueError: If ``src_fields`` is empty (there is no key field).
    """
    if not src_fields:
        raise ValueError("KTable must have at least one field for the key")

    # First field is the key, remaining fields are the value
    key_field_schema = src_fields[0]
    value_fields_schema = src_fields[1:]

    # Build the key decoder with the key's name temporarily pushed onto the
    # shared path; ``finally`` restores the caller's list even when building
    # the decoder raises.
    field_path.append(f".{key_field_schema.get('name', KEY_FIELD_NAME)}")
    try:
        key_decoder = make_engine_value_decoder(
            field_path, key_field_schema["type"], Any
        )
    finally:
        field_path.pop()

    value_decoder = _make_engine_struct_to_dict_decoder(field_path, value_fields_schema)

    def decode_to_dict_dict(
        values: list[Any] | None,
    ) -> dict[Any, dict[str, Any]] | None:
        if values is None:
            return None
        result = {}
        for row_values in values:
            if not row_values:
                raise ValueError("KTable row must have at least 1 value (the key)")
            key = key_decoder(row_values[0])
            if len(row_values) == 1:
                # Key-only row: there are no value fields to decode.
                value: dict[str, Any] = {}
            else:
                tmp = value_decoder(row_values[1:])
                # A None from the struct decoder is normalized to an empty dict.
                value = tmp if tmp is not None else {}
            # A key decoded to a dict is unhashable, so its values are used
            # as a tuple key instead.
            if isinstance(key, dict):
                key = tuple(key.values())
            result[key] = value
        return result

    return decode_to_dict_dict
|
416
|
+
|
417
|
+
|
343
418
|
def dump_engine_object(v: Any) -> Any:
|
344
419
|
"""Recursively dump an object for engine. Engine side uses `Pythonized` to catch."""
|
345
420
|
if v is None:
|
cocoindex/flow.py
CHANGED
@@ -10,6 +10,13 @@ import functools
|
|
10
10
|
import inspect
|
11
11
|
import re
|
12
12
|
|
13
|
+
from .validation import (
|
14
|
+
validate_flow_name,
|
15
|
+
NamingError,
|
16
|
+
validate_full_flow_name,
|
17
|
+
validate_target_name,
|
18
|
+
)
|
19
|
+
|
13
20
|
from dataclasses import dataclass
|
14
21
|
from enum import Enum
|
15
22
|
from threading import Lock
|
@@ -300,6 +307,9 @@ class DataScope:
|
|
300
307
|
)
|
301
308
|
|
302
309
|
def __setitem__(self, field_name: str, value: DataSlice[T]) -> None:
|
310
|
+
from .validation import validate_field_name
|
311
|
+
|
312
|
+
validate_field_name(field_name)
|
303
313
|
value._state.attach_to_scope(self._engine_data_scope, field_name)
|
304
314
|
|
305
315
|
def __enter__(self) -> DataScope:
|
@@ -367,7 +377,7 @@ class DataCollector:
|
|
367
377
|
|
368
378
|
def export(
|
369
379
|
self,
|
370
|
-
|
380
|
+
target_name: str,
|
371
381
|
target_spec: op.TargetSpec,
|
372
382
|
/,
|
373
383
|
*,
|
@@ -381,6 +391,8 @@ class DataCollector:
|
|
381
391
|
|
382
392
|
`vector_index` is for backward compatibility only. Please use `vector_indexes` instead.
|
383
393
|
"""
|
394
|
+
|
395
|
+
validate_target_name(target_name)
|
384
396
|
if not isinstance(target_spec, op.TargetSpec):
|
385
397
|
raise ValueError(
|
386
398
|
"export() can only be called on a CocoIndex target storage"
|
@@ -398,7 +410,7 @@ class DataCollector:
|
|
398
410
|
vector_indexes=vector_indexes,
|
399
411
|
)
|
400
412
|
self._flow_builder_state.engine_flow_builder.export(
|
401
|
-
|
413
|
+
target_name,
|
402
414
|
_spec_kind(target_spec),
|
403
415
|
dump_engine_object(target_spec),
|
404
416
|
dump_engine_object(index_options),
|
@@ -660,6 +672,8 @@ class Flow:
|
|
660
672
|
def __init__(
|
661
673
|
self, name: str, full_name: str, engine_flow_creator: Callable[[], _engine.Flow]
|
662
674
|
):
|
675
|
+
validate_flow_name(name)
|
676
|
+
validate_full_flow_name(full_name)
|
663
677
|
self._name = name
|
664
678
|
self._full_name = full_name
|
665
679
|
engine_flow = None
|
@@ -831,11 +845,6 @@ def get_flow_full_name(name: str) -> str:
|
|
831
845
|
|
832
846
|
|
833
847
|
def add_flow_def(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) -> Flow:
|
834
|
-
"""Add a flow definition to the cocoindex library."""
|
835
|
-
if not all(c.isalnum() or c == "_" for c in name):
|
836
|
-
raise ValueError(
|
837
|
-
f"Flow name '{name}' contains invalid characters. Only alphanumeric characters and underscores are allowed."
|
838
|
-
)
|
839
848
|
with _flows_lock:
|
840
849
|
if name in _flows:
|
841
850
|
raise KeyError(f"Flow with name {name} already exists")
|
cocoindex/functions.py
CHANGED
@@ -66,14 +66,19 @@ class SentenceTransformerEmbed(op.FunctionSpec):
|
|
66
66
|
args: dict[str, Any] | None = None
|
67
67
|
|
68
68
|
|
69
|
-
@op.executor_class(
|
69
|
+
@op.executor_class(
|
70
|
+
gpu=True,
|
71
|
+
cache=True,
|
72
|
+
behavior_version=1,
|
73
|
+
arg_relationship=(op.ArgRelationship.EMBEDDING_ORIGIN_TEXT, "text"),
|
74
|
+
)
|
70
75
|
class SentenceTransformerEmbedExecutor:
|
71
76
|
"""Executor for SentenceTransformerEmbed."""
|
72
77
|
|
73
78
|
spec: SentenceTransformerEmbed
|
74
79
|
_model: Any | None = None
|
75
80
|
|
76
|
-
def analyze(self,
|
81
|
+
def analyze(self, _text: Any) -> type:
|
77
82
|
try:
|
78
83
|
# Only import sentence_transformers locally when it's needed, as its import is very slow.
|
79
84
|
import sentence_transformers # pylint: disable=import-outside-toplevel
|
@@ -88,11 +93,7 @@ class SentenceTransformerEmbedExecutor:
|
|
88
93
|
args = self.spec.args or {}
|
89
94
|
self._model = sentence_transformers.SentenceTransformer(self.spec.model, **args)
|
90
95
|
dim = self._model.get_sentence_embedding_dimension()
|
91
|
-
|
92
|
-
Vector[np.float32, Literal[dim]], # type: ignore
|
93
|
-
TypeAttr("cocoindex.io/vector_origin_text", text.analyzed_value),
|
94
|
-
]
|
95
|
-
return result
|
96
|
+
return Vector[np.float32, Literal[dim]] # type: ignore
|
96
97
|
|
97
98
|
def __call__(self, text: str) -> NDArray[np.float32]:
|
98
99
|
assert self._model is not None
|
cocoindex/op.py
CHANGED
@@ -6,11 +6,11 @@ import asyncio
|
|
6
6
|
import dataclasses
|
7
7
|
import inspect
|
8
8
|
from enum import Enum
|
9
|
-
from typing import Any, Awaitable, Callable, Protocol, dataclass_transform
|
9
|
+
from typing import Any, Awaitable, Callable, Protocol, dataclass_transform, Annotated
|
10
10
|
|
11
11
|
from . import _engine # type: ignore
|
12
12
|
from .convert import encode_engine_value, make_engine_value_decoder
|
13
|
-
from .typing import encode_enriched_type, resolve_forward_ref
|
13
|
+
from .typing import TypeAttr, encode_enriched_type, resolve_forward_ref
|
14
14
|
|
15
15
|
|
16
16
|
class OpCategory(Enum):
|
@@ -85,6 +85,17 @@ class _FunctionExecutorFactory:
|
|
85
85
|
_gpu_dispatch_lock = asyncio.Lock()
|
86
86
|
|
87
87
|
|
88
|
+
# Common prefix of the attribute names used as ArgRelationship values.
_COCOINDEX_ATTR_PREFIX = "cocoindex.io/"


class ArgRelationship(Enum):
    """Specifies the relationship between an input argument and the output.

    Each member's value is an attribute name starting with
    ``_COCOINDEX_ATTR_PREFIX``.
    """

    # NOTE(review): the value says "chunk_base_text" (singular) while the
    # member is CHUNKS_BASE_TEXT; presumably intentional, confirm before
    # changing.
    EMBEDDING_ORIGIN_TEXT = _COCOINDEX_ATTR_PREFIX + "embedding_origin_text"
    CHUNKS_BASE_TEXT = _COCOINDEX_ATTR_PREFIX + "chunk_base_text"
    RECTS_BASE_IMAGE = _COCOINDEX_ATTR_PREFIX + "rects_base_image"
|
97
|
+
|
98
|
+
|
88
99
|
@dataclasses.dataclass
|
89
100
|
class OpArgs:
|
90
101
|
"""
|
@@ -92,11 +103,15 @@ class OpArgs:
|
|
92
103
|
- cache: Whether the executor will be cached.
|
93
104
|
- behavior_version: The behavior version of the executor. Cache will be invalidated if it
|
94
105
|
changes. Must be provided if `cache` is True.
|
106
|
+
- arg_relationship: It specifies the relationship between an input argument and the output,
|
107
|
+
e.g. `(ArgRelationship.CHUNKS_BASE_TEXT, "content")` means the output is chunks for the
|
108
|
+
input argument with name `content`.
|
95
109
|
"""
|
96
110
|
|
97
111
|
gpu: bool = False
|
98
112
|
cache: bool = False
|
99
113
|
behavior_version: int | None = None
|
114
|
+
arg_relationship: tuple[ArgRelationship, str] | None = None
|
100
115
|
|
101
116
|
|
102
117
|
def _to_async_call(call: Callable[..., Any]) -> Callable[..., Awaitable[Any]]:
|
@@ -143,6 +158,15 @@ def _register_op_factory(
|
|
143
158
|
"""
|
144
159
|
self._args_decoders = []
|
145
160
|
self._kwargs_decoders = {}
|
161
|
+
attributes = []
|
162
|
+
|
163
|
+
def process_attribute(arg_name: str, arg: _engine.OpArgSchema) -> None:
|
164
|
+
if op_args.arg_relationship is not None:
|
165
|
+
related_attr, related_arg_name = op_args.arg_relationship
|
166
|
+
if related_arg_name == arg_name:
|
167
|
+
attributes.append(
|
168
|
+
TypeAttr(related_attr.value, arg.analyzed_value)
|
169
|
+
)
|
146
170
|
|
147
171
|
# Match arguments with parameters.
|
148
172
|
next_param_idx = 0
|
@@ -164,6 +188,7 @@ def _register_op_factory(
|
|
164
188
|
[arg_name], arg.value_type["type"], arg_param.annotation
|
165
189
|
)
|
166
190
|
)
|
191
|
+
process_attribute(arg_name, arg)
|
167
192
|
if arg_param.kind != inspect.Parameter.VAR_POSITIONAL:
|
168
193
|
next_param_idx += 1
|
169
194
|
|
@@ -194,6 +219,7 @@ def _register_op_factory(
|
|
194
219
|
self._kwargs_decoders[kwarg_name] = make_engine_value_decoder(
|
195
220
|
[kwarg_name], kwarg.value_type["type"], arg_param.annotation
|
196
221
|
)
|
222
|
+
process_attribute(kwarg_name, kwarg)
|
197
223
|
|
198
224
|
missing_args = [
|
199
225
|
name
|
@@ -216,9 +242,12 @@ def _register_op_factory(
|
|
216
242
|
|
217
243
|
prepare_method = getattr(executor_cls, "analyze", None)
|
218
244
|
if prepare_method is not None:
|
219
|
-
|
245
|
+
result = prepare_method(self, *args, **kwargs)
|
220
246
|
else:
|
221
|
-
|
247
|
+
result = expected_return
|
248
|
+
if len(attributes) > 0:
|
249
|
+
result = Annotated[result, *attributes]
|
250
|
+
return result
|
222
251
|
|
223
252
|
async def prepare(self) -> None:
|
224
253
|
"""
|
cocoindex/setting.py
CHANGED
@@ -6,6 +6,7 @@ import os
|
|
6
6
|
|
7
7
|
from typing import Callable, Self, Any, overload
|
8
8
|
from dataclasses import dataclass
|
9
|
+
from .validation import validate_app_namespace_name
|
9
10
|
|
10
11
|
_app_namespace: str = ""
|
11
12
|
|
@@ -27,6 +28,8 @@ def split_app_namespace(full_name: str, delimiter: str) -> tuple[str, str]:
|
|
27
28
|
|
28
29
|
def set_app_namespace(app_namespace: str) -> None:
    """Set the application namespace.

    An empty namespace is stored without validation; any other value must
    pass ``validate_app_namespace_name`` before it is stored.
    """
    global _app_namespace  # pylint: disable=global-statement
    if app_namespace:
        validate_app_namespace_name(app_namespace)
    _app_namespace = app_namespace
|
32
35
|
|
cocoindex/tests/test_convert.py
CHANGED
@@ -1341,3 +1341,130 @@ def test_roundtrip_namedtuple_to_dict_binding() -> None:
|
|
1341
1341
|
validate_full_roundtrip(
|
1342
1342
|
instance, Point, (expected_dict, dict), (expected_dict, Any)
|
1343
1343
|
)
|
1344
|
+
|
1345
|
+
|
1346
|
+
def test_roundtrip_ltable_to_list_dict_binding() -> None:
    """Test LTable -> list[dict] binding with Any annotation."""

    @dataclass
    class User:
        id: str
        name: str
        age: int

    users = [User("u1", "Alice", 25), User("u2", "Bob", 30), User("u3", "Charlie", 35)]
    # One dict per row, keyed by field name, in the original row order.
    expected_list_dict = [
        {"id": "u1", "name": "Alice", "age": 25},
        {"id": "u2", "name": "Bob", "age": 30},
        {"id": "u3", "name": "Charlie", "age": 35},
    ]

    # Test Any annotation
    validate_full_roundtrip(users, list[User], (expected_list_dict, Any))
|
1364
|
+
|
1365
|
+
|
1366
|
+
def test_roundtrip_ktable_to_dict_dict_binding() -> None:
    """Test KTable -> dict[K, dict] binding with Any annotation."""

    @dataclass
    class Product:
        name: str
        price: float
        active: bool

    products = {
        "p1": Product("Widget", 29.99, True),
        "p2": Product("Gadget", 49.99, False),
        "p3": Product("Tool", 19.99, True),
    }
    # Keys are kept as-is; each value becomes a dict of its fields.
    expected_dict_dict = {
        "p1": {"name": "Widget", "price": 29.99, "active": True},
        "p2": {"name": "Gadget", "price": 49.99, "active": False},
        "p3": {"name": "Tool", "price": 19.99, "active": True},
    }

    # Test Any annotation
    validate_full_roundtrip(products, dict[str, Product], (expected_dict_dict, Any))
|
1388
|
+
|
1389
|
+
|
1390
|
+
def test_roundtrip_ktable_with_complex_key() -> None:
    """Test KTable with complex key types -> dict binding."""

    # Frozen so that instances are hashable and usable as dict keys.
    @dataclass(frozen=True)
    class OrderKey:
        shop_id: str
        version: int

    @dataclass
    class Order:
        customer: str
        total: float

    orders = {
        OrderKey("shop1", 1): Order("Alice", 100.0),
        OrderKey("shop2", 2): Order("Bob", 200.0),
    }
    # A struct key is expected back as a tuple of its field values.
    expected_dict_dict = {
        ("shop1", 1): {"customer": "Alice", "total": 100.0},
        ("shop2", 2): {"customer": "Bob", "total": 200.0},
    }

    # Test Any annotation
    validate_full_roundtrip(orders, dict[OrderKey, Order], (expected_dict_dict, Any))
|
1414
|
+
|
1415
|
+
|
1416
|
+
def test_roundtrip_ltable_with_nested_structs() -> None:
    """Test LTable with nested structs -> list[dict] binding."""

    @dataclass
    class Address:
        street: str
        city: str

    @dataclass
    class Person:
        name: str
        age: int
        address: Address

    people = [
        Person("John", 30, Address("123 Main St", "Anytown")),
        Person("Jane", 25, Address("456 Oak Ave", "Somewhere")),
    ]
    # Nested structs are expected as nested dicts.
    expected_list_dict = [
        {
            "name": "John",
            "age": 30,
            "address": {"street": "123 Main St", "city": "Anytown"},
        },
        {
            "name": "Jane",
            "age": 25,
            "address": {"street": "456 Oak Ave", "city": "Somewhere"},
        },
    ]

    # Test Any annotation
    validate_full_roundtrip(people, list[Person], (expected_list_dict, Any))
|
1449
|
+
|
1450
|
+
|
1451
|
+
def test_roundtrip_ktable_with_list_fields() -> None:
    """Test KTable with list fields -> dict binding."""

    @dataclass
    class Team:
        name: str
        members: list[str]
        active: bool

    teams = {
        "team1": Team("Dev Team", ["Alice", "Bob"], True),
        "team2": Team("QA Team", ["Charlie", "David"], False),
    }
    # List-valued fields are expected to come back as plain lists.
    expected_dict_dict = {
        "team1": {"name": "Dev Team", "members": ["Alice", "Bob"], "active": True},
        "team2": {"name": "QA Team", "members": ["Charlie", "David"], "active": False},
    }

    # Test Any annotation
    validate_full_roundtrip(teams, dict[str, Team], (expected_dict_dict, Any))
|
@@ -0,0 +1,134 @@
|
|
1
|
+
"""Tests for naming validation functionality."""
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
from cocoindex.validation import (
|
5
|
+
validate_field_name,
|
6
|
+
validate_flow_name,
|
7
|
+
validate_full_flow_name,
|
8
|
+
validate_app_namespace_name,
|
9
|
+
validate_target_name,
|
10
|
+
NamingError,
|
11
|
+
validate_identifier_name,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
class TestValidateIdentifierName:
    """Test the core validation function."""

    def test_valid_names(self) -> None:
        """Test that valid names pass validation."""
        valid_names = [
            "field1",
            "field_name",
            "_private",
            "a",
            "field123",
            "FIELD_NAME",
            "MyField",
            "field_123_test",
        ]

        for name in valid_names:
            result = validate_identifier_name(name)
            assert result is None, f"Valid name '{name}' failed validation: {result}"

    def test_valid_names_with_dots(self) -> None:
        """Test that valid names with dots pass validation when allowed."""
        valid_names = ["app.flow", "my_app.my_flow", "namespace.sub.flow", "a.b.c.d"]

        for name in valid_names:
            result = validate_identifier_name(name, allow_dots=True)
            assert result is None, (
                f"Valid dotted name '{name}' failed validation: {result}"
            )

    def test_invalid_starting_characters(self) -> None:
        """Test names with invalid starting characters."""
        invalid_names = [
            "123field",  # starts with digit
            ".field",  # starts with dot
            "-field",  # starts with dash
            " field",  # starts with space
        ]

        for name in invalid_names:
            result = validate_identifier_name(name)
            assert result is not None, (
                f"Invalid name '{name}' should have failed validation"
            )

    def test_double_underscore_restriction(self) -> None:
        """Test double underscore restriction."""
        invalid_names = ["__reserved", "__internal", "__test"]

        for name in invalid_names:
            result = validate_identifier_name(name)
            assert result is not None
            assert "double underscores" in result.lower()

    def test_length_restriction(self) -> None:
        """Test maximum length restriction."""
        # One character over the limit passed below.
        long_name = "a" * 65
        result = validate_identifier_name(long_name, max_length=64)
        assert result is not None
        assert "maximum length" in result.lower()
|
75
|
+
|
76
|
+
|
77
|
+
class TestSpecificValidators:
    """Test the specific validation functions."""

    def test_valid_field_names(self) -> None:
        """Test valid field names."""
        valid_names = ["field1", "field_name", "_private", "FIELD"]
        for name in valid_names:
            validate_field_name(name)  # Should not raise

    def test_invalid_field_names(self) -> None:
        """Test invalid field names raise NamingError."""
        invalid_names = ["123field", "field-name", "__reserved", "a" * 65]

        for name in invalid_names:
            with pytest.raises(NamingError):
                validate_field_name(name)

    def test_flow_validation(self) -> None:
        """Test flow name validation."""
        # Valid flow names
        validate_flow_name("MyFlow")
        validate_flow_name("my_flow_123")

        # Invalid flow names
        with pytest.raises(NamingError):
            validate_flow_name("123flow")

        with pytest.raises(NamingError):
            validate_flow_name("__reserved_flow")

    def test_full_flow_name_allows_dots(self) -> None:
        """Test that full flow names allow dots."""
        validate_full_flow_name("app.my_flow")
        validate_full_flow_name("namespace.subnamespace.flow")

        # But still reject invalid patterns
        with pytest.raises(NamingError):
            validate_full_flow_name("123.invalid")

    def test_target_validation(self) -> None:
        """Test target name validation."""
        validate_target_name("my_target")
        validate_target_name("output_table")

        with pytest.raises(NamingError):
            validate_target_name("123target")

    def test_app_namespace_validation(self) -> None:
        """Test app namespace validation."""
        validate_app_namespace_name("myapp")
        validate_app_namespace_name("my_app_123")

        # Should not allow dots in app namespace
        with pytest.raises(NamingError):
            validate_app_namespace_name("my.app")

        with pytest.raises(NamingError):
            validate_app_namespace_name("123app")
|
cocoindex/validation.py
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
"""
|
2
|
+
Naming validation for CocoIndex identifiers.
|
3
|
+
|
4
|
+
This module enforces naming conventions for flow names, field names,
|
5
|
+
target names, and app namespace names as specified in issue #779.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Optional
|
10
|
+
|
11
|
+
_IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
_IDENTIFIER_WITH_DOTS_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_.]*$")


class NamingError(ValueError):
    """Exception raised for naming convention violations."""


def validate_identifier_name(
    name: str,
    max_length: int = 64,
    allow_dots: bool = False,
    identifier_type: str = "identifier",
) -> Optional[str]:
    """
    Validate identifier names according to CocoIndex naming rules.

    The checks run in this order, and the first failure is reported: the name
    must be non-empty, at most `max_length` characters long, must not start
    with a double underscore, and must start with a letter or underscore
    followed only by letters, digits and underscores (plus dots when
    `allow_dots` is true).

    Args:
        name: The name to validate
        max_length: Maximum allowed length (default 64)
        allow_dots: Whether to allow dots in the name (for full flow names)
        identifier_type: Type of identifier for error messages

    Returns:
        None if valid, error message string if invalid
    """
    if not name:
        return f"{identifier_type} name cannot be empty"

    if len(name) > max_length:
        return f"{identifier_type} name '{name}' exceeds maximum length of {max_length} characters"

    if name.startswith("__"):
        return f"{identifier_type} name '{name}' cannot start with double underscores (reserved for internal usage)"

    # Define allowed pattern
    if allow_dots:
        pattern = _IDENTIFIER_WITH_DOTS_PATTERN
        allowed_chars = "letters, digits, underscores, and dots"
    else:
        pattern = _IDENTIFIER_PATTERN
        allowed_chars = "letters, digits, and underscores"

    # fullmatch, not match: `$` also matches just before a trailing newline,
    # so match() would accept names such as "field\n".
    if not pattern.fullmatch(name):
        return f"{identifier_type} name '{name}' must start with a letter or underscore and contain only {allowed_chars}"

    return None
|
60
|
+
|
61
|
+
|
62
|
+
def validate_field_name(name: str) -> None:
    """Validate field names.

    Raises:
        NamingError: If `validate_identifier_name` reports an error for
            `name` (dots not allowed, at most 64 characters).
    """
    error = validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="Field"
    )
    if error:
        raise NamingError(error)
|
69
|
+
|
70
|
+
|
71
|
+
def validate_flow_name(name: str) -> None:
    """Validate flow names.

    Raises:
        NamingError: If `validate_identifier_name` reports an error for
            `name` (dots not allowed, at most 64 characters).
    """
    error = validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="Flow"
    )
    if error:
        raise NamingError(error)
|
78
|
+
|
79
|
+
|
80
|
+
def validate_full_flow_name(name: str) -> None:
    """Validate full flow names (can contain dots for namespacing).

    Raises:
        NamingError: If `validate_identifier_name` reports an error for
            `name` (dots allowed, at most 64 characters).
    """
    error = validate_identifier_name(
        name, max_length=64, allow_dots=True, identifier_type="Full flow"
    )
    if error:
        raise NamingError(error)
|
87
|
+
|
88
|
+
|
89
|
+
def validate_app_namespace_name(name: str) -> None:
    """Validate app namespace names.

    Raises:
        NamingError: If `validate_identifier_name` reports an error for
            `name` (dots not allowed, at most 64 characters).
    """
    error = validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="App namespace"
    )
    if error:
        raise NamingError(error)
|
96
|
+
|
97
|
+
|
98
|
+
def validate_target_name(name: str) -> None:
    """Validate target names.

    Raises:
        NamingError: If `validate_identifier_name` reports an error for
            `name` (dots not allowed, at most 64 characters).
    """
    error = validate_identifier_name(
        name, max_length=64, allow_dots=False, identifier_type="Target"
    )
    if error:
        raise NamingError(error)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cocoindex
|
3
|
-
Version: 0.1.70
|
3
|
+
Version: 0.1.72
|
4
4
|
Requires-Dist: click>=8.1.8
|
5
5
|
Requires-Dist: rich>=14.0.0
|
6
6
|
Requires-Dist: python-dotenv>=1.1.0
|
@@ -52,18 +52,18 @@ Ultra performant data transformation framework for AI, with core engine written
|
|
52
52
|
⭐ Drop a star to help us grow!
|
53
53
|
|
54
54
|
<div align="center">
|
55
|
-
|
55
|
+
|
56
56
|
<!-- Keep these links. Translations will automatically update with the README. -->
|
57
|
-
[Deutsch](https://readme-i18n.com/cocoindex-io/cocoindex?lang=de) |
|
58
|
-
[English](https://readme-i18n.com/cocoindex-io/cocoindex?lang=en) |
|
59
|
-
[Español](https://readme-i18n.com/cocoindex-io/cocoindex?lang=es) |
|
60
|
-
[français](https://readme-i18n.com/cocoindex-io/cocoindex?lang=fr) |
|
61
|
-
[日本語](https://readme-i18n.com/cocoindex-io/cocoindex?lang=ja) |
|
62
|
-
[한국어](https://readme-i18n.com/cocoindex-io/cocoindex?lang=ko) |
|
63
|
-
[Português](https://readme-i18n.com/cocoindex-io/cocoindex?lang=pt) |
|
64
|
-
[Русский](https://readme-i18n.com/cocoindex-io/cocoindex?lang=ru) |
|
57
|
+
[Deutsch](https://readme-i18n.com/cocoindex-io/cocoindex?lang=de) |
|
58
|
+
[English](https://readme-i18n.com/cocoindex-io/cocoindex?lang=en) |
|
59
|
+
[Español](https://readme-i18n.com/cocoindex-io/cocoindex?lang=es) |
|
60
|
+
[français](https://readme-i18n.com/cocoindex-io/cocoindex?lang=fr) |
|
61
|
+
[日本語](https://readme-i18n.com/cocoindex-io/cocoindex?lang=ja) |
|
62
|
+
[한국어](https://readme-i18n.com/cocoindex-io/cocoindex?lang=ko) |
|
63
|
+
[Português](https://readme-i18n.com/cocoindex-io/cocoindex?lang=pt) |
|
64
|
+
[Русский](https://readme-i18n.com/cocoindex-io/cocoindex?lang=ru) |
|
65
65
|
[中文](https://readme-i18n.com/cocoindex-io/cocoindex?lang=zh)
|
66
|
-
|
66
|
+
|
67
67
|
</div>
|
68
68
|
|
69
69
|
</br>
|
@@ -208,6 +208,7 @@ It defines an index flow like this:
|
|
208
208
|
| [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup |
|
209
209
|
| [Product Recommendation](examples/product_recommendation) | Build real-time product recommendations with LLM and graph database|
|
210
210
|
| [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend|
|
211
|
+
| [Face Recognition](examples/face_recognition) | Recognize faces in images and build embedding index |
|
211
212
|
| [Paper Metadata](examples/paper_metadata) | Index papers in PDF files, and build metadata tables for each paper |
|
212
213
|
|
213
214
|
More coming and stay tuned 👀!
|
@@ -1,28 +1,30 @@
|
|
1
|
-
cocoindex-0.1.
|
2
|
-
cocoindex-0.1.
|
3
|
-
cocoindex-0.1.
|
4
|
-
cocoindex-0.1.
|
5
|
-
cocoindex/__init__.py,sha256=
|
6
|
-
cocoindex/_engine.cpython-313t-aarch64-linux-gnu.so,sha256=
|
1
|
+
cocoindex-0.1.72.dist-info/METADATA,sha256=ztXY--5U2oGAPvJlh-fGpYTFcrhoAg3AhaOX78sZACI,11304
|
2
|
+
cocoindex-0.1.72.dist-info/WHEEL,sha256=b5lwx5EheF_JFEgLjsRcIlINdne5QlxE_VF8TC2mALE,110
|
3
|
+
cocoindex-0.1.72.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
|
4
|
+
cocoindex-0.1.72.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
5
|
+
cocoindex/__init__.py,sha256=kfTgbh2haepo7kIbzJqfxU6Kx7wPol5_t1SYF2x6cBM,2114
|
6
|
+
cocoindex/_engine.cpython-313t-aarch64-linux-gnu.so,sha256=9_Dmgo_IQBcA7TFJFoBsgT7WCL2LCDAWOd11lFtjcW8,68452440
|
7
7
|
cocoindex/auth_registry.py,sha256=PE1-kVkcyC1G2C_V7b1kvYzeq73OFQehWKQP7ln7fJ8,1478
|
8
8
|
cocoindex/cli.py,sha256=-gp639JSyQN6YjnhGqCakIzYoSSqXxQMbxbkcYGP0QY,22359
|
9
|
-
cocoindex/convert.py,sha256=
|
10
|
-
cocoindex/flow.py,sha256=
|
11
|
-
cocoindex/functions.py,sha256=
|
9
|
+
cocoindex/convert.py,sha256=fOzfbMlQ8WQ_nAv8WpX-EEHdZdBV8QXV3qIe1_Ird_U,15806
|
10
|
+
cocoindex/flow.py,sha256=HN24rsihO3BkSYGnTtxgovgka2IobxhFuLmDlqw3fAk,36127
|
11
|
+
cocoindex/functions.py,sha256=LLu_ausirvqnsx_k3euZpv8sLCpBZ4DF77h2HOzbinE,3109
|
12
12
|
cocoindex/index.py,sha256=j93B9jEvvLXHtpzKWL88SY6wCGEoPgpsQhEGHlyYGFg,540
|
13
13
|
cocoindex/lib.py,sha256=f--9dAYd84CZosbDZqNW0oGbBLsY3dXiUTR1VrfQ_QY,817
|
14
14
|
cocoindex/llm.py,sha256=WxmWUbNcf9HOCM5xkbDeFs9lF67M3mr810B7deDDc-8,673
|
15
|
-
cocoindex/op.py,sha256=
|
15
|
+
cocoindex/op.py,sha256=Afi5CfgU3wPQoPPKFb2WUYCVLmCPhBuK-2NT1AzC2zU,13161
|
16
16
|
cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
cocoindex/runtime.py,sha256=povilB3HH3y1JF-yxKwU-pD8n2WnAqyQxIgvXXHNc60,1080
|
18
|
-
cocoindex/setting.py,sha256=
|
18
|
+
cocoindex/setting.py,sha256=TwhQ6pEeZmvc8ZXlnT9d8Wn8Vz_u7Z5LJUkGsKmKSno,4859
|
19
19
|
cocoindex/setup.py,sha256=7uIHKN4FOCuoidPXcKyGTrkqpkl9luL49-6UcnMxYzw,3068
|
20
20
|
cocoindex/sources.py,sha256=69COA4qbZDipzGYfXv-WJSmicFkA509xIShRGDh6A0A,2083
|
21
21
|
cocoindex/targets.py,sha256=Nfh_tpFd1goTnS_cxBjIs4j9zl3Z4Z1JomAQ1dl3Sic,2796
|
22
22
|
cocoindex/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
cocoindex/tests/test_convert.py,sha256=
|
23
|
+
cocoindex/tests/test_convert.py,sha256=CnPDAK8QdzWTS9II-prbwIHeiq5htvRFhkfR8YdUE10,48960
|
24
24
|
cocoindex/tests/test_optional_database.py,sha256=snAmkNa6wtOSaxoZE1HgjvL5v_ylitt3Jt_9df4Cgdc,8506
|
25
25
|
cocoindex/tests/test_typing.py,sha256=NB4nUzoumOF_wGFa4D2Xf6d0bUVtOiSXyb78M1pYSG4,14827
|
26
|
+
cocoindex/tests/test_validation.py,sha256=X6AQzVs-hVKIXcrHMEMQnhfUE8at7iXQnPq8nHNhZ2Q,4543
|
26
27
|
cocoindex/typing.py,sha256=MO9HkrNpargvMPvpkd7jgSu2R-21KE_NaB9-WI4YOZA,13241
|
27
28
|
cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
|
28
|
-
cocoindex
|
29
|
+
cocoindex/validation.py,sha256=PZnJoby4sLbsmPv9fOjOQXuefjfZ7gmtsiTGU8SH-tc,3090
|
30
|
+
cocoindex-0.1.72.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|