cocoindex 0.1.64__pp311-pypy311_pp73-manylinux_2_28_aarch64.whl → 0.1.66__pp311-pypy311_pp73-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +4 -0
- cocoindex/_engine.pypy311-pp73-aarch64-linux-gnu.so +0 -0
- cocoindex/convert.py +47 -1
- cocoindex/flow.py +29 -9
- cocoindex/functions.py +1 -0
- cocoindex/llm.py +12 -0
- cocoindex/setting.py +8 -8
- cocoindex/sources.py +15 -0
- cocoindex/tests/test_convert.py +112 -0
- cocoindex/tests/test_typing.py +4 -1
- cocoindex/typing.py +13 -4
- {cocoindex-0.1.64.dist-info → cocoindex-0.1.66.dist-info}/METADATA +42 -18
- cocoindex-0.1.66.dist-info/RECORD +28 -0
- cocoindex-0.1.64.dist-info/RECORD +0 -28
- {cocoindex-0.1.64.dist-info → cocoindex-0.1.66.dist-info}/WHEEL +0 -0
- {cocoindex-0.1.64.dist-info → cocoindex-0.1.66.dist-info}/entry_points.txt +0 -0
- {cocoindex-0.1.64.dist-info → cocoindex-0.1.66.dist-info}/licenses/LICENSE +0 -0
cocoindex/__init__.py
CHANGED
@@ -11,6 +11,7 @@ from .flow import FlowBuilder, DataScope, DataSlice, Flow, transform_flow
|
|
11
11
|
from .flow import flow_def
|
12
12
|
from .flow import EvaluateAndDumpOptions, GeneratedField
|
13
13
|
from .flow import FlowLiveUpdater, FlowLiveUpdaterOptions
|
14
|
+
from .flow import add_flow_def, remove_flow
|
14
15
|
from .flow import update_all_flows_async, setup_all_flows, drop_all_flows
|
15
16
|
from .lib import init, start_server, stop
|
16
17
|
from .llm import LlmSpec, LlmApiType
|
@@ -32,6 +33,7 @@ __all__ = [
|
|
32
33
|
# Submodules
|
33
34
|
"_engine",
|
34
35
|
"functions",
|
36
|
+
"llm",
|
35
37
|
"sources",
|
36
38
|
"targets",
|
37
39
|
"storages",
|
@@ -52,6 +54,8 @@ __all__ = [
|
|
52
54
|
"GeneratedField",
|
53
55
|
"FlowLiveUpdater",
|
54
56
|
"FlowLiveUpdaterOptions",
|
57
|
+
"add_flow_def",
|
58
|
+
"remove_flow",
|
55
59
|
"update_all_flows_async",
|
56
60
|
"setup_all_flows",
|
57
61
|
"drop_all_flows",
|
Binary file
|
cocoindex/convert.py
CHANGED
@@ -89,13 +89,26 @@ def make_engine_value_decoder(
|
|
89
89
|
if dst_is_any:
|
90
90
|
if src_type_kind == "Union":
|
91
91
|
return lambda value: value[1]
|
92
|
-
if src_type_kind == "Struct"
|
92
|
+
if src_type_kind == "Struct":
|
93
|
+
return _make_engine_struct_to_dict_decoder(field_path, src_type["fields"])
|
94
|
+
if src_type_kind in TABLE_TYPES:
|
93
95
|
raise ValueError(
|
94
96
|
f"Missing type annotation for `{''.join(field_path)}`."
|
95
97
|
f"It's required for {src_type_kind} type."
|
96
98
|
)
|
97
99
|
return lambda value: value
|
98
100
|
|
101
|
+
# Handle struct -> dict binding for explicit dict annotations
|
102
|
+
is_dict_annotation = False
|
103
|
+
if dst_annotation is dict:
|
104
|
+
is_dict_annotation = True
|
105
|
+
elif getattr(dst_annotation, "__origin__", None) is dict:
|
106
|
+
args = getattr(dst_annotation, "__args__", ())
|
107
|
+
if args == (str, Any):
|
108
|
+
is_dict_annotation = True
|
109
|
+
if is_dict_annotation and src_type_kind == "Struct":
|
110
|
+
return _make_engine_struct_to_dict_decoder(field_path, src_type["fields"])
|
111
|
+
|
99
112
|
dst_type_info = analyze_type_info(dst_annotation)
|
100
113
|
|
101
114
|
if src_type_kind == "Union":
|
@@ -294,6 +307,39 @@ def _make_engine_struct_value_decoder(
|
|
294
307
|
)
|
295
308
|
|
296
309
|
|
310
|
+
def _make_engine_struct_to_dict_decoder(
|
311
|
+
field_path: list[str],
|
312
|
+
src_fields: list[dict[str, Any]],
|
313
|
+
) -> Callable[[list[Any] | None], dict[str, Any] | None]:
|
314
|
+
"""Make a decoder from engine field values to a Python dict."""
|
315
|
+
|
316
|
+
field_decoders = []
|
317
|
+
for i, field_schema in enumerate(src_fields):
|
318
|
+
field_name = field_schema["name"]
|
319
|
+
field_path.append(f".{field_name}")
|
320
|
+
field_decoder = make_engine_value_decoder(
|
321
|
+
field_path,
|
322
|
+
field_schema["type"],
|
323
|
+
Any, # Use Any for recursive decoding
|
324
|
+
)
|
325
|
+
field_path.pop()
|
326
|
+
field_decoders.append((field_name, field_decoder))
|
327
|
+
|
328
|
+
def decode_to_dict(values: list[Any] | None) -> dict[str, Any] | None:
|
329
|
+
if values is None:
|
330
|
+
return None
|
331
|
+
if len(field_decoders) != len(values):
|
332
|
+
raise ValueError(
|
333
|
+
f"Field count mismatch: expected {len(field_decoders)}, got {len(values)}"
|
334
|
+
)
|
335
|
+
return {
|
336
|
+
field_name: field_decoder(value)
|
337
|
+
for value, (field_name, field_decoder) in zip(values, field_decoders)
|
338
|
+
}
|
339
|
+
|
340
|
+
return decode_to_dict
|
341
|
+
|
342
|
+
|
297
343
|
def dump_engine_object(v: Any) -> Any:
|
298
344
|
"""Recursively dump an object for engine. Engine side uses `Pythonized` to catch."""
|
299
345
|
if v is None:
|
cocoindex/flow.py
CHANGED
@@ -624,7 +624,7 @@ class Flow:
|
|
624
624
|
|
625
625
|
_name: str
|
626
626
|
_full_name: str
|
627
|
-
_lazy_engine_flow: Callable[[], _engine.Flow]
|
627
|
+
_lazy_engine_flow: Callable[[], _engine.Flow] | None
|
628
628
|
|
629
629
|
def __init__(
|
630
630
|
self, name: str, full_name: str, engine_flow_creator: Callable[[], _engine.Flow]
|
@@ -664,18 +664,18 @@ class Flow:
|
|
664
664
|
return tree
|
665
665
|
|
666
666
|
def _get_spec(self, verbose: bool = False) -> _engine.RenderedSpec:
|
667
|
-
return self.
|
667
|
+
return self.internal_flow().get_spec(
|
668
668
|
output_mode="verbose" if verbose else "concise"
|
669
669
|
)
|
670
670
|
|
671
671
|
def _get_schema(self) -> list[tuple[str, str, str]]:
|
672
|
-
return cast(list[tuple[str, str, str]], self.
|
672
|
+
return cast(list[tuple[str, str, str]], self.internal_flow().get_schema())
|
673
673
|
|
674
674
|
def __str__(self) -> str:
|
675
675
|
return str(self._get_spec())
|
676
676
|
|
677
677
|
def __repr__(self) -> str:
|
678
|
-
return repr(self.
|
678
|
+
return repr(self.internal_flow())
|
679
679
|
|
680
680
|
@property
|
681
681
|
def name(self) -> str:
|
@@ -715,12 +715,14 @@ class Flow:
|
|
715
715
|
"""
|
716
716
|
Evaluate the flow and dump flow outputs to files.
|
717
717
|
"""
|
718
|
-
return self.
|
718
|
+
return self.internal_flow().evaluate_and_dump(dump_engine_object(options))
|
719
719
|
|
720
720
|
def internal_flow(self) -> _engine.Flow:
|
721
721
|
"""
|
722
722
|
Get the engine flow.
|
723
723
|
"""
|
724
|
+
if self._lazy_engine_flow is None:
|
725
|
+
raise RuntimeError(f"Flow {self.full_name} is already removed")
|
724
726
|
return self._lazy_engine_flow()
|
725
727
|
|
726
728
|
async def internal_flow_async(self) -> _engine.Flow:
|
@@ -731,13 +733,13 @@ class Flow:
|
|
731
733
|
|
732
734
|
def setup(self, report_to_stdout: bool = False) -> None:
|
733
735
|
"""
|
734
|
-
Setup the flow.
|
736
|
+
Setup persistent backends of the flow.
|
735
737
|
"""
|
736
738
|
execution_context.run(self.setup_async(report_to_stdout=report_to_stdout))
|
737
739
|
|
738
740
|
async def setup_async(self, report_to_stdout: bool = False) -> None:
|
739
741
|
"""
|
740
|
-
Setup the flow. The async version.
|
742
|
+
Setup persistent backends of the flow. The async version.
|
741
743
|
"""
|
742
744
|
await make_setup_bundle([self]).describe_and_apply_async(
|
743
745
|
report_to_stdout=report_to_stdout
|
@@ -745,13 +747,18 @@ class Flow:
|
|
745
747
|
|
746
748
|
def drop(self, report_to_stdout: bool = False) -> None:
|
747
749
|
"""
|
748
|
-
Drop the flow.
|
750
|
+
Drop persistent backends of the flow.
|
751
|
+
|
752
|
+
The current instance is still valid after it's called.
|
753
|
+
For example, you can still call `setup()` after it, to setup the persistent backends again.
|
754
|
+
|
755
|
+
Call `cocoindex.remove_flow()` if you want to remove the flow from the current process.
|
749
756
|
"""
|
750
757
|
execution_context.run(self.drop_async(report_to_stdout=report_to_stdout))
|
751
758
|
|
752
759
|
async def drop_async(self, report_to_stdout: bool = False) -> None:
|
753
760
|
"""
|
754
|
-
Drop the flow. The async version.
|
761
|
+
Drop persistent backends of the flow. The async version.
|
755
762
|
"""
|
756
763
|
await make_drop_bundle([self]).describe_and_apply_async(
|
757
764
|
report_to_stdout=report_to_stdout
|
@@ -805,6 +812,19 @@ def add_flow_def(name: str, fl_def: Callable[[FlowBuilder, DataScope], None]) ->
|
|
805
812
|
return fl
|
806
813
|
|
807
814
|
|
815
|
+
def remove_flow(fl: Flow) -> None:
|
816
|
+
"""
|
817
|
+
Remove a flow from the current process to free up resources.
|
818
|
+
After it's called, methods of the flow should no longer be called.
|
819
|
+
|
820
|
+
This will NOT touch the persistent backends of the flow.
|
821
|
+
"""
|
822
|
+
_engine.remove_flow_context(fl.full_name)
|
823
|
+
fl._lazy_engine_flow = None # pylint: disable=protected-access
|
824
|
+
with _flows_lock:
|
825
|
+
del _flows[fl.name]
|
826
|
+
|
827
|
+
|
808
828
|
def flow_def(
|
809
829
|
name: str | None = None,
|
810
830
|
) -> Callable[[Callable[[FlowBuilder, DataScope], None]], Flow]:
|
cocoindex/functions.py
CHANGED
cocoindex/llm.py
CHANGED
@@ -8,6 +8,7 @@ class LlmApiType(Enum):
|
|
8
8
|
OPENAI = "OpenAi"
|
9
9
|
OLLAMA = "Ollama"
|
10
10
|
GEMINI = "Gemini"
|
11
|
+
VERTEX_AI = "VertexAi"
|
11
12
|
ANTHROPIC = "Anthropic"
|
12
13
|
LITE_LLM = "LiteLlm"
|
13
14
|
OPEN_ROUTER = "OpenRouter"
|
@@ -15,6 +16,16 @@ class LlmApiType(Enum):
|
|
15
16
|
VLLM = "Vllm"
|
16
17
|
|
17
18
|
|
19
|
+
@dataclass
|
20
|
+
class VertexAiConfig:
|
21
|
+
"""A specification for a Vertex AI LLM."""
|
22
|
+
|
23
|
+
kind = "VertexAi"
|
24
|
+
|
25
|
+
project: str
|
26
|
+
region: str | None = None
|
27
|
+
|
28
|
+
|
18
29
|
@dataclass
|
19
30
|
class LlmSpec:
|
20
31
|
"""A specification for a LLM."""
|
@@ -22,3 +33,4 @@ class LlmSpec:
|
|
22
33
|
api_type: LlmApiType
|
23
34
|
model: str
|
24
35
|
address: str | None = None
|
36
|
+
api_config: VertexAiConfig | None = None
|
cocoindex/setting.py
CHANGED
@@ -44,12 +44,12 @@ class DatabaseConnectionSpec:
|
|
44
44
|
|
45
45
|
|
46
46
|
@dataclass
|
47
|
-
class
|
48
|
-
"""
|
47
|
+
class GlobalExecutionOptions:
|
48
|
+
"""Global execution options."""
|
49
49
|
|
50
|
-
# The maximum number of concurrent inflight requests.
|
51
|
-
source_max_inflight_rows: int | None =
|
52
|
-
source_max_inflight_bytes: int | None =
|
50
|
+
# The maximum number of concurrent inflight requests, shared among all sources from all flows.
|
51
|
+
source_max_inflight_rows: int | None = None
|
52
|
+
source_max_inflight_bytes: int | None = None
|
53
53
|
|
54
54
|
|
55
55
|
def _load_field(
|
@@ -81,7 +81,7 @@ class Settings:
|
|
81
81
|
|
82
82
|
database: DatabaseConnectionSpec | None = None
|
83
83
|
app_namespace: str = ""
|
84
|
-
|
84
|
+
global_execution_options: GlobalExecutionOptions | None = None
|
85
85
|
|
86
86
|
@classmethod
|
87
87
|
def from_env(cls) -> Self:
|
@@ -110,14 +110,14 @@ class Settings:
|
|
110
110
|
"COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES",
|
111
111
|
parse=int,
|
112
112
|
)
|
113
|
-
|
113
|
+
global_execution_options = GlobalExecutionOptions(**exec_kwargs)
|
114
114
|
|
115
115
|
app_namespace = os.getenv("COCOINDEX_APP_NAMESPACE", "")
|
116
116
|
|
117
117
|
return cls(
|
118
118
|
database=database,
|
119
119
|
app_namespace=app_namespace,
|
120
|
-
|
120
|
+
global_execution_options=global_execution_options,
|
121
121
|
)
|
122
122
|
|
123
123
|
|
cocoindex/sources.py
CHANGED
@@ -43,3 +43,18 @@ class AmazonS3(op.SourceSpec):
|
|
43
43
|
included_patterns: list[str] | None = None
|
44
44
|
excluded_patterns: list[str] | None = None
|
45
45
|
sqs_queue_url: str | None = None
|
46
|
+
|
47
|
+
|
48
|
+
class AzureBlob(op.SourceSpec):
|
49
|
+
"""
|
50
|
+
Import data from an Azure Blob Storage container. Supports optional prefix and file filtering by glob patterns.
|
51
|
+
"""
|
52
|
+
|
53
|
+
_op_category = op.OpCategory.SOURCE
|
54
|
+
|
55
|
+
account_name: str
|
56
|
+
container_name: str
|
57
|
+
prefix: str | None = None
|
58
|
+
binary: bool = False
|
59
|
+
included_patterns: list[str] | None = None
|
60
|
+
excluded_patterns: list[str] | None = None
|
cocoindex/tests/test_convert.py
CHANGED
@@ -1229,3 +1229,115 @@ def test_full_roundtrip_scalar_with_python_types() -> None:
|
|
1229
1229
|
annotated_float=2.0,
|
1230
1230
|
)
|
1231
1231
|
validate_full_roundtrip(instance, MixedStruct)
|
1232
|
+
|
1233
|
+
|
1234
|
+
def test_roundtrip_struct_to_dict_binding() -> None:
|
1235
|
+
"""Test struct -> dict binding with Any annotation."""
|
1236
|
+
|
1237
|
+
@dataclass
|
1238
|
+
class SimpleStruct:
|
1239
|
+
name: str
|
1240
|
+
value: int
|
1241
|
+
price: float
|
1242
|
+
|
1243
|
+
instance = SimpleStruct("test", 42, 3.14)
|
1244
|
+
expected_dict = {"name": "test", "value": 42, "price": 3.14}
|
1245
|
+
|
1246
|
+
# Test Any annotation
|
1247
|
+
validate_full_roundtrip(instance, SimpleStruct, (expected_dict, Any))
|
1248
|
+
|
1249
|
+
|
1250
|
+
def test_roundtrip_struct_to_dict_explicit() -> None:
|
1251
|
+
"""Test struct -> dict binding with explicit dict annotations."""
|
1252
|
+
|
1253
|
+
@dataclass
|
1254
|
+
class Product:
|
1255
|
+
id: str
|
1256
|
+
name: str
|
1257
|
+
price: float
|
1258
|
+
active: bool
|
1259
|
+
|
1260
|
+
instance = Product("P1", "Widget", 29.99, True)
|
1261
|
+
expected_dict = {"id": "P1", "name": "Widget", "price": 29.99, "active": True}
|
1262
|
+
|
1263
|
+
# Test explicit dict annotations
|
1264
|
+
validate_full_roundtrip(
|
1265
|
+
instance, Product, (expected_dict, dict), (expected_dict, dict[str, Any])
|
1266
|
+
)
|
1267
|
+
|
1268
|
+
|
1269
|
+
def test_roundtrip_struct_to_dict_with_none_annotation() -> None:
|
1270
|
+
"""Test struct -> dict binding with None annotation."""
|
1271
|
+
|
1272
|
+
@dataclass
|
1273
|
+
class Config:
|
1274
|
+
host: str
|
1275
|
+
port: int
|
1276
|
+
debug: bool
|
1277
|
+
|
1278
|
+
instance = Config("localhost", 8080, True)
|
1279
|
+
expected_dict = {"host": "localhost", "port": 8080, "debug": True}
|
1280
|
+
|
1281
|
+
# Test None annotation (should be treated as Any)
|
1282
|
+
validate_full_roundtrip(instance, Config, (expected_dict, None))
|
1283
|
+
|
1284
|
+
|
1285
|
+
def test_roundtrip_struct_to_dict_nested() -> None:
|
1286
|
+
"""Test struct -> dict binding with nested structs."""
|
1287
|
+
|
1288
|
+
@dataclass
|
1289
|
+
class Address:
|
1290
|
+
street: str
|
1291
|
+
city: str
|
1292
|
+
|
1293
|
+
@dataclass
|
1294
|
+
class Person:
|
1295
|
+
name: str
|
1296
|
+
age: int
|
1297
|
+
address: Address
|
1298
|
+
|
1299
|
+
address = Address("123 Main St", "Anytown")
|
1300
|
+
person = Person("John", 30, address)
|
1301
|
+
expected_dict = {
|
1302
|
+
"name": "John",
|
1303
|
+
"age": 30,
|
1304
|
+
"address": {"street": "123 Main St", "city": "Anytown"},
|
1305
|
+
}
|
1306
|
+
|
1307
|
+
# Test nested struct conversion
|
1308
|
+
validate_full_roundtrip(person, Person, (expected_dict, dict[str, Any]))
|
1309
|
+
|
1310
|
+
|
1311
|
+
def test_roundtrip_struct_to_dict_with_list() -> None:
|
1312
|
+
"""Test struct -> dict binding with list fields."""
|
1313
|
+
|
1314
|
+
@dataclass
|
1315
|
+
class Team:
|
1316
|
+
name: str
|
1317
|
+
members: list[str]
|
1318
|
+
active: bool
|
1319
|
+
|
1320
|
+
instance = Team("Dev Team", ["Alice", "Bob", "Charlie"], True)
|
1321
|
+
expected_dict = {
|
1322
|
+
"name": "Dev Team",
|
1323
|
+
"members": ["Alice", "Bob", "Charlie"],
|
1324
|
+
"active": True,
|
1325
|
+
}
|
1326
|
+
|
1327
|
+
validate_full_roundtrip(instance, Team, (expected_dict, dict))
|
1328
|
+
|
1329
|
+
|
1330
|
+
def test_roundtrip_namedtuple_to_dict_binding() -> None:
|
1331
|
+
"""Test NamedTuple -> dict binding."""
|
1332
|
+
|
1333
|
+
class Point(NamedTuple):
|
1334
|
+
x: float
|
1335
|
+
y: float
|
1336
|
+
z: float
|
1337
|
+
|
1338
|
+
instance = Point(1.0, 2.0, 3.0)
|
1339
|
+
expected_dict = {"x": 1.0, "y": 2.0, "z": 3.0}
|
1340
|
+
|
1341
|
+
validate_full_roundtrip(
|
1342
|
+
instance, Point, (expected_dict, dict), (expected_dict, Any)
|
1343
|
+
)
|
cocoindex/tests/test_typing.py
CHANGED
@@ -539,5 +539,8 @@ def test_invalid_list_kind() -> None:
|
|
539
539
|
|
540
540
|
def test_unsupported_type() -> None:
|
541
541
|
typ = set
|
542
|
-
with pytest.raises(
|
542
|
+
with pytest.raises(
|
543
|
+
ValueError,
|
544
|
+
match="Unsupported as a specific type annotation for CocoIndex data type.*: <class 'set'>",
|
545
|
+
):
|
543
546
|
analyze_type_info(typ)
|
cocoindex/typing.py
CHANGED
@@ -168,7 +168,8 @@ class AnalyzedTypeInfo:
|
|
168
168
|
|
169
169
|
def analyze_type_info(t: Any) -> AnalyzedTypeInfo:
|
170
170
|
"""
|
171
|
-
Analyze a Python type and
|
171
|
+
Analyze a Python type annotation and extract CocoIndex-specific type information.
|
172
|
+
Type annotations for specific CocoIndex types are expected. Raises ValueError for Any, empty, or untyped dict types.
|
172
173
|
"""
|
173
174
|
if isinstance(t, tuple) and len(t) == 2:
|
174
175
|
kt, vt = t
|
@@ -239,9 +240,15 @@ def analyze_type_info(t: Any) -> AnalyzedTypeInfo:
|
|
239
240
|
_ = DtypeRegistry.validate_dtype_and_get_kind(elem_type)
|
240
241
|
vector_info = VectorInfo(dim=None) if vector_info is None else vector_info
|
241
242
|
|
242
|
-
elif base_type is collections.abc.Mapping or base_type is dict:
|
243
|
+
elif base_type is collections.abc.Mapping or base_type is dict or t is dict:
|
243
244
|
args = typing.get_args(t)
|
244
|
-
|
245
|
+
if len(args) == 0: # Handle untyped dict
|
246
|
+
raise ValueError(
|
247
|
+
"Untyped dict is not accepted as a specific type annotation; please provide a concrete type, "
|
248
|
+
"e.g. a dataclass or namedtuple for Struct types, a dict[str, T] for KTable types."
|
249
|
+
)
|
250
|
+
else:
|
251
|
+
elem_type = (args[0], args[1])
|
245
252
|
kind = "KTable"
|
246
253
|
elif base_type in (types.UnionType, typing.Union):
|
247
254
|
possible_types = typing.get_args(t)
|
@@ -283,7 +290,9 @@ def analyze_type_info(t: Any) -> AnalyzedTypeInfo:
|
|
283
290
|
elif t is datetime.timedelta:
|
284
291
|
kind = "TimeDelta"
|
285
292
|
else:
|
286
|
-
raise ValueError(
|
293
|
+
raise ValueError(
|
294
|
+
f"Unsupported as a specific type annotation for CocoIndex data type (https://cocoindex.io/docs/core/data_types): {t}"
|
295
|
+
)
|
287
296
|
|
288
297
|
return AnalyzedTypeInfo(
|
289
298
|
kind=kind,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cocoindex
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.66
|
4
4
|
Requires-Dist: click>=8.1.8
|
5
5
|
Requires-Dist: rich>=14.0.0
|
6
6
|
Requires-Dist: python-dotenv>=1.1.0
|
@@ -25,7 +25,7 @@ Project-URL: Homepage, https://cocoindex.io/
|
|
25
25
|
<img src="https://cocoindex.io/images/github.svg" alt="CocoIndex">
|
26
26
|
</p>
|
27
27
|
|
28
|
-
<
|
28
|
+
<h1 align="center">Data transformation for AI</h1>
|
29
29
|
|
30
30
|
<div align="center">
|
31
31
|
|
@@ -40,18 +40,32 @@ Project-URL: Homepage, https://cocoindex.io/
|
|
40
40
|
[](https://discord.com/invite/zpA9S2DR7s)
|
41
41
|
</div>
|
42
42
|
|
43
|
-
|
43
|
+
Ultra-performant data transformation framework for AI, with its core engine written in Rust. Supports incremental processing and data lineage out of the box. Exceptional developer velocity. Production-ready from day 0.
|
44
|
+
|
45
|
+
⭐ Drop a star to help us grow!
|
46
|
+
|
47
|
+
<br>
|
48
|
+
|
49
|
+
<p align="center">
|
50
|
+
<img src="https://cocoindex.io/images/transformation.svg" alt="CocoIndex Transformation">
|
51
|
+
</p>
|
52
|
+
|
53
|
+
<br>
|
54
|
+
|
55
|
+
CocoIndex makes it super easy to transform data with AI workloads, and to keep source data and targets in sync effortlessly.
|
56
|
+
|
57
|
+
<br>
|
44
58
|
|
45
59
|
<p align="center">
|
46
|
-
<img src="https://cocoindex.io/images/
|
60
|
+
<img src="https://cocoindex.io/images/venn-features.png" alt="CocoIndex Features" width='480'>
|
47
61
|
</p>
|
48
62
|
|
49
|
-
|
63
|
+
<br>
|
50
64
|
|
51
|
-
|
52
|
-
Unlike a workflow orchestration framework where data is usually opaque, in CocoIndex, data and data operations are first class citizens. CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_programming) programming model. Each transformation creates a new field solely based on input fields, without hidden states and value mutation. All data before/after each transformation is observable, with lineage out of the box.
|
65
|
+
Whether you are creating embeddings, building knowledge graphs, or performing any other data transformation, CocoIndex goes beyond traditional SQL.
|
53
66
|
|
54
|
-
|
67
|
+
## Exceptional velocity
|
68
|
+
Just declare the transformations in a dataflow with ~100 lines of Python.
|
55
69
|
|
56
70
|
```python
|
57
71
|
# import
|
@@ -69,19 +83,27 @@ collector.collect(...)
|
|
69
83
|
collector.export(...)
|
70
84
|
```
|
71
85
|
|
72
|
-
|
73
|
-
|
86
|
+
CocoIndex follows the idea of [Dataflow](https://en.wikipedia.org/wiki/Dataflow_programming) programming model. Each transformation creates a new field solely based on input fields, without hidden states and value mutation. All data before/after each transformation is observable, with lineage out of the box.
|
87
|
+
|
88
|
+
**In particular**, developers don't explicitly mutate data by creating, updating, and deleting. They just need to define the transformation/formula for a set of source data.
|
89
|
+
|
90
|
+
## Build like LEGO
|
91
|
+
Native built-ins for different sources, targets, and transformations. A standardized interface makes switching between components a one-line code change.
|
74
92
|
|
75
93
|
<p align="center">
|
76
|
-
<img src="https://
|
94
|
+
<img src="https://cocoindex.io/images/components.svg" alt="CocoIndex Features">
|
77
95
|
</p>
|
78
96
|
|
79
|
-
|
80
|
-
|
81
|
-
- Figure out what exactly needs to be updated, and only updating that without having to recompute everything.
|
97
|
+
## Data Freshness
|
98
|
+
CocoIndex keeps source data and targets in sync effortlessly.
|
82
99
|
|
83
|
-
|
100
|
+
<p align="center">
|
101
|
+
<img src="https://github.com/user-attachments/assets/f4eb29b3-84ee-4fa0-a1e2-80eedeeabde6" alt="Incremental Processing" width="700">
|
102
|
+
</p>
|
84
103
|
|
104
|
+
It has out-of-the-box support for incremental indexing:
|
105
|
+
- minimal recomputation on source or logic change.
|
106
|
+
- (re-)processing necessary portions; reuse cache when possible
|
85
107
|
|
86
108
|
## Quick Start:
|
87
109
|
If you're new to CocoIndex, we recommend checking out
|
@@ -100,7 +122,7 @@ pip install -U cocoindex
|
|
100
122
|
2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. CocoIndex uses it for incremental processing.
|
101
123
|
|
102
124
|
|
103
|
-
|
125
|
+
## Define data flow
|
104
126
|
|
105
127
|
Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow. An example flow looks like:
|
106
128
|
|
@@ -144,8 +166,9 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
|
|
144
166
|
|
145
167
|
It defines an index flow like this:
|
146
168
|
|
147
|
-
<
|
148
|
-
|
169
|
+
<p align="center">
|
170
|
+
<img width="400" alt="Data Flow" src="https://github.com/user-attachments/assets/2ea7be6d-3d94-42b1-b2bd-22515577e463" />
|
171
|
+
</p>
|
149
172
|
|
150
173
|
## 🚀 Examples and demo
|
151
174
|
|
@@ -156,6 +179,7 @@ It defines an index flow like this:
|
|
156
179
|
| [PDF Embedding](examples/pdf_embedding) | Parse PDF and index text embeddings for semantic search |
|
157
180
|
| [Manuals LLM Extraction](examples/manuals_llm_extraction) | Extract structured information from a manual using LLM |
|
158
181
|
| [Amazon S3 Embedding](examples/amazon_s3_embedding) | Index text documents from Amazon S3 |
|
182
|
+
| [Azure Blob Storage Embedding](examples/azure_blob_embedding) | Index text documents from Azure Blob Storage |
|
159
183
|
| [Google Drive Text Embedding](examples/gdrive_text_embedding) | Index text documents from Google Drive |
|
160
184
|
| [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph |
|
161
185
|
| [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search |
|
@@ -0,0 +1,28 @@
|
|
1
|
+
cocoindex-0.1.66.dist-info/METADATA,sha256=ZwdUYvaVBt5qmZ7pQqJOB6FoszGiwbEerJyeI52aSBE,10098
|
2
|
+
cocoindex-0.1.66.dist-info/WHEEL,sha256=7zxhuuBJUuil6R_MfVJvTZXJmNSPMntESIcLB40BBxA,116
|
3
|
+
cocoindex-0.1.66.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
|
4
|
+
cocoindex-0.1.66.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
5
|
+
cocoindex/__init__.py,sha256=nr-W-LX0e9EX6ZVMbf0MRF0EhtntqADOdwc9F2nwfWw,2004
|
6
|
+
cocoindex/_engine.pypy311-pp73-aarch64-linux-gnu.so,sha256=B6HiMN0h56MtAcVy_f6s3KOXjvVf1jURJcwSoEMxL6o,64916680
|
7
|
+
cocoindex/auth_registry.py,sha256=1XqO7ibjmBBd8i11XSJTvTgdz8p1ptW-ZpuSgo_5zzk,716
|
8
|
+
cocoindex/cli.py,sha256=-gp639JSyQN6YjnhGqCakIzYoSSqXxQMbxbkcYGP0QY,22359
|
9
|
+
cocoindex/convert.py,sha256=RYfRUungabr-dHakG4k2kDvYambxHFljAmTuPQeQths,13117
|
10
|
+
cocoindex/flow.py,sha256=MFPtfJBVTjQ56d7vUn2LvtY30Vg4q2rY6nqvjjJL1kQ,35085
|
11
|
+
cocoindex/functions.py,sha256=EvRnd2Y3TIeEueYjOmproBG5RinlbqK6ym1Hnrmnm-0,3251
|
12
|
+
cocoindex/index.py,sha256=j93B9jEvvLXHtpzKWL88SY6wCGEoPgpsQhEGHlyYGFg,540
|
13
|
+
cocoindex/lib.py,sha256=f--9dAYd84CZosbDZqNW0oGbBLsY3dXiUTR1VrfQ_QY,817
|
14
|
+
cocoindex/llm.py,sha256=WxmWUbNcf9HOCM5xkbDeFs9lF67M3mr810B7deDDc-8,673
|
15
|
+
cocoindex/op.py,sha256=r_Usx7Jqh49Cck3tsYLx2vLRNUZArkQP_g7bIID6LPU,11809
|
16
|
+
cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
cocoindex/runtime.py,sha256=bAdHYaXFWiiUWyAgzmKTeaAaRR0D_AmaqVCIdPO-v00,1056
|
18
|
+
cocoindex/setting.py,sha256=ADuv7RaWd9k-m3V0Cfy2jmaCt6DupJCviWdOm0CTiVw,4734
|
19
|
+
cocoindex/setup.py,sha256=7uIHKN4FOCuoidPXcKyGTrkqpkl9luL49-6UcnMxYzw,3068
|
20
|
+
cocoindex/sources.py,sha256=8MR_oyr7t0m-gUFq7FO6HHM-tDLmQSBAjheFXJzRd8g,1733
|
21
|
+
cocoindex/targets.py,sha256=Nfh_tpFd1goTnS_cxBjIs4j9zl3Z4Z1JomAQ1dl3Sic,2796
|
22
|
+
cocoindex/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
+
cocoindex/tests/test_convert.py,sha256=48-fnWKv02gmFETV3b-8IC89SKMUZfJLEH-ucRtXGuI,45450
|
24
|
+
cocoindex/tests/test_optional_database.py,sha256=snAmkNa6wtOSaxoZE1HgjvL5v_ylitt3Jt_9df4Cgdc,8506
|
25
|
+
cocoindex/tests/test_typing.py,sha256=NB4nUzoumOF_wGFa4D2Xf6d0bUVtOiSXyb78M1pYSG4,14827
|
26
|
+
cocoindex/typing.py,sha256=MO9HkrNpargvMPvpkd7jgSu2R-21KE_NaB9-WI4YOZA,13241
|
27
|
+
cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
|
28
|
+
cocoindex-0.1.66.dist-info/RECORD,,
|
@@ -1,28 +0,0 @@
|
|
1
|
-
cocoindex-0.1.64.dist-info/METADATA,sha256=ByEFBlXrGkbESv0twsq2eLycjhTdkQKnE_EzQVIpWgQ,10136
|
2
|
-
cocoindex-0.1.64.dist-info/WHEEL,sha256=7zxhuuBJUuil6R_MfVJvTZXJmNSPMntESIcLB40BBxA,116
|
3
|
-
cocoindex-0.1.64.dist-info/entry_points.txt,sha256=_NretjYVzBdNTn7dK-zgwr7YfG2afz1u1uSE-5bZXF8,46
|
4
|
-
cocoindex-0.1.64.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
5
|
-
cocoindex/__init__.py,sha256=hDjehCjxRabFCW0RTt00JxnSAJIn9HeVoK4OjFbETsk,1910
|
6
|
-
cocoindex/_engine.pypy311-pp73-aarch64-linux-gnu.so,sha256=YJMPrvVKi9es-DB_Dekgx6xizs_rlSkjA51TRfV58AY,61286424
|
7
|
-
cocoindex/auth_registry.py,sha256=1XqO7ibjmBBd8i11XSJTvTgdz8p1ptW-ZpuSgo_5zzk,716
|
8
|
-
cocoindex/cli.py,sha256=-gp639JSyQN6YjnhGqCakIzYoSSqXxQMbxbkcYGP0QY,22359
|
9
|
-
cocoindex/convert.py,sha256=qE1Ut_tAwX4wA4WqaWxpyj80-1t6WZ8Oi5_L9Mw5g4k,11393
|
10
|
-
cocoindex/flow.py,sha256=Rb3ImrFa-TKYZXZnfcTj4ePUZXqKgJCzZR4OYHP6rlk,34207
|
11
|
-
cocoindex/functions.py,sha256=IBwvdPpGR-S5mk53HvHpT2GVs15MI9wQznxgOdxA0ac,3202
|
12
|
-
cocoindex/index.py,sha256=j93B9jEvvLXHtpzKWL88SY6wCGEoPgpsQhEGHlyYGFg,540
|
13
|
-
cocoindex/lib.py,sha256=f--9dAYd84CZosbDZqNW0oGbBLsY3dXiUTR1VrfQ_QY,817
|
14
|
-
cocoindex/llm.py,sha256=0ri8ZRg9_Zf2gyC5xuQ1Kq6kdZUO8r-A5WLnxit5S_4,448
|
15
|
-
cocoindex/op.py,sha256=r_Usx7Jqh49Cck3tsYLx2vLRNUZArkQP_g7bIID6LPU,11809
|
16
|
-
cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
cocoindex/runtime.py,sha256=bAdHYaXFWiiUWyAgzmKTeaAaRR0D_AmaqVCIdPO-v00,1056
|
18
|
-
cocoindex/setting.py,sha256=FMNjer3YVmVyxLuGt6_DJ6vA1QH1mIo7oH0R51OLnk4,4714
|
19
|
-
cocoindex/setup.py,sha256=7uIHKN4FOCuoidPXcKyGTrkqpkl9luL49-6UcnMxYzw,3068
|
20
|
-
cocoindex/sources.py,sha256=JCnOhv1w4o28e03i7yvo4ESicWYAhckkBg5bQlxNH4U,1330
|
21
|
-
cocoindex/targets.py,sha256=Nfh_tpFd1goTnS_cxBjIs4j9zl3Z4Z1JomAQ1dl3Sic,2796
|
22
|
-
cocoindex/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
cocoindex/tests/test_convert.py,sha256=efwF-43SFJiu85sQ78Z9k9OaJphTz_es_1cm5BoPO2Y,42565
|
24
|
-
cocoindex/tests/test_optional_database.py,sha256=snAmkNa6wtOSaxoZE1HgjvL5v_ylitt3Jt_9df4Cgdc,8506
|
25
|
-
cocoindex/tests/test_typing.py,sha256=t6UCYShcfonTfjBlGRWPiFGMZ8DGFfABXo6idekPoJE,14757
|
26
|
-
cocoindex/typing.py,sha256=qQ-nSdkHzu8pSxfuR5sGGfoE8nCKqCDb0D9jbmxVt4M,12635
|
27
|
-
cocoindex/utils.py,sha256=hUhX-XV6XGCtJSEIpBOuDv6VvqImwPlgBxztBTw7u0U,598
|
28
|
-
cocoindex-0.1.64.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|