cocoindex 0.1.33__cp311-cp311-macosx_11_0_arm64.whl → 0.1.35__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +1 -1
- cocoindex/_engine.cpython-311-darwin.so +0 -0
- cocoindex/cli.py +22 -24
- cocoindex/flow.py +94 -87
- cocoindex/setup.py +4 -4
- {cocoindex-0.1.33.dist-info → cocoindex-0.1.35.dist-info}/METADATA +58 -25
- {cocoindex-0.1.33.dist-info → cocoindex-0.1.35.dist-info}/RECORD +18 -18
- {cocoindex-0.1.33.dist-info → cocoindex-0.1.35.dist-info}/WHEEL +1 -1
- {cocoindex-0.1.33.dist-info → cocoindex-0.1.35.dist-info}/licenses/LICENSE +0 -0
cocoindex/__init__.py
CHANGED
@@ -4,7 +4,7 @@ Cocoindex is a framework for building and running indexing pipelines.
|
|
4
4
|
from . import functions, query, sources, storages, cli
|
5
5
|
from .flow import FlowBuilder, DataScope, DataSlice, Flow, flow_def
|
6
6
|
from .flow import EvaluateAndDumpOptions, GeneratedField
|
7
|
-
from .flow import
|
7
|
+
from .flow import update_all_flows_async, FlowLiveUpdater, FlowLiveUpdaterOptions
|
8
8
|
from .llm import LlmSpec, LlmApiType
|
9
9
|
from .index import VectorSimilarityMetric, VectorIndexDef, IndexOptions
|
10
10
|
from .auth_registry import AuthEntryReference, add_auth_entry, ref_auth_entry
|
Binary file
|
cocoindex/cli.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
import asyncio
|
2
1
|
import click
|
3
2
|
import datetime
|
4
3
|
|
@@ -7,7 +6,6 @@ from rich.table import Table
|
|
7
6
|
|
8
7
|
from . import flow, lib, setting
|
9
8
|
from .setup import sync_setup, drop_setup, flow_names_with_setup, apply_setup_changes
|
10
|
-
from .runtime import execution_context
|
11
9
|
|
12
10
|
@click.group()
|
13
11
|
def cli():
|
@@ -55,16 +53,17 @@ def ls(show_all: bool):
|
|
55
53
|
|
56
54
|
@cli.command()
|
57
55
|
@click.argument("flow_name", type=str, required=False)
|
58
|
-
@click.option("--color/--no-color", default=True)
|
59
|
-
|
56
|
+
@click.option("--color/--no-color", default=True, help="Enable or disable colored output.")
|
57
|
+
@click.option("--verbose", is_flag=True, help="Show verbose output with full details.")
|
58
|
+
def show(flow_name: str | None, color: bool, verbose: bool):
|
60
59
|
"""
|
61
|
-
Show the flow spec in a readable format with colored output
|
62
|
-
including the schema.
|
60
|
+
Show the flow spec and schema in a readable format with colored output.
|
63
61
|
"""
|
64
62
|
flow = _flow_by_name(flow_name)
|
65
63
|
console = Console(no_color=not color)
|
66
|
-
console.print(flow.
|
64
|
+
console.print(flow._render_spec(verbose=verbose))
|
67
65
|
|
66
|
+
console.print()
|
68
67
|
table = Table(
|
69
68
|
title=f"Schema for Flow: {flow.name}",
|
70
69
|
show_header=True,
|
@@ -74,7 +73,7 @@ def show(flow_name: str | None, color: bool):
|
|
74
73
|
table.add_column("Type", style="green")
|
75
74
|
table.add_column("Attributes", style="yellow")
|
76
75
|
|
77
|
-
for field_name, field_type, attr_str in flow.
|
76
|
+
for field_name, field_type, attr_str in flow._get_schema():
|
78
77
|
table.add_row(field_name, field_type, attr_str)
|
79
78
|
|
80
79
|
console.print(table)
|
@@ -85,15 +84,15 @@ def setup():
|
|
85
84
|
Check and apply backend setup changes for flows, including the internal and target storage
|
86
85
|
(to export).
|
87
86
|
"""
|
88
|
-
|
89
|
-
click.echo(
|
90
|
-
if
|
87
|
+
setup_status = sync_setup()
|
88
|
+
click.echo(setup_status)
|
89
|
+
if setup_status.is_up_to_date():
|
91
90
|
click.echo("No changes need to be pushed.")
|
92
91
|
return
|
93
92
|
if not click.confirm(
|
94
93
|
"Changes need to be pushed. Continue? [yes/N]", default=False, show_default=False):
|
95
94
|
return
|
96
|
-
apply_setup_changes(
|
95
|
+
apply_setup_changes(setup_status)
|
97
96
|
|
98
97
|
@cli.command()
|
99
98
|
@click.argument("flow_name", type=str, nargs=-1)
|
@@ -112,15 +111,15 @@ def drop(flow_name: tuple[str, ...], drop_all: bool):
|
|
112
111
|
flow_names = [fl.name for fl in flow.flows()]
|
113
112
|
else:
|
114
113
|
flow_names = list(flow_name)
|
115
|
-
|
116
|
-
click.echo(
|
117
|
-
if
|
114
|
+
setup_status = drop_setup(flow_names)
|
115
|
+
click.echo(setup_status)
|
116
|
+
if setup_status.is_up_to_date():
|
118
117
|
click.echo("No flows need to be dropped.")
|
119
118
|
return
|
120
119
|
if not click.confirm(
|
121
120
|
"Changes need to be pushed. Continue? [yes/N]", default=False, show_default=False):
|
122
121
|
return
|
123
|
-
apply_setup_changes(
|
122
|
+
apply_setup_changes(setup_status)
|
124
123
|
|
125
124
|
@cli.command()
|
126
125
|
@click.argument("flow_name", type=str, required=False)
|
@@ -135,13 +134,12 @@ def update(flow_name: str | None, live: bool, quiet: bool):
|
|
135
134
|
Update the index to reflect the latest data from data sources.
|
136
135
|
"""
|
137
136
|
options = flow.FlowLiveUpdaterOptions(live_mode=live, print_stats=not quiet)
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
updater
|
143
|
-
|
144
|
-
execution_context.run(_update())
|
137
|
+
if flow_name is None:
|
138
|
+
return flow.update_all_flows(options)
|
139
|
+
else:
|
140
|
+
with flow.FlowLiveUpdater(_flow_by_name(flow_name), options) as updater:
|
141
|
+
updater.wait()
|
142
|
+
return updater.update_stats()
|
145
143
|
|
146
144
|
@cli.command()
|
147
145
|
@click.argument("flow_name", type=str, required=False)
|
@@ -216,7 +214,7 @@ def server(address: str | None, live_update: bool, quiet: bool, cors_origin: str
|
|
216
214
|
|
217
215
|
if live_update:
|
218
216
|
options = flow.FlowLiveUpdaterOptions(live_mode=True, print_stats=not quiet)
|
219
|
-
|
217
|
+
flow.update_all_flows(options)
|
220
218
|
if COCOINDEX_HOST in cors_origins:
|
221
219
|
click.echo(f"Open CocoInsight at: {COCOINDEX_HOST}/cocoinsight")
|
222
220
|
input("Press Enter to stop...")
|
cocoindex/flow.py
CHANGED
@@ -8,14 +8,13 @@ import asyncio
|
|
8
8
|
import re
|
9
9
|
import inspect
|
10
10
|
import datetime
|
11
|
-
import json
|
12
11
|
|
13
12
|
from typing import Any, Callable, Sequence, TypeVar
|
14
13
|
from threading import Lock
|
15
14
|
from enum import Enum
|
16
15
|
from dataclasses import dataclass
|
17
16
|
from rich.text import Text
|
18
|
-
from rich.
|
17
|
+
from rich.tree import Tree
|
19
18
|
|
20
19
|
from . import _engine
|
21
20
|
from . import index
|
@@ -161,6 +160,9 @@ class DataSlice:
|
|
161
160
|
"""
|
162
161
|
Apply a function to the data slice.
|
163
162
|
"""
|
163
|
+
if not isinstance(fn_spec, op.FunctionSpec):
|
164
|
+
raise ValueError("transform() can only be called on a CocoIndex function")
|
165
|
+
|
164
166
|
transform_args: list[tuple[Any, str | None]]
|
165
167
|
transform_args = [(self._state.engine_data_slice, None)]
|
166
168
|
transform_args += [(self._state.flow_builder_state.get_data_slice(v), None) for v in args]
|
@@ -280,6 +282,9 @@ class DataCollector:
|
|
280
282
|
|
281
283
|
`vector_index` is for backward compatibility only. Please use `vector_indexes` instead.
|
282
284
|
"""
|
285
|
+
if not isinstance(target_spec, op.StorageSpec):
|
286
|
+
raise ValueError("export() can only be called on a CocoIndex target storage")
|
287
|
+
|
283
288
|
# For backward compatibility only.
|
284
289
|
if len(vector_indexes) == 0 and len(vector_index) > 0:
|
285
290
|
vector_indexes = [index.VectorIndexDef(field_name=field_name, metric=metric)
|
@@ -343,8 +348,10 @@ class FlowBuilder:
|
|
343
348
|
refresh_interval: datetime.timedelta | None = None,
|
344
349
|
) -> DataSlice:
|
345
350
|
"""
|
346
|
-
|
351
|
+
Import a source to the flow.
|
347
352
|
"""
|
353
|
+
if not isinstance(spec, op.SourceSpec):
|
354
|
+
raise ValueError("add_source() can only be called on a CocoIndex source")
|
348
355
|
return _create_data_slice(
|
349
356
|
self._state,
|
350
357
|
lambda target_scope, name: self._state.engine_flow_builder.add_source(
|
@@ -376,56 +383,71 @@ class FlowLiveUpdater:
|
|
376
383
|
"""
|
377
384
|
A live updater for a flow.
|
378
385
|
"""
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
if isinstance(arg, _engine.FlowLiveUpdater):
|
383
|
-
self._engine_live_updater = arg
|
384
|
-
else:
|
385
|
-
self._engine_live_updater = execution_context.run(_engine.FlowLiveUpdater(
|
386
|
-
arg.internal_flow(), dump_engine_object(options or FlowLiveUpdaterOptions())))
|
386
|
+
_flow: Flow
|
387
|
+
_options: FlowLiveUpdaterOptions
|
388
|
+
_engine_live_updater: _engine.FlowLiveUpdater | None = None
|
387
389
|
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
Create a live updater for a flow.
|
392
|
-
"""
|
393
|
-
engine_live_updater = await _engine.FlowLiveUpdater.create(
|
394
|
-
await fl.ainternal_flow(),
|
395
|
-
dump_engine_object(options or FlowLiveUpdaterOptions()))
|
396
|
-
return FlowLiveUpdater(engine_live_updater)
|
390
|
+
def __init__(self, fl: Flow, options: FlowLiveUpdaterOptions | None = None):
|
391
|
+
self._flow = fl
|
392
|
+
self._options = options or FlowLiveUpdaterOptions()
|
397
393
|
|
398
394
|
def __enter__(self) -> FlowLiveUpdater:
|
395
|
+
self.start()
|
399
396
|
return self
|
400
397
|
|
401
398
|
def __exit__(self, exc_type, exc_value, traceback):
|
402
399
|
self.abort()
|
403
|
-
|
400
|
+
self.wait()
|
404
401
|
|
405
402
|
async def __aenter__(self) -> FlowLiveUpdater:
|
403
|
+
await self.start_async()
|
406
404
|
return self
|
407
405
|
|
408
406
|
async def __aexit__(self, exc_type, exc_value, traceback):
|
409
407
|
self.abort()
|
410
|
-
await self.
|
408
|
+
await self.wait_async()
|
411
409
|
|
412
|
-
|
410
|
+
def start(self) -> None:
|
411
|
+
"""
|
412
|
+
Start the live updater.
|
413
|
+
"""
|
414
|
+
execution_context.run(self.start_async())
|
415
|
+
|
416
|
+
async def start_async(self) -> None:
|
417
|
+
"""
|
418
|
+
Start the live updater.
|
419
|
+
"""
|
420
|
+
self._engine_live_updater = await _engine.FlowLiveUpdater.create(
|
421
|
+
await self._flow.internal_flow_async(), dump_engine_object(self._options))
|
422
|
+
|
423
|
+
def wait(self) -> None:
|
413
424
|
"""
|
414
425
|
Wait for the live updater to finish.
|
415
426
|
"""
|
416
|
-
|
427
|
+
execution_context.run(self.wait_async())
|
428
|
+
|
429
|
+
async def wait_async(self) -> None:
|
430
|
+
"""
|
431
|
+
Wait for the live updater to finish. Async version.
|
432
|
+
"""
|
433
|
+
await self._get_engine_live_updater().wait()
|
417
434
|
|
418
435
|
def abort(self) -> None:
|
419
436
|
"""
|
420
437
|
Abort the live updater.
|
421
438
|
"""
|
422
|
-
self.
|
439
|
+
self._get_engine_live_updater().abort()
|
423
440
|
|
424
441
|
def update_stats(self) -> _engine.IndexUpdateInfo:
|
425
442
|
"""
|
426
443
|
Get the index update info.
|
427
444
|
"""
|
428
|
-
return self.
|
445
|
+
return self._get_engine_live_updater().index_update_info()
|
446
|
+
|
447
|
+
def _get_engine_live_updater(self) -> _engine.FlowLiveUpdater:
|
448
|
+
if self._engine_live_updater is None:
|
449
|
+
raise RuntimeError("Live updater is not started")
|
450
|
+
return self._engine_live_updater
|
429
451
|
|
430
452
|
|
431
453
|
@dataclass
|
@@ -454,61 +476,33 @@ class Flow:
|
|
454
476
|
return engine_flow
|
455
477
|
self._lazy_engine_flow = _lazy_engine_flow
|
456
478
|
|
457
|
-
def
|
458
|
-
|
479
|
+
def _render_spec(self, verbose: bool = False) -> Tree:
|
480
|
+
"""
|
481
|
+
Render the flow spec as a styled rich Tree with hierarchical structure.
|
482
|
+
"""
|
483
|
+
spec = self._get_spec(verbose=verbose)
|
484
|
+
tree = Tree(f"Flow: {self.name}", style="cyan")
|
459
485
|
|
460
|
-
def
|
461
|
-
|
462
|
-
|
463
|
-
|
486
|
+
def build_tree(label: str, lines: list):
|
487
|
+
node = Tree(label, style="bold magenta" if lines else "cyan")
|
488
|
+
for line in lines:
|
489
|
+
child_node = node.add(Text(line.content, style="yellow"))
|
490
|
+
child_node.children = build_tree("", line.children).children
|
491
|
+
return node
|
464
492
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
def format_data(data, indent=0):
|
474
|
-
if isinstance(data, dict):
|
475
|
-
for key, value in data.items():
|
476
|
-
format_key_value(key, value, indent)
|
477
|
-
elif isinstance(data, list):
|
478
|
-
for i, item in enumerate(data):
|
479
|
-
format_key_value(f"[{i}]", item, indent)
|
480
|
-
else:
|
481
|
-
add_line(str(data), indent, style="yellow")
|
482
|
-
|
483
|
-
# Header
|
484
|
-
flow_name = flow_dict.get("name", "Unnamed")
|
485
|
-
add_line(f"Flow: {flow_name}", style="bold cyan")
|
486
|
-
|
487
|
-
# Section
|
488
|
-
for section_title, section_key in [
|
489
|
-
("Sources:", "import_ops"),
|
490
|
-
("Processing:", "reactive_ops"),
|
491
|
-
("Targets:", "export_ops"),
|
492
|
-
]:
|
493
|
-
add_line("")
|
494
|
-
add_line(section_title, style="bold cyan")
|
495
|
-
format_data(flow_dict.get(section_key, []), indent=0)
|
496
|
-
|
497
|
-
return output
|
498
|
-
|
499
|
-
def _render_text(self) -> Text:
|
500
|
-
flow_spec_str = str(self._lazy_engine_flow())
|
501
|
-
try:
|
502
|
-
flow_dict = json.loads(flow_spec_str)
|
503
|
-
return self._format_flow(flow_dict)
|
504
|
-
except json.JSONDecodeError:
|
505
|
-
return Text(flow_spec_str)
|
493
|
+
for section, lines in spec.sections:
|
494
|
+
section_node = build_tree(f"{section}:", lines)
|
495
|
+
tree.children.append(section_node)
|
496
|
+
return tree
|
497
|
+
|
498
|
+
def _get_spec(self, verbose: bool = False) -> list[tuple[str, str, int]]:
|
499
|
+
return self._lazy_engine_flow().get_spec(output_mode="verbose" if verbose else "concise")
|
506
500
|
|
507
|
-
def
|
501
|
+
def _get_schema(self) -> list[tuple[str, str, str]]:
|
508
502
|
return self._lazy_engine_flow().get_schema()
|
509
503
|
|
510
504
|
def __str__(self):
|
511
|
-
return str(self.
|
505
|
+
return str(self._get_spec())
|
512
506
|
|
513
507
|
def __repr__(self):
|
514
508
|
return repr(self._lazy_engine_flow())
|
@@ -520,13 +514,20 @@ class Flow:
|
|
520
514
|
"""
|
521
515
|
return self._lazy_engine_flow().name()
|
522
516
|
|
523
|
-
|
517
|
+
def update(self) -> _engine.IndexUpdateInfo:
|
524
518
|
"""
|
525
519
|
Update the index defined by the flow.
|
526
|
-
Once the function returns, the
|
520
|
+
Once the function returns, the index is fresh up to the moment when the function is called.
|
521
|
+
"""
|
522
|
+
return execution_context.run(self.update_async())
|
523
|
+
|
524
|
+
async def update_async(self) -> _engine.IndexUpdateInfo:
|
527
525
|
"""
|
528
|
-
|
529
|
-
|
526
|
+
Update the index defined by the flow.
|
527
|
+
Once the function returns, the index is fresh up to the moment when the function is called.
|
528
|
+
"""
|
529
|
+
updater = await FlowLiveUpdater.create_async(self, FlowLiveUpdaterOptions(live_mode=False))
|
530
|
+
await updater.wait_async()
|
530
531
|
return updater.update_stats()
|
531
532
|
|
532
533
|
def evaluate_and_dump(self, options: EvaluateAndDumpOptions):
|
@@ -541,7 +542,7 @@ class Flow:
|
|
541
542
|
"""
|
542
543
|
return self._lazy_engine_flow()
|
543
544
|
|
544
|
-
async def
|
545
|
+
async def internal_flow_async(self) -> _engine.Flow:
|
545
546
|
"""
|
546
547
|
Get the engine flow. The async version.
|
547
548
|
"""
|
@@ -607,22 +608,28 @@ def ensure_all_flows_built() -> None:
|
|
607
608
|
for fl in flows():
|
608
609
|
fl.internal_flow()
|
609
610
|
|
610
|
-
async def
|
611
|
+
async def ensure_all_flows_built_async() -> None:
|
611
612
|
"""
|
612
613
|
Ensure all flows are built.
|
613
614
|
"""
|
614
615
|
for fl in flows():
|
615
|
-
await fl.
|
616
|
+
await fl.internal_flow_async()
|
616
617
|
|
617
|
-
|
618
|
+
def update_all_flows(options: FlowLiveUpdaterOptions) -> dict[str, _engine.IndexUpdateInfo]:
|
618
619
|
"""
|
619
620
|
Update all flows.
|
620
621
|
"""
|
621
|
-
|
622
|
+
return execution_context.run(update_all_flows_async(options))
|
623
|
+
|
624
|
+
async def update_all_flows_async(options: FlowLiveUpdaterOptions) -> dict[str, _engine.IndexUpdateInfo]:
|
625
|
+
"""
|
626
|
+
Update all flows.
|
627
|
+
"""
|
628
|
+
await ensure_all_flows_built_async()
|
622
629
|
async def _update_flow(fl: Flow) -> _engine.IndexUpdateInfo:
|
623
|
-
|
624
|
-
|
625
|
-
|
630
|
+
async with FlowLiveUpdater(fl, options) as updater:
|
631
|
+
await updater.wait_async()
|
632
|
+
return updater.update_stats()
|
626
633
|
fls = flows()
|
627
634
|
all_stats = await asyncio.gather(*(_update_flow(fl) for fl in fls))
|
628
635
|
return {fl.name: stats for fl, stats in zip(fls, all_stats)}
|
cocoindex/setup.py
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
from . import flow
|
2
2
|
from . import _engine
|
3
3
|
|
4
|
-
def sync_setup() -> _engine.
|
4
|
+
def sync_setup() -> _engine.SetupStatus:
|
5
5
|
flow.ensure_all_flows_built()
|
6
6
|
return _engine.sync_setup()
|
7
7
|
|
8
|
-
def drop_setup(flow_names: list[str]) -> _engine.
|
8
|
+
def drop_setup(flow_names: list[str]) -> _engine.SetupStatus:
|
9
9
|
flow.ensure_all_flows_built()
|
10
10
|
return _engine.drop_setup(flow_names)
|
11
11
|
|
12
12
|
def flow_names_with_setup() -> list[str]:
|
13
13
|
return _engine.flow_names_with_setup()
|
14
14
|
|
15
|
-
def apply_setup_changes(
|
16
|
-
_engine.apply_setup_changes(
|
15
|
+
def apply_setup_changes(setup_status: _engine.SetupStatus):
|
16
|
+
_engine.apply_setup_changes(setup_status)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cocoindex
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.35
|
4
4
|
Requires-Dist: sentence-transformers>=3.3.1
|
5
5
|
Requires-Dist: click>=8.1.8
|
6
6
|
Requires-Dist: rich>=14.0.0
|
@@ -28,44 +28,74 @@ Project-URL: Homepage, https://cocoindex.io/
|
|
28
28
|
[](https://pypi.org/project/cocoindex/)
|
29
29
|
[](https://pypistats.org/packages/cocoindex)
|
30
30
|
|
31
|
-
<!-- [](https://www.python.org/) -->
|
32
31
|
[](https://github.com/cocoindex-io/cocoindex/actions/workflows/CI.yml)
|
33
32
|
[](https://github.com/cocoindex-io/cocoindex/actions/workflows/release.yml)
|
34
33
|
[](https://discord.com/invite/zpA9S2DR7s)
|
35
|
-
<!--[](https://www.linkedin.com/company/cocoindex) -->
|
36
|
-
<!--[](https://twitter.com/intent/follow?screen_name=cocoindex_io) -->
|
37
|
-
|
38
34
|
</div>
|
39
35
|
|
40
|
-
CocoIndex is
|
36
|
+
**CocoIndex** is an ultra-performant data transformation framework, with its core engine written in Rust. The problem it tries to solve is to make it easy to prepare fresh data for AI - whether creating embeddings, building knowledge graphs, or performing other data transformations - and to take real-time data pipelines beyond traditional SQL.
|
37
|
+
|
38
|
+
<p align="center">
|
39
|
+
<img src="https://cocoindex.io/images/cocoindex-features.png" alt="CocoIndex Features" width="500">
|
40
|
+
</p>
|
41
|
+
|
42
|
+
The philosophy is to have the framework handle source updates, and to have developers only worry about defining a series of data transformations, inspired by spreadsheets.
|
43
|
+
|
44
|
+
## Dataflow programming
|
45
|
+
Unlike a workflow orchestration framework where data is usually opaque, in CocoIndex, data and data operations are first-class citizens. CocoIndex follows the idea of the [Dataflow](https://en.wikipedia.org/wiki/Dataflow_programming) programming model. Each transformation creates a new field solely based on input fields, without hidden states and value mutation. All data before/after each transformation is observable, with lineage out of the box.
|
46
|
+
|
47
|
+
**Particularly**, users don't explicitly mutate data by creating, updating and deleting. Rather, they define something like - for a set of source data, this is the transformation or formula. The framework takes care of the data operations such as when to create, update, or delete.
|
48
|
+
|
49
|
+
```python
|
50
|
+
# import
|
51
|
+
data['content'] = flow_builder.add_source(...)
|
52
|
+
|
53
|
+
# transform
|
54
|
+
data['out'] = data['content']
|
55
|
+
.transform(...)
|
56
|
+
.transform(...)
|
57
|
+
|
58
|
+
# collect data
|
59
|
+
collector.collect(...)
|
60
|
+
|
61
|
+
# export to db, vector db, graph db ...
|
62
|
+
collector.export(...)
|
63
|
+
```
|
64
|
+
|
65
|
+
## Data Freshness
|
66
|
+
As a data framework, CocoIndex takes it to the next level on data freshness. **Incremental processing** is one of the core values provided by CocoIndex.
|
67
|
+
|
41
68
|
<p align="center">
|
42
|
-
<img src="https://
|
69
|
+
<img src="https://github.com/user-attachments/assets/f4eb29b3-84ee-4fa0-a1e2-80eedeeabde6" alt="Incremental Processing" width="700">
|
43
70
|
</p>
|
44
|
-
|
71
|
+
|
72
|
+
The framework takes care of:
|
73
|
+
- Change data capture.
|
74
|
+
- Figuring out exactly what needs to be updated, and updating only that without having to recompute everything.
|
75
|
+
|
76
|
+
This makes it fast to reflect any source updates to the target store. If you have concerns about surfacing stale data to AI agents and are spending a lot of effort on infrastructure work to optimize the latency, the framework actually handles it for you.
|
45
77
|
|
46
78
|
|
47
79
|
## Quick Start:
|
48
|
-
If you're new to CocoIndex
|
80
|
+
If you're new to CocoIndex, we recommend checking out
|
81
|
+
- 📖 [Documentation](https://cocoindex.io/docs)
|
82
|
+
- ⚡ [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart)
|
83
|
+
- 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
|
49
84
|
|
50
85
|
### Setup
|
86
|
+
|
51
87
|
1. Install CocoIndex Python library
|
52
88
|
|
53
89
|
```bash
|
54
90
|
pip install -U cocoindex
|
55
91
|
```
|
56
92
|
|
57
|
-
2.
|
93
|
+
2. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. CocoIndex uses it for incremental processing.
|
58
94
|
|
59
|
-
- Make sure Docker Compose is installed: [docs](https://docs.docker.com/compose/install/)
|
60
|
-
- Start a Postgres SQL database for cocoindex using our docker compose config:
|
61
95
|
|
62
|
-
|
63
|
-
docker compose -f <(curl -L https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/dev/postgres.yaml) up -d
|
64
|
-
```
|
96
|
+
### Define data flow
|
65
97
|
|
66
|
-
|
67
|
-
Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow.
|
68
|
-
A common indexing flow looks like:
|
98
|
+
Follow [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart) to define your first indexing flow. An example flow looks like:
|
69
99
|
|
70
100
|
```python
|
71
101
|
@cocoindex.flow_def(name="TextEmbedding")
|
@@ -106,10 +136,11 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
|
|
106
136
|
```
|
107
137
|
|
108
138
|
It defines an index flow like this:
|
109
|
-

|
110
139
|
|
111
|
-
|
112
|
-
|
140
|
+
<img width="363" alt="Data Flow" src="https://github.com/user-attachments/assets/2ea7be6d-3d94-42b1-b2bd-22515577e463" />
|
141
|
+
|
142
|
+
|
143
|
+
## 🚀 Examples and demo
|
113
144
|
|
114
145
|
| Example | Description |
|
115
146
|
|---------|-------------|
|
@@ -121,8 +152,10 @@ Go to the [examples directory](examples) to try out with any of the examples, fo
|
|
121
152
|
| [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph |
|
122
153
|
| [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search |
|
123
154
|
| [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup |
|
155
|
+
| [Product_Taxonomy_Knowledge_Graph](examples/product_taxonomy_knowledge_graph) | Build knowledge graph for product recommendations |
|
156
|
+
| [Image Search with Vision API](examples/image_search_example) | Generates detailed captions for images using a vision model, embeds them, and enables semantic search via FastAPI, served on a React frontend. |
|
124
157
|
|
125
|
-
More coming and stay tuned
|
158
|
+
More coming and stay tuned 👀!
|
126
159
|
|
127
160
|
## 📖 Documentation
|
128
161
|
For detailed documentation, visit [CocoIndex Documentation](https://cocoindex.io/docs), including a [Quickstart guide](https://cocoindex.io/docs/getting_started/quickstart).
|
@@ -136,13 +169,13 @@ Welcome with a huge coconut hug 🥥⋆。˚🤗. We are super excited for commu
|
|
136
169
|
Join our community here:
|
137
170
|
|
138
171
|
- 🌟 [Star us on GitHub](https://github.com/cocoindex-io/cocoindex)
|
139
|
-
- 💬 [Start a GitHub Discussion](https://github.com/cocoindex-io/cocoindex/discussions)
|
140
172
|
- 👋 [Join our Discord community](https://discord.com/invite/zpA9S2DR7s)
|
141
|
-
- 𝕏 [Follow us on X](https://x.com/cocoindex_io)
|
142
|
-
- 🐚 [Follow us on LinkedIn](https://www.linkedin.com/company/cocoindex/about/)
|
143
173
|
- ▶️ [Subscribe to our YouTube channel](https://www.youtube.com/@cocoindex-io)
|
144
174
|
- 📜 [Read our blog posts](https://cocoindex.io/blogs/)
|
145
175
|
|
176
|
+
## Support us:
|
177
|
+
We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ on our GitHub repo [](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
|
178
|
+
|
146
179
|
## License
|
147
180
|
CocoIndex is Apache 2.0 licensed.
|
148
181
|
|
@@ -1,25 +1,25 @@
|
|
1
|
-
cocoindex-0.1.
|
2
|
-
cocoindex-0.1.
|
3
|
-
cocoindex-0.1.
|
1
|
+
cocoindex-0.1.35.dist-info/METADATA,sha256=pcN86SngXFfIOvvQtGQREkVyTBemfX5JXDNc2PQf0fg,9686
|
2
|
+
cocoindex-0.1.35.dist-info/WHEEL,sha256=M0oGXcMDUVEBxvyDRZ1SJRlU2WxAfG7DBwXO4GUZt1Q,104
|
3
|
+
cocoindex-0.1.35.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
4
|
+
cocoindex/__init__.py,sha256=LpB0VjGvkD1beio8R9RCT6PI3eU0keV-3sBL45fHTQE,690
|
5
|
+
cocoindex/_engine.cpython-311-darwin.so,sha256=jkZjB2b_IwXug3tGtCJnCErcVoz44RCB41PFztm8sDc,49711104
|
6
|
+
cocoindex/auth_registry.py,sha256=NsALZ3SKsDG9cPdrlTlalIqUvgbgFOaFGAbWJNedtJE,692
|
7
|
+
cocoindex/cli.py,sha256=QdZjgnABuDQfy6JiAxeAJiQMI5FNT9FQGLiYAUtLMw8,8923
|
8
|
+
cocoindex/convert.py,sha256=mBUTa_Ag39_ut-yE_jc1wqS3zLjtOm6QKet-bqJ-RWc,5947
|
9
|
+
cocoindex/flow.py,sha256=MZZ0Uf0ObAzR1yIjUecRgA-U0t__95eoLBK_DxwwLnk,23375
|
4
10
|
cocoindex/functions.py,sha256=F79dNmGE127LaU67kF5Oqtf_tIzebFQH7MkyceMX4-s,1830
|
5
|
-
cocoindex/query.py,sha256=8_3Lb_EVjZtl2ZyJNZGX16LoKXEd-PL8OjY-zs9GQeA,3205
|
6
11
|
cocoindex/index.py,sha256=LssEOuZi6AqhwKtZM3QFeQpa9T-0ELi8G5DsrYKECvc,534
|
7
12
|
cocoindex/lib.py,sha256=812GB8Z-2PyjG73Odvw5jtNBLnoeU9aOh9s2ZnETKa8,2329
|
8
|
-
cocoindex/auth_registry.py,sha256=NsALZ3SKsDG9cPdrlTlalIqUvgbgFOaFGAbWJNedtJE,692
|
9
|
-
cocoindex/convert.py,sha256=mBUTa_Ag39_ut-yE_jc1wqS3zLjtOm6QKet-bqJ-RWc,5947
|
10
|
-
cocoindex/tests/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
11
|
-
cocoindex/tests/test_convert.py,sha256=WPRKp0jv_uSEM81RGWEAmsax-J-FtXt90mZ0yEnvGLs,11236
|
12
|
-
cocoindex/__init__.py,sha256=CMfiZ-CROvrcE6jjkmzEZBk4HjuN6s6nfRXtSd0c_z8,684
|
13
|
-
cocoindex/flow.py,sha256=KVbB_Ebm0IpJgZxV4BLg30fjIPmsGFhrtmQOBqCZaIk,23037
|
14
13
|
cocoindex/llm.py,sha256=_3rtahuKcqcEHPkFSwhXOSrekZyGxVApPoYtlU_chcA,348
|
15
|
-
cocoindex/setting.py,sha256=pms1blwlXIOqZIpye-rfiwzqYUCAC8oEL7mQM5A160g,2356
|
16
|
-
cocoindex/runtime.py,sha256=jqRnWkkIlAhE04gi4y0Y5bzuq9FX4j0aVNU-nengLJk,980
|
17
14
|
cocoindex/op.py,sha256=OGYRYl7gPa7X7iSU30iTrCzvqRBu7jQqfvN4vjG__dA,10730
|
18
|
-
cocoindex/sources.py,sha256=wZFU8lwSXjyofJR-syySH9fTyPnBlAPJ6-1hQNX8fGA,936
|
19
|
-
cocoindex/setup.py,sha256=W1HshwYk_K2aeLOVn_e62ZOXBO9yWsoUboRiH4SjF48,496
|
20
|
-
cocoindex/cli.py,sha256=Vh8bNZ41yLr1l_jJR1Z_b7mY-dOvN-EbiCRxDvtIsRk,8885
|
21
15
|
cocoindex/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
-
cocoindex/
|
16
|
+
cocoindex/query.py,sha256=8_3Lb_EVjZtl2ZyJNZGX16LoKXEd-PL8OjY-zs9GQeA,3205
|
17
|
+
cocoindex/runtime.py,sha256=jqRnWkkIlAhE04gi4y0Y5bzuq9FX4j0aVNU-nengLJk,980
|
18
|
+
cocoindex/setting.py,sha256=pms1blwlXIOqZIpye-rfiwzqYUCAC8oEL7mQM5A160g,2356
|
19
|
+
cocoindex/setup.py,sha256=AQLbtBLuJX066IANS7BGp20246mAGQ_4Z0W6MVJcQzY,481
|
20
|
+
cocoindex/sources.py,sha256=wZFU8lwSXjyofJR-syySH9fTyPnBlAPJ6-1hQNX8fGA,936
|
23
21
|
cocoindex/storages.py,sha256=MFMsfyOCYMggTWeWrOi82miqOXQmiUuqq828x5htBr0,2207
|
24
|
-
cocoindex/
|
25
|
-
cocoindex
|
22
|
+
cocoindex/tests/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
23
|
+
cocoindex/tests/test_convert.py,sha256=WPRKp0jv_uSEM81RGWEAmsax-J-FtXt90mZ0yEnvGLs,11236
|
24
|
+
cocoindex/typing.py,sha256=BI2vPw4Iu4S3aznNJQrfM2LZU_weGYASTXF1W3ZWh_Y,8568
|
25
|
+
cocoindex-0.1.35.dist-info/RECORD,,
|
File without changes
|