cocoindex 0.3.4__cp311-abi3-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cocoindex/__init__.py +114 -0
  2. cocoindex/_engine.abi3.so +0 -0
  3. cocoindex/auth_registry.py +44 -0
  4. cocoindex/cli.py +830 -0
  5. cocoindex/engine_object.py +214 -0
  6. cocoindex/engine_value.py +550 -0
  7. cocoindex/flow.py +1281 -0
  8. cocoindex/functions/__init__.py +40 -0
  9. cocoindex/functions/_engine_builtin_specs.py +66 -0
  10. cocoindex/functions/colpali.py +247 -0
  11. cocoindex/functions/sbert.py +77 -0
  12. cocoindex/index.py +50 -0
  13. cocoindex/lib.py +75 -0
  14. cocoindex/llm.py +47 -0
  15. cocoindex/op.py +1047 -0
  16. cocoindex/py.typed +0 -0
  17. cocoindex/query_handler.py +57 -0
  18. cocoindex/runtime.py +78 -0
  19. cocoindex/setting.py +171 -0
  20. cocoindex/setup.py +92 -0
  21. cocoindex/sources/__init__.py +5 -0
  22. cocoindex/sources/_engine_builtin_specs.py +120 -0
  23. cocoindex/subprocess_exec.py +277 -0
  24. cocoindex/targets/__init__.py +5 -0
  25. cocoindex/targets/_engine_builtin_specs.py +153 -0
  26. cocoindex/targets/lancedb.py +466 -0
  27. cocoindex/tests/__init__.py +0 -0
  28. cocoindex/tests/test_engine_object.py +331 -0
  29. cocoindex/tests/test_engine_value.py +1724 -0
  30. cocoindex/tests/test_optional_database.py +249 -0
  31. cocoindex/tests/test_transform_flow.py +300 -0
  32. cocoindex/tests/test_typing.py +553 -0
  33. cocoindex/tests/test_validation.py +134 -0
  34. cocoindex/typing.py +834 -0
  35. cocoindex/user_app_loader.py +53 -0
  36. cocoindex/utils.py +20 -0
  37. cocoindex/validation.py +104 -0
  38. cocoindex-0.3.4.dist-info/METADATA +288 -0
  39. cocoindex-0.3.4.dist-info/RECORD +42 -0
  40. cocoindex-0.3.4.dist-info/WHEEL +4 -0
  41. cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
  42. cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
cocoindex/py.typed ADDED
File without changes
@@ -0,0 +1,57 @@
1
+ import dataclasses
2
+ import numpy as np
3
+ from numpy import typing as npt
4
+ from typing import Generic, Any
5
+ from .index import VectorSimilarityMetric
6
+ import sys
7
+
8
+ if sys.version_info >= (3, 13):
9
+ from typing import TypeVar
10
+ else:
11
+ from typing_extensions import TypeVar # PEP 696 backport
12
+
13
+
14
+ @dataclasses.dataclass
15
+ class QueryHandlerResultFields:
16
+ """
17
+ Specify field names in query results returned by the query handler.
18
+ This provides metadata for tools like CocoInsight to recognize structure of the query results.
19
+ """
20
+
21
+ embedding: list[str] = dataclasses.field(default_factory=list)
22
+ score: str | None = None
23
+
24
+
25
@dataclasses.dataclass
class QueryHandlerInfo:
    """
    Configuration attached to a query handler.
    """

    # Optional description of the field names found in the handler's results.
    result_fields: QueryHandlerResultFields | None = None
32
+
33
+
34
@dataclasses.dataclass
class QueryInfo:
    """
    Information describing a query.
    """

    # Embedding associated with the query, as a list of floats or a float32 array.
    embedding: list[float] | npt.NDArray[np.float32] | None = None
    # Similarity metric associated with the query, if any.
    similarity_metric: VectorSimilarityMetric | None = None
42
+
43
+
44
+ R = TypeVar("R", default=Any)
45
+
46
+
47
@dataclasses.dataclass
class QueryOutput(Generic[R]):
    """
    What a query handler returns.

    results: the list of results; each one can be a dict or a dataclass.
    query_info: information about the query; defaults to an empty `QueryInfo`.
    """

    results: list[R]
    query_info: QueryInfo = dataclasses.field(default_factory=QueryInfo)
cocoindex/runtime.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ This module provides a standalone execution runtime for executing coroutines in a thread-safe
3
+ manner.
4
+ """
5
+
6
+ import threading
7
+ import asyncio
8
+ import inspect
9
+ import warnings
10
+
11
+ from typing import Any, Callable, Awaitable, TypeVar, Coroutine
12
+
13
+ T = TypeVar("T")
14
+
15
+
16
+ class _ExecutionContext:
17
+ _lock: threading.Lock
18
+ _event_loop: asyncio.AbstractEventLoop | None = None
19
+
20
+ def __init__(self) -> None:
21
+ self._lock = threading.Lock()
22
+
23
+ @property
24
+ def event_loop(self) -> asyncio.AbstractEventLoop:
25
+ """Get the event loop for the cocoindex library."""
26
+ with self._lock:
27
+ if self._event_loop is None:
28
+ loop = asyncio.new_event_loop()
29
+ self._event_loop = loop
30
+
31
+ def _runner(l: asyncio.AbstractEventLoop) -> None:
32
+ asyncio.set_event_loop(l)
33
+ l.run_forever()
34
+
35
+ threading.Thread(target=_runner, args=(loop,), daemon=True).start()
36
+ return self._event_loop
37
+
38
+ def run(self, coro: Coroutine[Any, Any, T]) -> T:
39
+ """Run a coroutine in the event loop, blocking until it finishes. Return its result."""
40
+ try:
41
+ running_loop = asyncio.get_running_loop()
42
+ except RuntimeError:
43
+ running_loop = None
44
+
45
+ loop = self.event_loop
46
+
47
+ if running_loop is not None:
48
+ if running_loop is loop:
49
+ raise RuntimeError(
50
+ "CocoIndex sync API was called from inside CocoIndex's async context. "
51
+ "Use the async variant of this method instead."
52
+ )
53
+ warnings.warn(
54
+ "CocoIndex sync API was called inside an existing event loop. "
55
+ "This may block other tasks. Prefer the async method.",
56
+ RuntimeWarning,
57
+ stacklevel=2,
58
+ )
59
+
60
+ fut = asyncio.run_coroutine_threadsafe(coro, loop)
61
+ try:
62
+ return fut.result()
63
+ except KeyboardInterrupt:
64
+ fut.cancel()
65
+ raise
66
+
67
+
68
+ execution_context = _ExecutionContext()
69
+
70
+
71
def to_async_call(call: Callable[..., Any]) -> Callable[..., Awaitable[Any]]:
    """
    Return an awaitable-producing version of `call`.

    A coroutine function (also when wrapped in `staticmethod`/`classmethod`)
    is returned unchanged. Anything else is wrapped so that each invocation
    runs `call` via `asyncio.to_thread`.
    """
    # Look through staticmethod/classmethod wrappers to inspect the real function.
    unwrapped = (
        call.__func__ if isinstance(call, (staticmethod, classmethod)) else call
    )
    if inspect.iscoroutinefunction(unwrapped):
        return call

    def _run_in_thread(*args: Any, **kwargs: Any) -> Awaitable[Any]:
        return asyncio.to_thread(call, *args, **kwargs)

    return _run_in_thread
cocoindex/setting.py ADDED
@@ -0,0 +1,171 @@
1
+ """
2
+ Data types for settings of the cocoindex library.
3
+ """
4
+
5
+ import os
6
+
7
+ from typing import Callable, Self, Any, overload
8
+ from dataclasses import dataclass
9
+ from . import _engine # type: ignore
10
+
11
+
12
def get_app_namespace(*, trailing_delimiter: str | None = None) -> str:
    """
    Return the application namespace reported by the engine.

    `trailing_delimiter` is appended only when it is given and the namespace
    is non-empty; otherwise the namespace is returned as is.
    """
    namespace: str = _engine.get_app_namespace()
    if namespace != "" and trailing_delimiter is not None:
        return f"{namespace}{trailing_delimiter}"
    return namespace
18
+
19
+
20
def split_app_namespace(full_name: str, delimiter: str) -> tuple[str, str]:
    """
    Split `full_name` at the first `delimiter` into (namespace, remainder).

    When the delimiter does not occur, the namespace is the empty string and
    the remainder is the whole name.
    """
    head, found, tail = full_name.partition(delimiter)
    if not found:
        return "", head
    return (head, tail)
26
+
27
+
28
+ @dataclass
29
+ class DatabaseConnectionSpec:
30
+ """
31
+ Connection spec for relational database.
32
+ Used by both internal and target storage.
33
+ """
34
+
35
+ url: str
36
+ user: str | None = None
37
+ password: str | None = None
38
+ max_connections: int = 25
39
+ min_connections: int = 5
40
+
41
+
42
+ @dataclass
43
+ class GlobalExecutionOptions:
44
+ """Global execution options."""
45
+
46
+ # The maximum number of concurrent inflight requests, shared among all sources from all flows.
47
+ source_max_inflight_rows: int | None = 1024
48
+ source_max_inflight_bytes: int | None = None
49
+
50
+
51
+ def _load_field(
52
+ target: dict[str, Any],
53
+ name: str,
54
+ env_name: str,
55
+ required: bool = False,
56
+ parse: Callable[[str], Any] | None = None,
57
+ ) -> None:
58
+ value = os.getenv(env_name)
59
+ if value is None:
60
+ if required:
61
+ raise ValueError(f"{env_name} is not set")
62
+ else:
63
+ if parse is None:
64
+ target[name] = value
65
+ else:
66
+ try:
67
+ target[name] = parse(value)
68
+ except Exception as e:
69
+ raise ValueError(
70
+ f"failed to parse environment variable {env_name}: {value}"
71
+ ) from e
72
+
73
+
74
@dataclass
class Settings:
    """Settings for the cocoindex library."""

    database: DatabaseConnectionSpec | None = None
    app_namespace: str = ""
    global_execution_options: GlobalExecutionOptions | None = None

    @classmethod
    def from_env(cls) -> Self:
        """
        Build settings from `COCOINDEX_*` environment variables.

        A database spec is created only when `COCOINDEX_DATABASE_URL` is set.
        Global execution options are always created, with unset variables
        falling back to the dataclass defaults.
        """
        database: DatabaseConnectionSpec | None = None
        database_url = os.getenv("COCOINDEX_DATABASE_URL")
        if database_url is not None:
            db_kwargs: dict[str, Any] = {"url": database_url}
            # (field name, environment variable, parser) for each optional field.
            db_fields = (
                ("user", "COCOINDEX_DATABASE_USER", None),
                ("password", "COCOINDEX_DATABASE_PASSWORD", None),
                ("max_connections", "COCOINDEX_DATABASE_MAX_CONNECTIONS", int),
                ("min_connections", "COCOINDEX_DATABASE_MIN_CONNECTIONS", int),
            )
            for field_name, env_name, parser in db_fields:
                _load_field(db_kwargs, field_name, env_name, parse=parser)
            database = DatabaseConnectionSpec(**db_kwargs)

        exec_kwargs: dict[str, Any] = {}
        exec_fields = (
            ("source_max_inflight_rows", "COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS"),
            ("source_max_inflight_bytes", "COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES"),
        )
        for field_name, env_name in exec_fields:
            _load_field(exec_kwargs, field_name, env_name, parse=int)

        return cls(
            database=database,
            app_namespace=os.getenv("COCOINDEX_APP_NAMESPACE", ""),
            global_execution_options=GlobalExecutionOptions(**exec_kwargs),
        )
129
+
130
+
131
+ @dataclass
132
+ class ServerSettings:
133
+ """Settings for the cocoindex server."""
134
+
135
+ # The address to bind the server to.
136
+ address: str = "127.0.0.1:49344"
137
+
138
+ # The origins of the clients (e.g. CocoInsight UI) to allow CORS from.
139
+ cors_origins: list[str] | None = None
140
+
141
+ @classmethod
142
+ def from_env(cls) -> Self:
143
+ """Load settings from environment variables."""
144
+ kwargs: dict[str, Any] = dict()
145
+ _load_field(kwargs, "address", "COCOINDEX_SERVER_ADDRESS")
146
+ _load_field(
147
+ kwargs,
148
+ "cors_origins",
149
+ "COCOINDEX_SERVER_CORS_ORIGINS",
150
+ parse=ServerSettings.parse_cors_origins,
151
+ )
152
+ return cls(**kwargs)
153
+
154
+ @overload
155
+ @staticmethod
156
+ def parse_cors_origins(s: str) -> list[str]: ...
157
+
158
+ @overload
159
+ @staticmethod
160
+ def parse_cors_origins(s: str | None) -> list[str] | None: ...
161
+
162
+ @staticmethod
163
+ def parse_cors_origins(s: str | None) -> list[str] | None:
164
+ """
165
+ Parse the CORS origins from a string.
166
+ """
167
+ return (
168
+ [o for e in s.split(",") if (o := e.strip()) != ""]
169
+ if s is not None
170
+ else None
171
+ )
cocoindex/setup.py ADDED
@@ -0,0 +1,92 @@
1
+ """
2
+ This module provides APIs to manage the setup of flows.
3
+ """
4
+
5
+ from . import setting
6
+ from . import _engine # type: ignore
7
+ from .runtime import execution_context
8
+
9
+
10
class SetupChangeBundle:
    """
    A bundle of setup changes, wrapping the engine's bundle object.

    Each operation has a blocking form and an `_async` form; the blocking
    forms run the async ones through `execution_context`.
    """

    _engine_bundle: _engine.SetupChangeBundle

    def __init__(self, _engine_bundle: _engine.SetupChangeBundle):
        self._engine_bundle = _engine_bundle

    def __str__(self) -> str:
        # Only the textual description is needed; the second element is dropped.
        description, _ = execution_context.run(self._engine_bundle.describe_async())
        return description  # type: ignore

    def __repr__(self) -> str:
        return self.__str__()

    def apply(self, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes, blocking until done.
        """
        pending = self.apply_async(report_to_stdout=report_to_stdout)
        execution_context.run(pending)

    async def apply_async(self, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes. Async version of `apply`.
        """
        await self._engine_bundle.apply_async(report_to_stdout=report_to_stdout)

    def describe(self) -> tuple[str, bool]:
        """
        Describe the setup changes.

        Returns a `(description, is_up_to_date)` pair.
        """
        return execution_context.run(self.describe_async())  # type: ignore

    async def describe_async(self) -> tuple[str, bool]:
        """
        Describe the setup changes. Async version of `describe`.
        """
        return await self._engine_bundle.describe_async()  # type: ignore

    def describe_and_apply(self, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes, printing a description first when
        `report_to_stdout` is true. See `describe_and_apply_async`.
        """
        pending = self.describe_and_apply_async(report_to_stdout=report_to_stdout)
        execution_context.run(pending)

    async def describe_and_apply_async(self, *, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes. Async version of `describe_and_apply`.

        When `report_to_stdout` is true, the description is printed first, and
        nothing is applied if the setup is already up to date. Otherwise the
        changes are applied silently.
        """
        if report_to_stdout:
            description, up_to_date = await self.describe_async()
            print("Setup status:\n")
            print(description)
            if up_to_date:
                print("No setup changes to apply.")
                return
        await self.apply_async(report_to_stdout=report_to_stdout)
73
+
74
+
75
def flow_names_with_setup() -> list[str]:
    """
    Return the names of all flows that have been set up.

    Blocking wrapper around `flow_names_with_setup_async`.
    """
    names = execution_context.run(flow_names_with_setup_async())
    return names  # type: ignore
80
+
81
+
82
async def flow_names_with_setup_async() -> list[str]:
    """
    Get the names of all flows that have been setup. Async version of `flow_names_with_setup`.

    Only flows whose namespace prefix (the part before the first ".") equals
    the current app namespace are returned, with that prefix removed.
    """
    all_flow_names = await _engine.flow_names_with_setup_async()
    # Look the namespace up once instead of on every iteration.
    current_namespace = setting.get_app_namespace()
    result = []
    for full_name in all_flow_names:
        namespace, flow_name = setting.split_app_namespace(full_name, ".")
        if namespace == current_namespace:
            result.append(flow_name)
    return result
@@ -0,0 +1,5 @@
1
+ """
2
+ Sources supported by CocoIndex.
3
+ """
4
+
5
+ from ._engine_builtin_specs import *
@@ -0,0 +1,120 @@
1
+ """All builtin sources."""
2
+
3
+ from .. import op
4
+ from ..auth_registry import TransientAuthEntryReference
5
+ from ..setting import DatabaseConnectionSpec
6
+ from dataclasses import dataclass
7
+ import datetime
8
+
9
+
10
class LocalFile(op.SourceSpec):
    """Source spec for importing data from the local file system."""

    _op_category = op.OpCategory.SOURCE

    path: str
    binary: bool = False

    # When set, only files matching these patterns are included.
    # Pattern syntax: https://docs.rs/globset/latest/globset/index.html#syntax
    included_patterns: list[str] | None = None

    # When set, files matching these patterns are excluded.
    # Pattern syntax: https://docs.rs/globset/latest/globset/index.html#syntax
    excluded_patterns: list[str] | None = None

    # When set, files larger than this many bytes are treated as non-existent.
    max_file_size: int | None = None
28
+
29
+
30
class GoogleDrive(op.SourceSpec):
    """Source spec for importing data from Google Drive."""

    _op_category = op.OpCategory.SOURCE

    service_account_credential_path: str
    root_folder_ids: list[str]
    binary: bool = False
    # NOTE(review): presumably how often recent changes are polled; confirm against the engine.
    recent_changes_poll_interval: datetime.timedelta | None = None
39
+
40
+
41
@dataclass
class RedisNotification:
    """Configuration of Redis pub/sub used for event notifications."""

    # URL of the Redis server, e.g. "redis://localhost:6379".
    redis_url: str
    # Name of the Redis channel carrying the pub/sub notifications.
    redis_channel: str
49
+
50
+
51
class AmazonS3(op.SourceSpec):
    """
    Source spec for importing data from an Amazon S3 bucket.

    An optional prefix and glob-pattern based file filtering are supported.
    """

    _op_category = op.OpCategory.SOURCE

    bucket_name: str
    prefix: str | None = None
    binary: bool = False
    included_patterns: list[str] | None = None
    excluded_patterns: list[str] | None = None
    max_file_size: int | None = None
    # NOTE(review): presumably an SQS queue delivering change events; confirm against the engine.
    sqs_queue_url: str | None = None
    # Optional Redis pub/sub configuration for event notifications.
    redis: RedisNotification | None = None
64
+
65
+
66
class AzureBlob(op.SourceSpec):
    """
    Source spec for importing data from an Azure Blob Storage container.

    An optional prefix and glob-pattern based file filtering are supported.

    Authentication mechanisms are tried in this order:
    - SAS token (if provided)
    - Account access key (if provided)
    - Default Azure credential
    """

    _op_category = op.OpCategory.SOURCE

    account_name: str
    container_name: str
    prefix: str | None = None
    binary: bool = False
    included_patterns: list[str] | None = None
    excluded_patterns: list[str] | None = None

    # Optional credentials, held as transient auth entry references.
    sas_token: TransientAuthEntryReference[str] | None = None
    account_access_key: TransientAuthEntryReference[str] | None = None
87
+
88
+
89
+ @dataclass
90
+ class PostgresNotification:
91
+ """Notification for a PostgreSQL table."""
92
+
93
+ # Optional: name of the PostgreSQL channel to use.
94
+ # If not provided, will generate a default channel name.
95
+ channel_name: str | None = None
96
+
97
+
98
class Postgres(op.SourceSpec):
    """Source spec for importing data from a PostgreSQL table."""

    _op_category = op.OpCategory.SOURCE

    # Name of the table to read from (required).
    table_name: str

    # Optional database connection reference; the default is used when omitted.
    database: TransientAuthEntryReference[DatabaseConnectionSpec] | None = None

    # Optional list of columns to include; None includes all columns.
    included_columns: list[str] | None = None

    # Optional column used for ordinal tracking (for incremental updates).
    # Should be a timestamp, serial, or other incrementing column.
    ordinal_column: str | None = None

    # Optional; when set, supports change capture from PostgreSQL notification.
    notification: PostgresNotification | None = None

    # Optional row filter: an arbitrary SQL boolean expression.
    filter: str | None = None