cocoindex 0.3.4__cp311-abi3-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex/__init__.py +114 -0
- cocoindex/_engine.abi3.so +0 -0
- cocoindex/auth_registry.py +44 -0
- cocoindex/cli.py +830 -0
- cocoindex/engine_object.py +214 -0
- cocoindex/engine_value.py +550 -0
- cocoindex/flow.py +1281 -0
- cocoindex/functions/__init__.py +40 -0
- cocoindex/functions/_engine_builtin_specs.py +66 -0
- cocoindex/functions/colpali.py +247 -0
- cocoindex/functions/sbert.py +77 -0
- cocoindex/index.py +50 -0
- cocoindex/lib.py +75 -0
- cocoindex/llm.py +47 -0
- cocoindex/op.py +1047 -0
- cocoindex/py.typed +0 -0
- cocoindex/query_handler.py +57 -0
- cocoindex/runtime.py +78 -0
- cocoindex/setting.py +171 -0
- cocoindex/setup.py +92 -0
- cocoindex/sources/__init__.py +5 -0
- cocoindex/sources/_engine_builtin_specs.py +120 -0
- cocoindex/subprocess_exec.py +277 -0
- cocoindex/targets/__init__.py +5 -0
- cocoindex/targets/_engine_builtin_specs.py +153 -0
- cocoindex/targets/lancedb.py +466 -0
- cocoindex/tests/__init__.py +0 -0
- cocoindex/tests/test_engine_object.py +331 -0
- cocoindex/tests/test_engine_value.py +1724 -0
- cocoindex/tests/test_optional_database.py +249 -0
- cocoindex/tests/test_transform_flow.py +300 -0
- cocoindex/tests/test_typing.py +553 -0
- cocoindex/tests/test_validation.py +134 -0
- cocoindex/typing.py +834 -0
- cocoindex/user_app_loader.py +53 -0
- cocoindex/utils.py +20 -0
- cocoindex/validation.py +104 -0
- cocoindex-0.3.4.dist-info/METADATA +288 -0
- cocoindex-0.3.4.dist-info/RECORD +42 -0
- cocoindex-0.3.4.dist-info/WHEEL +4 -0
- cocoindex-0.3.4.dist-info/entry_points.txt +2 -0
- cocoindex-0.3.4.dist-info/licenses/THIRD_PARTY_NOTICES.html +13249 -0
cocoindex/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import numpy as np
|
|
3
|
+
from numpy import typing as npt
|
|
4
|
+
from typing import Generic, Any
|
|
5
|
+
from .index import VectorSimilarityMetric
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
if sys.version_info >= (3, 13):
|
|
9
|
+
from typing import TypeVar
|
|
10
|
+
else:
|
|
11
|
+
from typing_extensions import TypeVar # PEP 696 backport
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclasses.dataclass
|
|
15
|
+
class QueryHandlerResultFields:
|
|
16
|
+
"""
|
|
17
|
+
Specify field names in query results returned by the query handler.
|
|
18
|
+
This provides metadata for tools like CocoInsight to recognize structure of the query results.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
embedding: list[str] = dataclasses.field(default_factory=list)
|
|
22
|
+
score: str | None = None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclasses.dataclass
class QueryHandlerInfo:
    """
    Configuration attached to a query handler.
    """

    # Optional description of the field names found in the handler's results.
    result_fields: QueryHandlerResultFields | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclasses.dataclass
class QueryInfo:
    """
    Metadata about a query.
    """

    # Query embedding, given either as a list of floats or as a float32 NumPy array.
    embedding: list[float] | npt.NDArray[np.float32] | None = None
    # Vector similarity metric associated with the query, when known.
    similarity_metric: VectorSimilarityMetric | None = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Type of a single result row; falls back to `Any` when `QueryOutput` is not parameterized.
R = TypeVar("R", default=Any)


@dataclasses.dataclass
class QueryOutput(Generic[R]):
    """
    What a query handler returns.

    results: the result rows; each row may be a dict or a dataclass instance.
    query_info: metadata about the query; a default `QueryInfo` is created when omitted.
    """

    results: list[R]
    query_info: QueryInfo = dataclasses.field(default_factory=QueryInfo)
|
cocoindex/runtime.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module provides a standalone execution runtime for executing coroutines in a thread-safe
|
|
3
|
+
manner.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import threading
|
|
7
|
+
import asyncio
|
|
8
|
+
import inspect
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from typing import Any, Callable, Awaitable, TypeVar, Coroutine
|
|
12
|
+
|
|
13
|
+
T = TypeVar("T")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _ExecutionContext:
|
|
17
|
+
_lock: threading.Lock
|
|
18
|
+
_event_loop: asyncio.AbstractEventLoop | None = None
|
|
19
|
+
|
|
20
|
+
def __init__(self) -> None:
|
|
21
|
+
self._lock = threading.Lock()
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def event_loop(self) -> asyncio.AbstractEventLoop:
|
|
25
|
+
"""Get the event loop for the cocoindex library."""
|
|
26
|
+
with self._lock:
|
|
27
|
+
if self._event_loop is None:
|
|
28
|
+
loop = asyncio.new_event_loop()
|
|
29
|
+
self._event_loop = loop
|
|
30
|
+
|
|
31
|
+
def _runner(l: asyncio.AbstractEventLoop) -> None:
|
|
32
|
+
asyncio.set_event_loop(l)
|
|
33
|
+
l.run_forever()
|
|
34
|
+
|
|
35
|
+
threading.Thread(target=_runner, args=(loop,), daemon=True).start()
|
|
36
|
+
return self._event_loop
|
|
37
|
+
|
|
38
|
+
def run(self, coro: Coroutine[Any, Any, T]) -> T:
|
|
39
|
+
"""Run a coroutine in the event loop, blocking until it finishes. Return its result."""
|
|
40
|
+
try:
|
|
41
|
+
running_loop = asyncio.get_running_loop()
|
|
42
|
+
except RuntimeError:
|
|
43
|
+
running_loop = None
|
|
44
|
+
|
|
45
|
+
loop = self.event_loop
|
|
46
|
+
|
|
47
|
+
if running_loop is not None:
|
|
48
|
+
if running_loop is loop:
|
|
49
|
+
raise RuntimeError(
|
|
50
|
+
"CocoIndex sync API was called from inside CocoIndex's async context. "
|
|
51
|
+
"Use the async variant of this method instead."
|
|
52
|
+
)
|
|
53
|
+
warnings.warn(
|
|
54
|
+
"CocoIndex sync API was called inside an existing event loop. "
|
|
55
|
+
"This may block other tasks. Prefer the async method.",
|
|
56
|
+
RuntimeWarning,
|
|
57
|
+
stacklevel=2,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
fut = asyncio.run_coroutine_threadsafe(coro, loop)
|
|
61
|
+
try:
|
|
62
|
+
return fut.result()
|
|
63
|
+
except KeyboardInterrupt:
|
|
64
|
+
fut.cancel()
|
|
65
|
+
raise
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
execution_context = _ExecutionContext()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def to_async_call(call: Callable[..., Any]) -> Callable[..., Awaitable[Any]]:
    """
    Return an awaitable-returning version of `call`.

    If `call` (or the function wrapped by a `staticmethod`/`classmethod` object)
    is a coroutine function, `call` itself is returned unchanged. Otherwise a
    wrapper is returned that runs `call` in a worker thread via `asyncio.to_thread`.
    """
    # Look through staticmethod/classmethod wrappers to inspect the real function.
    is_method_wrapper = isinstance(call, (staticmethod, classmethod))
    inspected = call.__func__ if is_method_wrapper else call
    if inspect.iscoroutinefunction(inspected):
        return call

    def _in_thread(*args: Any, **kwargs: Any) -> Awaitable[Any]:
        return asyncio.to_thread(call, *args, **kwargs)

    return _in_thread
|
cocoindex/setting.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data types for settings of the cocoindex library.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from typing import Callable, Self, Any, overload
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from . import _engine # type: ignore
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_app_namespace(*, trailing_delimiter: str | None = None) -> str:
    """
    Return the application namespace reported by the engine.

    `trailing_delimiter` is appended only when it is given and the namespace is
    non-empty; otherwise the namespace is returned as is.
    """
    namespace: str = _engine.get_app_namespace()
    if trailing_delimiter is not None and namespace != "":
        return f"{namespace}{trailing_delimiter}"
    return namespace
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def split_app_namespace(full_name: str, delimiter: str) -> tuple[str, str]:
    """
    Split `full_name` at the first `delimiter` into (application namespace, rest).

    When `full_name` contains no delimiter, the namespace is the empty string and
    the whole name is returned as the rest.
    """
    # A single split yields either [rest] or [namespace, rest].
    *namespace_part, remainder = full_name.split(delimiter, 1)
    namespace = namespace_part[0] if namespace_part else ""
    return namespace, remainder
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
|
|
29
|
+
class DatabaseConnectionSpec:
|
|
30
|
+
"""
|
|
31
|
+
Connection spec for relational database.
|
|
32
|
+
Used by both internal and target storage.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
url: str
|
|
36
|
+
user: str | None = None
|
|
37
|
+
password: str | None = None
|
|
38
|
+
max_connections: int = 25
|
|
39
|
+
min_connections: int = 5
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
|
|
43
|
+
class GlobalExecutionOptions:
|
|
44
|
+
"""Global execution options."""
|
|
45
|
+
|
|
46
|
+
# The maximum number of concurrent inflight requests, shared among all sources from all flows.
|
|
47
|
+
source_max_inflight_rows: int | None = 1024
|
|
48
|
+
source_max_inflight_bytes: int | None = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _load_field(
|
|
52
|
+
target: dict[str, Any],
|
|
53
|
+
name: str,
|
|
54
|
+
env_name: str,
|
|
55
|
+
required: bool = False,
|
|
56
|
+
parse: Callable[[str], Any] | None = None,
|
|
57
|
+
) -> None:
|
|
58
|
+
value = os.getenv(env_name)
|
|
59
|
+
if value is None:
|
|
60
|
+
if required:
|
|
61
|
+
raise ValueError(f"{env_name} is not set")
|
|
62
|
+
else:
|
|
63
|
+
if parse is None:
|
|
64
|
+
target[name] = value
|
|
65
|
+
else:
|
|
66
|
+
try:
|
|
67
|
+
target[name] = parse(value)
|
|
68
|
+
except Exception as e:
|
|
69
|
+
raise ValueError(
|
|
70
|
+
f"failed to parse environment variable {env_name}: {value}"
|
|
71
|
+
) from e
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class Settings:
    """Settings for the cocoindex library."""

    database: DatabaseConnectionSpec | None = None
    app_namespace: str = ""
    global_execution_options: GlobalExecutionOptions | None = None

    @classmethod
    def from_env(cls) -> Self:
        """
        Build settings from `COCOINDEX_*` environment variables.

        A database spec is created only when `COCOINDEX_DATABASE_URL` is set;
        execution options are always created, using defaults for unset variables.
        """
        database: DatabaseConnectionSpec | None = None
        url = os.getenv("COCOINDEX_DATABASE_URL")
        if url is not None:
            db_fields: dict[str, Any] = {"url": url}
            # (field name, environment variable, parser) for each optional setting.
            db_env_vars: tuple[tuple[str, str, Callable[[str], Any] | None], ...] = (
                ("user", "COCOINDEX_DATABASE_USER", None),
                ("password", "COCOINDEX_DATABASE_PASSWORD", None),
                ("max_connections", "COCOINDEX_DATABASE_MAX_CONNECTIONS", int),
                ("min_connections", "COCOINDEX_DATABASE_MIN_CONNECTIONS", int),
            )
            for field_name, env_name, parser in db_env_vars:
                _load_field(db_fields, field_name, env_name, parse=parser)
            database = DatabaseConnectionSpec(**db_fields)

        exec_fields: dict[str, Any] = {}
        for field_name, env_name in (
            ("source_max_inflight_rows", "COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS"),
            ("source_max_inflight_bytes", "COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES"),
        ):
            _load_field(exec_fields, field_name, env_name, parse=int)
        execution_options = GlobalExecutionOptions(**exec_fields)

        namespace = os.getenv("COCOINDEX_APP_NAMESPACE", "")

        return cls(
            database=database,
            app_namespace=namespace,
            global_execution_options=execution_options,
        )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@dataclass
|
|
132
|
+
class ServerSettings:
|
|
133
|
+
"""Settings for the cocoindex server."""
|
|
134
|
+
|
|
135
|
+
# The address to bind the server to.
|
|
136
|
+
address: str = "127.0.0.1:49344"
|
|
137
|
+
|
|
138
|
+
# The origins of the clients (e.g. CocoInsight UI) to allow CORS from.
|
|
139
|
+
cors_origins: list[str] | None = None
|
|
140
|
+
|
|
141
|
+
@classmethod
|
|
142
|
+
def from_env(cls) -> Self:
|
|
143
|
+
"""Load settings from environment variables."""
|
|
144
|
+
kwargs: dict[str, Any] = dict()
|
|
145
|
+
_load_field(kwargs, "address", "COCOINDEX_SERVER_ADDRESS")
|
|
146
|
+
_load_field(
|
|
147
|
+
kwargs,
|
|
148
|
+
"cors_origins",
|
|
149
|
+
"COCOINDEX_SERVER_CORS_ORIGINS",
|
|
150
|
+
parse=ServerSettings.parse_cors_origins,
|
|
151
|
+
)
|
|
152
|
+
return cls(**kwargs)
|
|
153
|
+
|
|
154
|
+
@overload
|
|
155
|
+
@staticmethod
|
|
156
|
+
def parse_cors_origins(s: str) -> list[str]: ...
|
|
157
|
+
|
|
158
|
+
@overload
|
|
159
|
+
@staticmethod
|
|
160
|
+
def parse_cors_origins(s: str | None) -> list[str] | None: ...
|
|
161
|
+
|
|
162
|
+
@staticmethod
|
|
163
|
+
def parse_cors_origins(s: str | None) -> list[str] | None:
|
|
164
|
+
"""
|
|
165
|
+
Parse the CORS origins from a string.
|
|
166
|
+
"""
|
|
167
|
+
return (
|
|
168
|
+
[o for e in s.split(",") if (o := e.strip()) != ""]
|
|
169
|
+
if s is not None
|
|
170
|
+
else None
|
|
171
|
+
)
|
cocoindex/setup.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module provides APIs to manage the setup of flows.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from . import setting
|
|
6
|
+
from . import _engine # type: ignore
|
|
7
|
+
from .runtime import execution_context
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SetupChangeBundle:
    """
    A bundle of setup changes, backed by the engine's `SetupChangeBundle`.

    Every operation has a blocking form and an `*_async` form; the blocking forms
    run the async ones through the shared `execution_context`.
    """

    _engine_bundle: _engine.SetupChangeBundle

    def __init__(self, _engine_bundle: _engine.SetupChangeBundle):
        self._engine_bundle = _engine_bundle

    def __str__(self) -> str:
        # Only the textual description is used; the second element of the pair is dropped.
        description, _ = execution_context.run(self._engine_bundle.describe_async())
        return description  # type: ignore

    def __repr__(self) -> str:
        return self.__str__()

    def apply(self, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes, blocking until done.
        """
        pending = self.apply_async(report_to_stdout=report_to_stdout)
        execution_context.run(pending)

    async def apply_async(self, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes. Async version of `apply`.
        """
        await self._engine_bundle.apply_async(report_to_stdout=report_to_stdout)

    def describe(self) -> tuple[str, bool]:
        """
        Describe the setup changes, blocking until done.

        Returns the pair produced by `describe_async`.
        """
        return execution_context.run(self.describe_async())  # type: ignore

    async def describe_async(self) -> tuple[str, bool]:
        """
        Describe the setup changes. Async version of `describe`.

        Returns a `(description, flag)` pair from the engine; `describe_and_apply_async`
        treats the flag as "already up to date".
        """
        return await self._engine_bundle.describe_async()  # type: ignore

    def describe_and_apply(self, report_to_stdout: bool = False) -> None:
        """
        Apply the setup changes, optionally describing them first.

        With `report_to_stdout` set, the setup status is printed and nothing is
        applied when everything is already up to date. Without it, the changes
        are applied silently.
        """
        pending = self.describe_and_apply_async(report_to_stdout=report_to_stdout)
        execution_context.run(pending)

    async def describe_and_apply_async(self, *, report_to_stdout: bool = False) -> None:
        """
        Async version of `describe_and_apply`.

        With `report_to_stdout` set, the setup status is printed and nothing is
        applied when everything is already up to date. Without it, the changes
        are applied silently.
        """
        if report_to_stdout:
            description, up_to_date = await self.describe_async()
            print("Setup status:\n")
            print(description)
            if up_to_date:
                print("No setup changes to apply.")
                return
        await self.apply_async(report_to_stdout=report_to_stdout)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def flow_names_with_setup() -> list[str]:
    """
    Return the names of all flows that have been set up.

    Blocking wrapper around `flow_names_with_setup_async`.
    """
    pending = flow_names_with_setup_async()
    return execution_context.run(pending)  # type: ignore
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
async def flow_names_with_setup_async() -> list[str]:
    """
    Get the names of all flows that have been set up. Async version of `flow_names_with_setup`.

    Only flows whose namespace prefix (the part before the first ".") equals the
    current application namespace are returned, with that prefix removed.
    """
    all_flow_names = await _engine.flow_names_with_setup_async()
    # The application namespace does not depend on the flow being examined, so
    # query the engine once instead of once per flow name.
    current_namespace = setting.get_app_namespace()
    split_names = (
        setting.split_app_namespace(full_name, ".") for full_name in all_flow_names
    )
    return [
        flow_name
        for namespace, flow_name in split_names
        if namespace == current_namespace
    ]
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""All builtin sources."""
|
|
2
|
+
|
|
3
|
+
from .. import op
|
|
4
|
+
from ..auth_registry import TransientAuthEntryReference
|
|
5
|
+
from ..setting import DatabaseConnectionSpec
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
import datetime
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LocalFile(op.SourceSpec):
    """Source spec for importing data from the local file system."""

    _op_category = op.OpCategory.SOURCE

    # Path to import from.
    path: str
    # Binary mode flag; off by default.
    binary: bool = False

    # When set, only files matching these glob patterns are included.
    # Pattern syntax: https://docs.rs/globset/latest/globset/index.html#syntax
    included_patterns: list[str] | None = None

    # When set, files matching these glob patterns are excluded.
    # Pattern syntax: https://docs.rs/globset/latest/globset/index.html#syntax
    excluded_patterns: list[str] | None = None

    # When set, files larger than this many bytes are treated as non-existent.
    max_file_size: int | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GoogleDrive(op.SourceSpec):
    """Source spec for importing data from Google Drive."""

    _op_category = op.OpCategory.SOURCE

    # Path to the service account credential file.
    service_account_credential_path: str
    # IDs of the root folders to import from.
    root_folder_ids: list[str]
    # Binary mode flag; off by default.
    binary: bool = False
    # Optional interval for polling recent changes (semantics defined by the engine).
    recent_changes_poll_interval: datetime.timedelta | None = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class RedisNotification:
    """Configuration of Redis pub/sub used for event notifications."""

    # URL of the Redis server, e.g. "redis://localhost:6379".
    redis_url: str
    # Name of the Redis channel carrying the pub/sub notifications.
    redis_channel: str
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class AmazonS3(op.SourceSpec):
    """
    Source spec for importing data from an Amazon S3 bucket.

    An optional key prefix and glob-pattern based file filtering are supported.
    """

    _op_category = op.OpCategory.SOURCE

    # Name of the bucket to import from.
    bucket_name: str
    # Optional key prefix.
    prefix: str | None = None
    # Binary mode flag; off by default.
    binary: bool = False
    # Optional glob patterns selecting which files to include / exclude.
    included_patterns: list[str] | None = None
    excluded_patterns: list[str] | None = None
    # Optional file size limit (presumably in bytes, as for LocalFile; confirm).
    max_file_size: int | None = None
    # Optional SQS queue URL (presumably a change-notification source; confirm against the engine).
    sqs_queue_url: str | None = None
    # Optional Redis pub/sub notification configuration.
    redis: RedisNotification | None = None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class AzureBlob(op.SourceSpec):
    """
    Source spec for importing data from an Azure Blob Storage container.

    An optional prefix and glob-pattern based file filtering are supported.

    Authentication mechanisms are tried in this order:
    - SAS token (if provided)
    - Account access key (if provided)
    - Default Azure credential
    """

    _op_category = op.OpCategory.SOURCE

    # Storage account and container to import from.
    account_name: str
    container_name: str
    # Optional blob name prefix.
    prefix: str | None = None
    # Binary mode flag; off by default.
    binary: bool = False
    # Optional glob patterns selecting which files to include / exclude.
    included_patterns: list[str] | None = None
    excluded_patterns: list[str] | None = None

    # Optional credentials, passed as transient auth entry references.
    sas_token: TransientAuthEntryReference[str] | None = None
    account_access_key: TransientAuthEntryReference[str] | None = None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
|
|
90
|
+
class PostgresNotification:
|
|
91
|
+
"""Notification for a PostgreSQL table."""
|
|
92
|
+
|
|
93
|
+
# Optional: name of the PostgreSQL channel to use.
|
|
94
|
+
# If not provided, will generate a default channel name.
|
|
95
|
+
channel_name: str | None = None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Postgres(op.SourceSpec):
    """Source spec for importing data from a PostgreSQL table."""

    _op_category = op.OpCategory.SOURCE

    # Name of the table to read from (required).
    table_name: str

    # Optional reference to the database connection; the default is used when unset.
    database: TransientAuthEntryReference[DatabaseConnectionSpec] | None = None

    # Optional list of columns to include; all columns are included when None.
    included_columns: list[str] | None = None

    # Optional name of the column used for ordinal tracking (for incremental updates).
    # It should be a timestamp, serial, or other incrementing column.
    ordinal_column: str | None = None

    # Optional; when set, change capture from PostgreSQL notification is supported.
    notification: PostgresNotification | None = None

    # Optional row filter, given as an arbitrary SQL boolean expression.
    filter: str | None = None
|