duckbricks-utils 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.3
2
+ Name: duckbricks-utils
3
+ Version: 0.1.0
4
+ Summary: DuckLake connection utilities for DuckBricks notebooks and pipelines
5
+ Author: DuckBricks Team
6
+ Requires-Python: >=3.11,<4.0
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: duckdb (>=1.3.0)
12
+ Requires-Dist: python-dotenv (>=1.0)
13
+ Description-Content-Type: text/markdown
14
+
15
+ # duckbricks-utils
16
+
17
+ DuckLake connection utilities for DuckBricks notebooks and pipelines.
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ # From local path (development)
23
+ pip install /path/to/duckbricks-utils
24
+
25
+ # Once published to PyPI
26
+ pip install duckbricks-utils
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ```python
32
+ from duckbricks_utils import connect
33
+
34
+ conn = connect()
35
+ result = conn.execute("SELECT * FROM my_table LIMIT 10").df()
36
+ ```
37
+
38
+ ## Configuration
39
+
40
+ Connection settings are read from environment variables:
41
+
42
+ | Variable | Default | Description |
43
+ |---|---|---|
44
+ | `DUCKLAKE_PG_HOST` | `localhost` | PostgreSQL host |
45
+ | `DUCKLAKE_PG_PORT` | `5432` | PostgreSQL port |
46
+ | `DUCKLAKE_PG_DATABASE` | `duckbricks` | PostgreSQL database name |
47
+ | `DUCKLAKE_PG_USER` | `duckbricks` | PostgreSQL user |
48
+ | `DUCKLAKE_PG_PASSWORD` | `duckbricks` | PostgreSQL password |
49
+ | `DUCKBRICKS_DUCKLAKE_NAME` | `duckbricks` | DuckLake catalog name |
50
+ | `DUCKBRICKS_DATA_PATH` | `/data/parquet/` | Parquet storage path |
+ | `DUCKBRICKS_STORAGE_BACKEND` | `local` | Storage backend (`local`, `s3`, `minio`, `r2`, `gcs`, `azure`) |
51
+
@@ -0,0 +1,36 @@
1
+ # duckbricks-utils
2
+
3
+ DuckLake connection utilities for DuckBricks notebooks and pipelines.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ # From local path (development)
9
+ pip install /path/to/duckbricks-utils
10
+
11
+ # Once published to PyPI
12
+ pip install duckbricks-utils
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```python
18
+ from duckbricks_utils import connect
19
+
20
+ conn = connect()
21
+ result = conn.execute("SELECT * FROM my_table LIMIT 10").df()
22
+ ```
23
+
24
+ ## Configuration
25
+
26
+ Connection settings are read from environment variables:
27
+
28
+ | Variable | Default | Description |
29
+ |---|---|---|
30
+ | `DUCKLAKE_PG_HOST` | `localhost` | PostgreSQL host |
31
+ | `DUCKLAKE_PG_PORT` | `5432` | PostgreSQL port |
32
+ | `DUCKLAKE_PG_DATABASE` | `duckbricks` | PostgreSQL database name |
33
+ | `DUCKLAKE_PG_USER` | `duckbricks` | PostgreSQL user |
34
+ | `DUCKLAKE_PG_PASSWORD` | `duckbricks` | PostgreSQL password |
35
+ | `DUCKBRICKS_DUCKLAKE_NAME` | `duckbricks` | DuckLake catalog name |
36
+ | `DUCKBRICKS_DATA_PATH` | `/data/parquet/` | Parquet storage path |
+ | `DUCKBRICKS_STORAGE_BACKEND` | `local` | Storage backend (`local`, `s3`, `minio`, `r2`, `gcs`, `azure`) |
@@ -0,0 +1,16 @@
1
+ """DuckBricks workspace utilities.
2
+
3
+ Provides a consistent interface for connecting to the DuckLake catalog from
4
+ any environment: local IDE, Marimo notebooks, or job executors.
5
+
6
+ Example usage:
7
+ from duckbricks_utils import connect
8
+
9
+ conn = connect()
10
+ result = conn.execute("SELECT * FROM my_table LIMIT 10").df()
11
+ """
12
+
13
+ from duckbricks_utils._connection import catalog_name, connect, data_path
14
+ from duckbricks_utils.storage import StorageBackend, StorageBackendFactory
15
+
16
+ __all__ = ["StorageBackend", "StorageBackendFactory", "catalog_name", "connect", "data_path"]
@@ -0,0 +1,63 @@
1
+ """DuckLake connection helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.factory import StorageBackendFactory
10
+
11
+
12
def _pg_dsn() -> str:
    """Assemble the libpq keyword/value DSN for the DuckLake catalog database.

    Every component is read from a ``DUCKLAKE_PG_*`` environment variable,
    falling back to the local development defaults.

    NOTE(review): values are joined verbatim; a password containing spaces or
    quotes would need libpq-style quoting — confirm before production use.
    """
    settings = {
        "host": os.getenv("DUCKLAKE_PG_HOST", "localhost"),
        "port": os.getenv("DUCKLAKE_PG_PORT", "5432"),
        "dbname": os.getenv("DUCKLAKE_PG_DATABASE", "duckbricks"),
        "user": os.getenv("DUCKLAKE_PG_USER", "duckbricks"),
        "password": os.getenv("DUCKLAKE_PG_PASSWORD", "duckbricks"),
    }
    return " ".join(f"{key}={value}" for key, value in settings.items())
19
+
20
+
21
def catalog_name() -> str:
    """Return the configured DuckLake catalog name.

    Reads ``DUCKBRICKS_DUCKLAKE_NAME`` and falls back to ``duckbricks``.
    """
    return os.environ.get("DUCKBRICKS_DUCKLAKE_NAME", "duckbricks")
24
+
25
+
26
def data_path() -> str:
    """Return the active storage backend's data path.

    Resolution is delegated to the configured ``StorageBackend`` so that
    cloud prefixes (s3://, gs://, az://) come back correctly when a
    non-local backend is selected.
    """
    backend = StorageBackendFactory.from_env()
    return backend.data_path()
33
+
34
+
35
def connect(override_data_path: str | None = None) -> duckdb.DuckDBPyConnection:
    """Return a DuckDB connection with the DuckLake catalog attached.

    The storage backend is selected by ``DUCKBRICKS_STORAGE_BACKEND``
    (default: ``local``). Each backend installs its required DuckDB
    extension and creates a named secret before the catalog is attached.

    Args:
        override_data_path: Optional path override for the DATA_PATH used
            in the ATTACH statement. Bypasses the backend's own data_path().

    Returns:
        An open DuckDB connection with the DuckLake catalog set as default.
    """
    backend = StorageBackendFactory.from_env()
    storage_path = override_data_path or backend.data_path()
    name = catalog_name()
    # Double single quotes so env-derived values (e.g. a PG password or a
    # path containing "'") cannot terminate the SQL string literals early.
    dsn = _pg_dsn().replace("'", "''")
    safe_path = storage_path.replace("'", "''")

    conn = duckdb.connect()
    backend.configure(conn)
    conn.execute("INSTALL ducklake; LOAD ducklake;")
    conn.execute("INSTALL postgres; LOAD postgres;")
    # NOTE(review): `name` is interpolated as a bare SQL identifier — assumes
    # DUCKBRICKS_DUCKLAKE_NAME is a valid unquoted identifier; confirm.
    conn.execute(
        f"ATTACH 'ducklake:postgres:{dsn}' AS {name} "
        f"(DATA_PATH '{safe_path}', AUTOMATIC_MIGRATION TRUE)"
    )
    conn.execute(f"USE {name}")
    return conn
@@ -0,0 +1,6 @@
1
+ """Storage backend abstractions for DuckLake."""
2
+
3
+ from duckbricks_utils.storage.base import StorageBackend
4
+ from duckbricks_utils.storage.factory import StorageBackendFactory
5
+
6
+ __all__ = ["StorageBackend", "StorageBackendFactory"]
@@ -0,0 +1,45 @@
1
+ """Azure Blob Storage backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.base import StorageBackend
10
+
11
+
12
class AzureStorage(StorageBackend):
    """Stores DuckLake Parquet files on Azure Blob Storage.

    Supports two authentication methods (in priority order):
    1. Connection string: ``AZURE_CONNECTION_STRING``
    2. Account + key: ``AZURE_ACCOUNT`` + ``AZURE_KEY``

    Required environment variables:
        AZURE_CONNECTION_STRING — full Azure connection string (option 1)
        AZURE_ACCOUNT — storage account name (option 2)
        AZURE_KEY — storage account key (option 2)
        DUCKBRICKS_DATA_PATH — az://container/prefix/

    Raises:
        KeyError: when option 2 is used and AZURE_ACCOUNT/AZURE_KEY are unset.
    """

    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Install the azure extension and create the storage secret on ``conn``."""
        conn.execute("INSTALL azure; LOAD azure;")
        connection_string = os.getenv("AZURE_CONNECTION_STRING")
        if connection_string:
            # Double single quotes so the credential cannot break out of the
            # SQL string literal.
            cs = connection_string.replace("'", "''")
            conn.execute(
                f"CREATE OR REPLACE SECRET duckbricks_storage ("
                f"TYPE AZURE, CONNECTION_STRING '{cs}'"
                f")"
            )
        else:
            account = os.environ["AZURE_ACCOUNT"].replace("'", "''")
            key = os.environ["AZURE_KEY"].replace("'", "''")
            conn.execute(
                f"CREATE OR REPLACE SECRET duckbricks_storage ("
                f"TYPE AZURE, ACCOUNT_NAME '{account}', ACCOUNT_KEY '{key}'"
                f")"
            )

    def data_path(self) -> str:
        """Return the az:// data path from ``DUCKBRICKS_DATA_PATH``."""
        return os.environ["DUCKBRICKS_DATA_PATH"]
@@ -0,0 +1,25 @@
1
+ """Abstract base class for DuckLake storage backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ import duckdb
8
+
9
+
10
class StorageBackend(ABC):
    """Interface for DuckLake storage providers.

    A concrete backend knows how to prepare a DuckDB connection for its
    provider (``configure`` installs extensions and creates a named secret)
    and which path prefix the DuckLake ATTACH statement should use
    (``data_path``).
    """

    @abstractmethod
    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Install required extensions and create DuckDB secrets on ``conn``."""
        ...

    @abstractmethod
    def data_path(self) -> str:
        """Return the DATA_PATH value for the DuckLake ATTACH statement."""
        ...
@@ -0,0 +1,45 @@
1
+ """Factory for resolving the active storage backend from environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ from duckbricks_utils.storage.base import StorageBackend
8
+
9
_BACKEND_REGISTRY: dict[str, str] = {
    "local": "duckbricks_utils.storage.local.LocalStorage",
    "s3": "duckbricks_utils.storage.s3.S3Storage",
    "minio": "duckbricks_utils.storage.minio.MinIOStorage",
    "r2": "duckbricks_utils.storage.r2.R2Storage",
    "gcs": "duckbricks_utils.storage.gcs.GCSStorage",
    "azure": "duckbricks_utils.storage.azure.AzureStorage",
}


class StorageBackendFactory:
    """Instantiates the ``StorageBackend`` selected via the environment.

    The ``DUCKBRICKS_STORAGE_BACKEND`` environment variable picks the
    provider (default: ``local``).
    """

    @staticmethod
    def from_env() -> StorageBackend:
        """Return a ``StorageBackend`` instance for the configured provider."""
        chosen = os.getenv("DUCKBRICKS_STORAGE_BACKEND", "local").lower()
        try:
            target = _BACKEND_REGISTRY[chosen]
        except KeyError:
            raise ValueError(
                f"Unknown storage backend '{chosen}'. "
                f"Supported values: {', '.join(_BACKEND_REGISTRY)}"
            ) from None
        # Import lazily so only the selected provider's module is loaded.
        import importlib

        module_name, _, attr = target.rpartition(".")
        backend_cls = getattr(importlib.import_module(module_name), attr)
        return backend_cls()

    @staticmethod
    def supported_backends() -> list[str]:
        """Return the list of supported backend names."""
        return [*_BACKEND_REGISTRY]
@@ -0,0 +1,32 @@
1
+ """Google Cloud Storage backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.base import StorageBackend
10
+
11
+
12
class GCSStorage(StorageBackend):
    """Stores DuckLake Parquet files on Google Cloud Storage.

    Required environment variables:
        GCS_KEY_ID — GCS HMAC access key ID
        GCS_SECRET — GCS HMAC secret
        DUCKBRICKS_DATA_PATH — gs://bucket/prefix/

    Raises:
        KeyError: if a required environment variable is missing.
    """

    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Install httpfs and create the GCS HMAC secret on ``conn``."""
        conn.execute("INSTALL httpfs; LOAD httpfs;")
        # Double single quotes so credentials cannot break out of the SQL
        # string literal (e.g. a secret containing "'").
        key_id = os.environ["GCS_KEY_ID"].replace("'", "''")
        secret = os.environ["GCS_SECRET"].replace("'", "''")
        conn.execute(
            f"CREATE OR REPLACE SECRET duckbricks_storage ("
            f"TYPE GCS, KEY_ID '{key_id}', SECRET '{secret}'"
            f")"
        )

    def data_path(self) -> str:
        """Return the gs:// data path from ``DUCKBRICKS_DATA_PATH``."""
        return os.environ["DUCKBRICKS_DATA_PATH"]
@@ -0,0 +1,23 @@
1
+ """Local filesystem storage backend (default)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.base import StorageBackend
10
+
11
+
12
class LocalStorage(StorageBackend):
    """Local-filesystem backend: stores DuckLake Parquet files on disk.

    No DuckDB extensions or secrets are required beyond ducklake itself.
    The data path comes from ``DUCKBRICKS_DATA_PATH`` (default
    ``/data/parquet/``).
    """

    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        # Nothing to install or configure for local paths.
        return None

    def data_path(self) -> str:
        """Return the local data path, defaulting to ``/data/parquet/``."""
        return os.environ.get("DUCKBRICKS_DATA_PATH", "/data/parquet/")
@@ -0,0 +1,35 @@
1
+ """MinIO storage backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.base import StorageBackend
10
+
11
+
12
class MinIOStorage(StorageBackend):
    """Stores DuckLake Parquet files on a MinIO-compatible S3 endpoint.

    Required environment variables:
        AWS_ACCESS_KEY_ID — MinIO access key
        AWS_SECRET_ACCESS_KEY — MinIO secret key
        MINIO_ENDPOINT — MinIO endpoint (e.g. http://minio:9000 or minio:9000)
        DUCKBRICKS_DATA_PATH — s3://bucket/prefix/

    Raises:
        KeyError: if a required environment variable is missing.
    """

    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Install httpfs and create the MinIO S3 secret on ``conn``."""
        conn.execute("INSTALL httpfs; LOAD httpfs;")
        # Double single quotes so credentials cannot break out of the SQL
        # string literal.
        key_id = os.environ["AWS_ACCESS_KEY_ID"].replace("'", "''")
        secret = os.environ["AWS_SECRET_ACCESS_KEY"].replace("'", "''")
        raw_endpoint = os.environ["MINIO_ENDPOINT"].rstrip("/")
        # DuckDB's S3 ENDPOINT parameter expects host[:port] without a URL
        # scheme; strip the scheme and derive USE_SSL from it instead of
        # unconditionally forcing plain HTTP.
        use_ssl = "true" if raw_endpoint.startswith("https://") else "false"
        endpoint = (
            raw_endpoint.removeprefix("https://")
            .removeprefix("http://")
            .replace("'", "''")
        )
        conn.execute(
            f"CREATE OR REPLACE SECRET duckbricks_storage ("
            f"TYPE S3, KEY_ID '{key_id}', SECRET '{secret}', "
            f"ENDPOINT '{endpoint}', USE_SSL {use_ssl}, URL_STYLE path"
            f")"
        )

    def data_path(self) -> str:
        """Return the s3:// data path from ``DUCKBRICKS_DATA_PATH``."""
        return os.environ["DUCKBRICKS_DATA_PATH"]
@@ -0,0 +1,36 @@
1
+ """Cloudflare R2 storage backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.base import StorageBackend
10
+
11
+
12
class R2Storage(StorageBackend):
    """Stores DuckLake Parquet files on Cloudflare R2.

    Required environment variables:
        AWS_ACCESS_KEY_ID — R2 access key ID
        AWS_SECRET_ACCESS_KEY — R2 secret access key
        R2_ACCOUNT_ID — Cloudflare account ID
        DUCKBRICKS_DATA_PATH — s3://bucket/prefix/

    Raises:
        KeyError: if a required environment variable is missing.
    """

    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Install httpfs and create the R2 S3-compatible secret on ``conn``."""
        conn.execute("INSTALL httpfs; LOAD httpfs;")
        # Double single quotes so credentials cannot break out of the SQL
        # string literal.
        key_id = os.environ["AWS_ACCESS_KEY_ID"].replace("'", "''")
        secret = os.environ["AWS_SECRET_ACCESS_KEY"].replace("'", "''")
        account_id = os.environ["R2_ACCOUNT_ID"].replace("'", "''")
        # DuckDB's S3 ENDPOINT expects a bare host (no scheme); HTTPS is the
        # default for S3-type secrets.
        endpoint = f"{account_id}.r2.cloudflarestorage.com"
        conn.execute(
            f"CREATE OR REPLACE SECRET duckbricks_storage ("
            f"TYPE S3, KEY_ID '{key_id}', SECRET '{secret}', "
            f"ENDPOINT '{endpoint}', REGION 'auto'"
            f")"
        )

    def data_path(self) -> str:
        """Return the s3:// data path from ``DUCKBRICKS_DATA_PATH``."""
        return os.environ["DUCKBRICKS_DATA_PATH"]
@@ -0,0 +1,34 @@
1
+ """AWS S3 storage backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ import duckdb
8
+
9
+ from duckbricks_utils.storage.base import StorageBackend
10
+
11
+
12
class S3Storage(StorageBackend):
    """Stores DuckLake Parquet files on AWS S3.

    Required environment variables:
        AWS_ACCESS_KEY_ID — AWS access key
        AWS_SECRET_ACCESS_KEY — AWS secret key
        AWS_REGION — AWS region (default: us-east-1)
        DUCKBRICKS_DATA_PATH — s3://bucket/prefix/

    Raises:
        KeyError: if a required environment variable is missing.
    """

    def configure(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Install httpfs and create the AWS S3 secret on ``conn``."""
        conn.execute("INSTALL httpfs; LOAD httpfs;")
        # Double single quotes so credentials cannot break out of the SQL
        # string literal (e.g. a secret key containing "'").
        key_id = os.environ["AWS_ACCESS_KEY_ID"].replace("'", "''")
        secret = os.environ["AWS_SECRET_ACCESS_KEY"].replace("'", "''")
        region = os.getenv("AWS_REGION", "us-east-1").replace("'", "''")
        conn.execute(
            f"CREATE OR REPLACE SECRET duckbricks_storage ("
            f"TYPE S3, KEY_ID '{key_id}', SECRET '{secret}', REGION '{region}'"
            f")"
        )

    def data_path(self) -> str:
        """Return the s3:// data path from ``DUCKBRICKS_DATA_PATH``."""
        return os.environ["DUCKBRICKS_DATA_PATH"]
@@ -0,0 +1,22 @@
1
+ [tool.poetry]
2
+ name = "duckbricks-utils"
3
+ version = "0.1.0"
4
+ description = "DuckLake connection utilities for DuckBricks notebooks and pipelines"
5
+ authors = ["DuckBricks Team"]
6
+ readme = "README.md"
7
+ packages = [{include = "duckbricks_utils"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.11"
11
+ duckdb = ">=1.3.0"
12
+ python-dotenv = ">=1.0"
13
+
14
+ [tool.poetry.group.dev.dependencies]
15
+ pytest = "*"
16
+
17
+ [tool.pytest.ini_options]
18
+ testpaths = ["tests"]
19
+
20
+ [build-system]
21
+ requires = ["poetry-core"]
22
+ build-backend = "poetry.core.masonry.api"