quackpipe 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quackpipe-0.6.1/LICENSE +21 -0
- quackpipe-0.6.1/PKG-INFO +193 -0
- quackpipe-0.6.1/README.md +155 -0
- quackpipe-0.6.1/pyproject.toml +83 -0
- quackpipe-0.6.1/setup.cfg +4 -0
- quackpipe-0.6.1/src/quackpipe/__init__.py +45 -0
- quackpipe-0.6.1/src/quackpipe/builder.py +58 -0
- quackpipe-0.6.1/src/quackpipe/cli.py +28 -0
- quackpipe-0.6.1/src/quackpipe/commands/__init__.py +0 -0
- quackpipe-0.6.1/src/quackpipe/commands/common.py +43 -0
- quackpipe-0.6.1/src/quackpipe/commands/generate_sqlmesh_config.py +85 -0
- quackpipe-0.6.1/src/quackpipe/commands/ui.py +74 -0
- quackpipe-0.6.1/src/quackpipe/config.py +35 -0
- quackpipe-0.6.1/src/quackpipe/core.py +123 -0
- quackpipe-0.6.1/src/quackpipe/etl_utils.py +110 -0
- quackpipe-0.6.1/src/quackpipe/exceptions.py +15 -0
- quackpipe-0.6.1/src/quackpipe/secrets.py +100 -0
- quackpipe-0.6.1/src/quackpipe/sources/__init__.py +3 -0
- quackpipe-0.6.1/src/quackpipe/sources/azure_blob.py +76 -0
- quackpipe-0.6.1/src/quackpipe/sources/base.py +43 -0
- quackpipe-0.6.1/src/quackpipe/sources/ducklake/__init__.py +115 -0
- quackpipe-0.6.1/src/quackpipe/sources/ducklake/providers.py +108 -0
- quackpipe-0.6.1/src/quackpipe/sources/postgres.py +68 -0
- quackpipe-0.6.1/src/quackpipe/sources/s3.py +77 -0
- quackpipe-0.6.1/src/quackpipe/sources/sqlite.py +42 -0
- quackpipe-0.6.1/src/quackpipe/test_utils/__init__.py +0 -0
- quackpipe-0.6.1/src/quackpipe/test_utils/data_fixtures.py +113 -0
- quackpipe-0.6.1/src/quackpipe/test_utils/fixtures.py +478 -0
- quackpipe-0.6.1/src/quackpipe/utils.py +59 -0
- quackpipe-0.6.1/src/quackpipe.egg-info/PKG-INFO +193 -0
- quackpipe-0.6.1/src/quackpipe.egg-info/SOURCES.txt +44 -0
- quackpipe-0.6.1/src/quackpipe.egg-info/dependency_links.txt +1 -0
- quackpipe-0.6.1/src/quackpipe.egg-info/entry_points.txt +2 -0
- quackpipe-0.6.1/src/quackpipe.egg-info/requires.txt +35 -0
- quackpipe-0.6.1/src/quackpipe.egg-info/top_level.txt +1 -0
- quackpipe-0.6.1/tests/test_azure_blob_handler.py +162 -0
- quackpipe-0.6.1/tests/test_cli.py +138 -0
- quackpipe-0.6.1/tests/test_ducklake_handler.py +156 -0
- quackpipe-0.6.1/tests/test_ducklake_integration.py +42 -0
- quackpipe-0.6.1/tests/test_e2e_ducklake_integration.py +131 -0
- quackpipe-0.6.1/tests/test_etl_utils.py +214 -0
- quackpipe-0.6.1/tests/test_postgres_handler.py +202 -0
- quackpipe-0.6.1/tests/test_quackpipe.py +347 -0
- quackpipe-0.6.1/tests/test_s3_handler.py +175 -0
- quackpipe-0.6.1/tests/test_secret_management.py +107 -0
- quackpipe-0.6.1/tests/test_sqlite_handler.py +89 -0
quackpipe-0.6.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 ekiourk consulting ltd
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the “Software”), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
quackpipe-0.6.1/PKG-INFO
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: quackpipe
|
|
3
|
+
Version: 0.6.1
|
|
4
|
+
Summary: A configuration-driven and programmatic ETL helper for DuckDB.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: pyyaml
|
|
10
|
+
Requires-Dist: duckdb>=0.9.0
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: python-dotenv
|
|
13
|
+
Requires-Dist: azure-storage-blob
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
17
|
+
Requires-Dist: quackpipe[fixtures]; extra == "dev"
|
|
18
|
+
Requires-Dist: ipdb; extra == "dev"
|
|
19
|
+
Provides-Extra: fixtures
|
|
20
|
+
Requires-Dist: testcontainers==4.10.0; extra == "fixtures"
|
|
21
|
+
Requires-Dist: sqlalchemy; extra == "fixtures"
|
|
22
|
+
Requires-Dist: testcontainers-postgres; extra == "fixtures"
|
|
23
|
+
Requires-Dist: testcontainers-minio; extra == "fixtures"
|
|
24
|
+
Requires-Dist: testcontainers-azurite; extra == "fixtures"
|
|
25
|
+
Requires-Dist: httpx; extra == "fixtures"
|
|
26
|
+
Provides-Extra: lint
|
|
27
|
+
Requires-Dist: ruff; extra == "lint"
|
|
28
|
+
Provides-Extra: logging
|
|
29
|
+
Requires-Dist: structlog>=23.0.0; extra == "logging"
|
|
30
|
+
Requires-Dist: colorlog>=6.0.0; extra == "logging"
|
|
31
|
+
Provides-Extra: postgres
|
|
32
|
+
Requires-Dist: psycopg; extra == "postgres"
|
|
33
|
+
Provides-Extra: s3
|
|
34
|
+
Requires-Dist: pyarrow; extra == "s3"
|
|
35
|
+
Provides-Extra: kafka
|
|
36
|
+
Requires-Dist: confluent-kafka; extra == "kafka"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# Quackpipe
|
|
40
|
+
|
|
41
|
+
**The missing link between your Python scripts and your data infrastructure.**
|
|
42
|
+
|
|
43
|
+
Quackpipe is a powerful ETL helper library that uses **DuckDB** to create a unified, high-performance data plane for Python applications. It bridges the gap between writing raw, complex connection code and adopting a full-scale data transformation framework.
|
|
44
|
+
|
|
45
|
+
With a simple YAML configuration, you can instantly connect to multiple data sources like **PostgreSQL**, **S3**, **Azure Blob Storage**, and **SQLite**, and even orchestrate complex **DuckLake** setups, all from a single, clean Python interface.
|
|
46
|
+
|
|
47
|
+
[](https://codecov.io/github/ekiourk/quackpipe)
|
|
48
|
+
|
|
49
|
+
## What Gap Does Quackpipe Fill?
|
|
50
|
+
|
|
51
|
+
In the modern data stack, you often face a choice:
|
|
52
|
+
|
|
53
|
+
* **Low-Level:** Write boilerplate code with multiple database drivers (`psycopg2`, `boto3`, etc.) to connect and move data manually. This is flexible but repetitive and error-prone.
|
|
54
|
+
* **High-Level:** Adopt a full DataOps framework like **SQLMesh** or **dbt**. These are powerful for building production-grade data warehouses but can be overkill for ad-hoc analysis, rapid prototyping, or simple scripting.
|
|
55
|
+
|
|
56
|
+
**Quackpipe provides the perfect middle ground.** It gives you the power of a unified query engine and the simplicity of a Python library, allowing you to:
|
|
57
|
+
|
|
58
|
+
* **Prototype Rapidly:** Spin up a multi-source data environment in seconds.
|
|
59
|
+
* **Simplify ETL Scripts:** Replace complex driver code with a single, clean `session` or a one-line `move_data` command.
|
|
60
|
+
* **Explore Data Interactively:** Use the built-in CLI to launch a web UI with all your sources pre-connected for instant ad-hoc querying.
|
|
61
|
+
* **Bridge to Production:** Automatically generate configuration for frameworks like **SQLMesh** when you're ready to graduate from a script to a versioned data model.
|
|
62
|
+
|
|
63
|
+
## Core Capabilities
|
|
64
|
+
|
|
65
|
+
* **Unified Data Access:** Query across PostgreSQL, S3, Azure, and SQLite as if they were all schemas in a single database.
|
|
66
|
+
* **Declarative Configuration:** Define all your data sources in one human-readable `config.yml` file.
|
|
67
|
+
* **Powerful ETL Utilities:** Move data between any two configured sources with the `move_data()` function.
|
|
68
|
+
* **Programmatic API:** Use the `QuackpipeBuilder` for dynamic, on-the-fly connection setups in your code.
|
|
69
|
+
* **Secure Secret Management:** Load credentials safely from `.env` files, keeping them out of your code and configuration.
|
|
70
|
+
* **Interactive UI:** Launch an interactive DuckDB web UI with all your sources pre-connected using a single CLI command.
|
|
71
|
+
* **Framework Integration:** Automatically generate a `sqlmesh_config.yml` file to seamlessly transition your project to a full DataOps framework.
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install quackpipe
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Install support for the sources you need:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Example: Install support for Postgres and S3 (Azure support is included by default)
|
|
83
|
+
pip install "quackpipe[postgres,s3]"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Configuration
|
|
87
|
+
|
|
88
|
+
`quackpipe` uses a simple `config.yml` file to define your sources and an `.env` file to manage your secrets.
|
|
89
|
+
|
|
90
|
+
### `config.yml` Example
|
|
91
|
+
|
|
92
|
+
```yaml
|
|
93
|
+
# config.yml
|
|
94
|
+
sources:
|
|
95
|
+
# A writeable PostgreSQL database.
|
|
96
|
+
pg_warehouse:
|
|
97
|
+
type: postgres
|
|
98
|
+
secret_name: "pg_prod" # See Secret Management section below
|
|
99
|
+
read_only: false # Allows writing data back to this source
|
|
100
|
+
|
|
101
|
+
# An S3 data lake for Parquet files.
|
|
102
|
+
s3_datalake:
|
|
103
|
+
type: s3
|
|
104
|
+
secret_name: "aws_prod"
|
|
105
|
+
region: "us-east-1"
|
|
106
|
+
|
|
107
|
+
# An Azure Blob Storage container.
|
|
108
|
+
azure_datalake:
|
|
109
|
+
type: azure
|
|
110
|
+
provider: connection_string
|
|
111
|
+
secret_name: "azure_prod"
|
|
112
|
+
|
|
113
|
+
# A composite DuckLake source.
|
|
114
|
+
my_lake:
|
|
115
|
+
type: ducklake
|
|
116
|
+
catalog:
|
|
117
|
+
type: sqlite
|
|
118
|
+
path: "/path/to/lake_catalog.db"
|
|
119
|
+
storage:
|
|
120
|
+
type: local
|
|
121
|
+
path: "/path/to/lake_storage/"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Secret Management with `.env`
|
|
125
|
+
|
|
126
|
+
Quackpipe uses a `secret_name` in the config to refer to a bundle of credentials. These are loaded from an `.env` file using a simple prefix convention: `SECRET_NAME_KEY`.
|
|
127
|
+
|
|
128
|
+
Create an `.env` file in your project root:
|
|
129
|
+
|
|
130
|
+
```dotenv
|
|
131
|
+
# .env
|
|
132
|
+
|
|
133
|
+
# Secrets for secret_name: "pg_prod"
|
|
134
|
+
PG_PROD_HOST=db.example.com
|
|
135
|
+
PG_PROD_USER=myuser
|
|
136
|
+
PG_PROD_PASSWORD=mypassword
|
|
137
|
+
PG_PROD_DATABASE=production
|
|
138
|
+
|
|
139
|
+
# Secrets for secret_name: "aws_prod"
|
|
140
|
+
AWS_PROD_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY
|
|
141
|
+
AWS_PROD_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_KEY
|
|
142
|
+
|
|
143
|
+
# Secrets for secret_name: "azure_prod"
|
|
144
|
+
AZURE_PROD_CONNECTION_STRING="DefaultEndpointsProtocol=https..."
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Usage Highlights
|
|
148
|
+
|
|
149
|
+
### 1. Interactive Querying with `session`
|
|
150
|
+
|
|
151
|
+
Need to join a CSV in S3 with a table in Postgres? `quackpipe` makes it trivial.
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import quackpipe
|
|
155
|
+
|
|
156
|
+
# quackpipe automatically loads your .env file
|
|
157
|
+
with quackpipe.session(config_path="config.yml", env_file=".env") as con:
|
|
158
|
+
df = con.execute("""
|
|
159
|
+
SELECT u.name, o.order_total
|
|
160
|
+
FROM pg_warehouse.users u
|
|
161
|
+
JOIN read_parquet('s3://my-bucket/orders/*.parquet') o ON u.id = o.user_id
|
|
162
|
+
WHERE u.signup_date > '2024-01-01';
|
|
163
|
+
""").fetchdf()
|
|
164
|
+
|
|
165
|
+
print(df.head())
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### 2. One-Line Data Movement with `move_data`
|
|
169
|
+
|
|
170
|
+
Archive old records from your production database to your data lake with a single command.
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from quackpipe.etl_utils import move_data
|
|
174
|
+
|
|
175
|
+
move_data(
|
|
176
|
+
config_path="config.yml",
|
|
177
|
+
env_file=".env",
|
|
178
|
+
source_query="SELECT * FROM pg_warehouse.logs WHERE timestamp < '2024-01-01'",
|
|
179
|
+
destination_name="s3_datalake",
|
|
180
|
+
table_name="logs_archive_2023"
|
|
181
|
+
)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### 3. Instant Data Exploration with the CLI
|
|
185
|
+
|
|
186
|
+
Launch a web browser UI with all your sources attached and ready for ad-hoc queries.
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
# This command reads your config.yml and .env file
|
|
190
|
+
quackpipe ui
|
|
191
|
+
|
|
192
|
+
# Or connect to specific sources
|
|
193
|
+
quackpipe ui pg_warehouse s3_datalake
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Quackpipe
|
|
2
|
+
|
|
3
|
+
**The missing link between your Python scripts and your data infrastructure.**
|
|
4
|
+
|
|
5
|
+
Quackpipe is a powerful ETL helper library that uses **DuckDB** to create a unified, high-performance data plane for Python applications. It bridges the gap between writing raw, complex connection code and adopting a full-scale data transformation framework.
|
|
6
|
+
|
|
7
|
+
With a simple YAML configuration, you can instantly connect to multiple data sources like **PostgreSQL**, **S3**, **Azure Blob Storage**, and **SQLite**, and even orchestrate complex **DuckLake** setups, all from a single, clean Python interface.
|
|
8
|
+
|
|
9
|
+
[](https://codecov.io/github/ekiourk/quackpipe)
|
|
10
|
+
|
|
11
|
+
## What Gap Does Quackpipe Fill?
|
|
12
|
+
|
|
13
|
+
In the modern data stack, you often face a choice:
|
|
14
|
+
|
|
15
|
+
* **Low-Level:** Write boilerplate code with multiple database drivers (`psycopg2`, `boto3`, etc.) to connect and move data manually. This is flexible but repetitive and error-prone.
|
|
16
|
+
* **High-Level:** Adopt a full DataOps framework like **SQLMesh** or **dbt**. These are powerful for building production-grade data warehouses but can be overkill for ad-hoc analysis, rapid prototyping, or simple scripting.
|
|
17
|
+
|
|
18
|
+
**Quackpipe provides the perfect middle ground.** It gives you the power of a unified query engine and the simplicity of a Python library, allowing you to:
|
|
19
|
+
|
|
20
|
+
* **Prototype Rapidly:** Spin up a multi-source data environment in seconds.
|
|
21
|
+
* **Simplify ETL Scripts:** Replace complex driver code with a single, clean `session` or a one-line `move_data` command.
|
|
22
|
+
* **Explore Data Interactively:** Use the built-in CLI to launch a web UI with all your sources pre-connected for instant ad-hoc querying.
|
|
23
|
+
* **Bridge to Production:** Automatically generate configuration for frameworks like **SQLMesh** when you're ready to graduate from a script to a versioned data model.
|
|
24
|
+
|
|
25
|
+
## Core Capabilities
|
|
26
|
+
|
|
27
|
+
* **Unified Data Access:** Query across PostgreSQL, S3, Azure, and SQLite as if they were all schemas in a single database.
|
|
28
|
+
* **Declarative Configuration:** Define all your data sources in one human-readable `config.yml` file.
|
|
29
|
+
* **Powerful ETL Utilities:** Move data between any two configured sources with the `move_data()` function.
|
|
30
|
+
* **Programmatic API:** Use the `QuackpipeBuilder` for dynamic, on-the-fly connection setups in your code.
|
|
31
|
+
* **Secure Secret Management:** Load credentials safely from `.env` files, keeping them out of your code and configuration.
|
|
32
|
+
* **Interactive UI:** Launch an interactive DuckDB web UI with all your sources pre-connected using a single CLI command.
|
|
33
|
+
* **Framework Integration:** Automatically generate a `sqlmesh_config.yml` file to seamlessly transition your project to a full DataOps framework.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install quackpipe
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Install support for the sources you need:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Example: Install support for Postgres and S3 (Azure support is included by default)
|
|
45
|
+
pip install "quackpipe[postgres,s3]"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Configuration
|
|
49
|
+
|
|
50
|
+
`quackpipe` uses a simple `config.yml` file to define your sources and an `.env` file to manage your secrets.
|
|
51
|
+
|
|
52
|
+
### `config.yml` Example
|
|
53
|
+
|
|
54
|
+
```yaml
|
|
55
|
+
# config.yml
|
|
56
|
+
sources:
|
|
57
|
+
# A writeable PostgreSQL database.
|
|
58
|
+
pg_warehouse:
|
|
59
|
+
type: postgres
|
|
60
|
+
secret_name: "pg_prod" # See Secret Management section below
|
|
61
|
+
read_only: false # Allows writing data back to this source
|
|
62
|
+
|
|
63
|
+
# An S3 data lake for Parquet files.
|
|
64
|
+
s3_datalake:
|
|
65
|
+
type: s3
|
|
66
|
+
secret_name: "aws_prod"
|
|
67
|
+
region: "us-east-1"
|
|
68
|
+
|
|
69
|
+
# An Azure Blob Storage container.
|
|
70
|
+
azure_datalake:
|
|
71
|
+
type: azure
|
|
72
|
+
provider: connection_string
|
|
73
|
+
secret_name: "azure_prod"
|
|
74
|
+
|
|
75
|
+
# A composite DuckLake source.
|
|
76
|
+
my_lake:
|
|
77
|
+
type: ducklake
|
|
78
|
+
catalog:
|
|
79
|
+
type: sqlite
|
|
80
|
+
path: "/path/to/lake_catalog.db"
|
|
81
|
+
storage:
|
|
82
|
+
type: local
|
|
83
|
+
path: "/path/to/lake_storage/"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Secret Management with `.env`
|
|
87
|
+
|
|
88
|
+
Quackpipe uses a `secret_name` in the config to refer to a bundle of credentials. These are loaded from an `.env` file using a simple prefix convention: `SECRET_NAME_KEY`.
|
|
89
|
+
|
|
90
|
+
Create an `.env` file in your project root:
|
|
91
|
+
|
|
92
|
+
```dotenv
|
|
93
|
+
# .env
|
|
94
|
+
|
|
95
|
+
# Secrets for secret_name: "pg_prod"
|
|
96
|
+
PG_PROD_HOST=db.example.com
|
|
97
|
+
PG_PROD_USER=myuser
|
|
98
|
+
PG_PROD_PASSWORD=mypassword
|
|
99
|
+
PG_PROD_DATABASE=production
|
|
100
|
+
|
|
101
|
+
# Secrets for secret_name: "aws_prod"
|
|
102
|
+
AWS_PROD_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY
|
|
103
|
+
AWS_PROD_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_KEY
|
|
104
|
+
|
|
105
|
+
# Secrets for secret_name: "azure_prod"
|
|
106
|
+
AZURE_PROD_CONNECTION_STRING="DefaultEndpointsProtocol=https..."
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Usage Highlights
|
|
110
|
+
|
|
111
|
+
### 1. Interactive Querying with `session`
|
|
112
|
+
|
|
113
|
+
Need to join a CSV in S3 with a table in Postgres? `quackpipe` makes it trivial.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
import quackpipe
|
|
117
|
+
|
|
118
|
+
# quackpipe automatically loads your .env file
|
|
119
|
+
with quackpipe.session(config_path="config.yml", env_file=".env") as con:
|
|
120
|
+
df = con.execute("""
|
|
121
|
+
SELECT u.name, o.order_total
|
|
122
|
+
FROM pg_warehouse.users u
|
|
123
|
+
JOIN read_parquet('s3://my-bucket/orders/*.parquet') o ON u.id = o.user_id
|
|
124
|
+
WHERE u.signup_date > '2024-01-01';
|
|
125
|
+
""").fetchdf()
|
|
126
|
+
|
|
127
|
+
print(df.head())
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### 2. One-Line Data Movement with `move_data`
|
|
131
|
+
|
|
132
|
+
Archive old records from your production database to your data lake with a single command.
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from quackpipe.etl_utils import move_data
|
|
136
|
+
|
|
137
|
+
move_data(
|
|
138
|
+
config_path="config.yml",
|
|
139
|
+
env_file=".env",
|
|
140
|
+
source_query="SELECT * FROM pg_warehouse.logs WHERE timestamp < '2024-01-01'",
|
|
141
|
+
destination_name="s3_datalake",
|
|
142
|
+
table_name="logs_archive_2023"
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### 3. Instant Data Exploration with the CLI
|
|
147
|
+
|
|
148
|
+
Launch a web browser UI with all your sources attached and ready for ad-hoc queries.
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
# This command reads your config.yml and .env file
|
|
152
|
+
quackpipe ui
|
|
153
|
+
|
|
154
|
+
# Or connect to specific sources
|
|
155
|
+
quackpipe ui pg_warehouse s3_datalake
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "quackpipe"
|
|
3
|
+
version = "0.6.1"
|
|
4
|
+
requires-python = ">=3.12"
|
|
5
|
+
description = "A configuration-driven and programmatic ETL helper for DuckDB."
|
|
6
|
+
license = {text = "MIT"}
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"pyyaml",
|
|
10
|
+
"duckdb>=0.9.0",
|
|
11
|
+
"pandas",
|
|
12
|
+
"python-dotenv",
|
|
13
|
+
"azure-storage-blob"
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
dev = [
|
|
18
|
+
"pytest",
|
|
19
|
+
"pytest-cov",
|
|
20
|
+
"quackpipe[fixtures]",
|
|
21
|
+
"ipdb"
|
|
22
|
+
]
|
|
23
|
+
fixtures = [
|
|
24
|
+
"testcontainers==4.10.0",
|
|
25
|
+
"sqlalchemy",
|
|
26
|
+
"testcontainers-postgres",
|
|
27
|
+
"testcontainers-minio",
|
|
28
|
+
"testcontainers-azurite",
|
|
29
|
+
"httpx"
|
|
30
|
+
]
|
|
31
|
+
lint = [
|
|
32
|
+
"ruff"
|
|
33
|
+
]
|
|
34
|
+
logging = [
|
|
35
|
+
"structlog>=23.0.0",
|
|
36
|
+
"colorlog>=6.0.0",
|
|
37
|
+
]
|
|
38
|
+
postgres = ["psycopg"]
|
|
39
|
+
s3 = ["pyarrow"]
|
|
40
|
+
kafka = ["confluent-kafka"]
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["setuptools>=61.0"]
|
|
44
|
+
build-backend = "setuptools.build_meta"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools]
|
|
47
|
+
package-dir = {"" = "src"}
|
|
48
|
+
|
|
49
|
+
[tool.setuptools.packages.find]
|
|
50
|
+
where = ["src"]
|
|
51
|
+
|
|
52
|
+
[tool.ruff]
|
|
53
|
+
src = ["src"]
|
|
54
|
+
line-length = 120
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint]
|
|
57
|
+
select = [
|
|
58
|
+
"E", # pycodestyle errors
|
|
59
|
+
"W", # pycodestyle warnings
|
|
60
|
+
"F", # pyflakes
|
|
61
|
+
"I", # isort
|
|
62
|
+
"B", # flake8-bugbear
|
|
63
|
+
"C4", # flake8-comprehensions
|
|
64
|
+
"UP", # pyupgrade
|
|
65
|
+
]
|
|
66
|
+
# Allow logging format strings
|
|
67
|
+
ignore = [
|
|
68
|
+
"G201",
|
|
69
|
+
"G202",
|
|
70
|
+
"E501", # line too long (handled by formatter)
|
|
71
|
+
"B008", # do not perform function calls in argument defaults
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
[tool.ruff.format]
|
|
75
|
+
# Use double quotes
|
|
76
|
+
quote-style = "double"
|
|
77
|
+
|
|
78
|
+
# Indent with spaces
|
|
79
|
+
indent-style = "space"
|
|
80
|
+
|
|
81
|
+
[project.scripts]
|
|
82
|
+
quackpipe = "quackpipe.cli:main"
|
|
83
|
+
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""
|
|
2
|
+
quackpipe - A configuration-driven ETL helper for DuckDB.
|
|
3
|
+
|
|
4
|
+
This library provides simple, high-level functions to connect DuckDB
|
|
5
|
+
to various data sources based on a YAML configuration file or a
|
|
6
|
+
programmatic builder.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
# Expose the primary user-facing functions and classes.
|
|
13
|
+
from .builder import QuackpipeBuilder
|
|
14
|
+
from .config import SourceConfig, SourceType
|
|
15
|
+
from .core import session, with_session
|
|
16
|
+
from .exceptions import ConfigError, QuackpipeError, SecretError
|
|
17
|
+
from .secrets import configure_secret_provider
|
|
18
|
+
|
|
19
|
+
# Set up the library's top-level logger.
# The level is taken from the QUACKPIPE_LOG_LEVEL environment variable
# (e.g. "debug", "INFO"); an unrecognized value falls back to WARNING via
# the getattr() default below.
_default_level = os.getenv('QUACKPIPE_LOG_LEVEL', 'WARNING').upper()
_root_logger = logging.getLogger(__name__)
_root_logger.setLevel(getattr(logging, _default_level, logging.WARNING))
# A NullHandler keeps "No handlers could be found" warnings away from
# applications that import the library without configuring logging.
_root_logger.addHandler(logging.NullHandler())


# Names exported by `from quackpipe import *` — the library's public API.
__all__ = [
    # Core API
    "session",
    "with_session",

    # Builder API
    "QuackpipeBuilder",

    # Configuration Types
    "SourceConfig",
    "SourceType",

    # Secret Management
    "configure_secret_provider",

    # Exceptions
    "QuackpipeError",
    "ConfigError",
    "SecretError",
]
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The Builder API for programmatically constructing a quackpipe session.
|
|
3
|
+
"""
|
|
4
|
+
from typing import Any, Self
|
|
5
|
+
|
|
6
|
+
from .config import SourceConfig, SourceType
|
|
7
|
+
from .core import session as core_session # Avoid circular import
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class QuackpipeBuilder:
    """A fluent builder for creating a quackpipe session without a YAML file.

    Sources are accumulated with ``add_source`` and can either be inspected
    via ``get_configs`` (e.g. for utilities like ``move_data``) or turned
    into a live DuckDB connection context via ``session``.
    """

    def __init__(self):
        # Ordered list of sources added so far.
        self._sources: list[SourceConfig] = []

    def add_source(
        self,
        name: str,
        type: SourceType,
        config: dict[str, Any] | None = None,
        secret_name: str | None = None,
    ) -> Self:
        """
        Adds a data source to the configuration.

        Args:
            name: The name for the data source (e.g., 'pg_main').
            type: The type of the source, using the SourceType enum.
            config: A dictionary of non-secret parameters. Treated as an
                empty dict when omitted or falsy.
            secret_name: The logical name of the secret bundle, or None if
                the source needs no secrets.

        Returns:
            The builder instance for chaining.
        """
        source = SourceConfig(
            name=name,
            type=type,
            config=config or {},
            secret_name=secret_name,
        )
        self._sources.append(source)
        return self

    def get_configs(self) -> list[SourceConfig]:
        """
        Returns the list of SourceConfig objects that have been added to the builder.

        This is useful for passing to high-level utilities like `move_data`.
        """
        return self._sources

    def session(self, **kwargs):
        """
        Builds and enters the session context manager. Can accept the same arguments
        as the core session function, like `sources=['source_a']`.

        Returns:
            A context manager yielding a configured DuckDB connection.

        Raises:
            ValueError: If no sources have been added to the builder yet.
        """
        if not self._sources:
            raise ValueError("Cannot build a session with no sources defined.")

        # Pass the built configs and any extra arguments (like `sources`)
        # to the core session manager.
        return core_session(configs=self._sources, **kwargs)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py
|
|
3
|
+
|
|
4
|
+
This module provides the main entry point for the quackpipe command-line interface.
|
|
5
|
+
It discovers and registers commands from the 'commands' submodule.
|
|
6
|
+
"""
|
|
7
|
+
import argparse
|
|
8
|
+
|
|
9
|
+
# Import the registration functions from each command module
|
|
10
|
+
from .commands import generate_sqlmesh_config, ui
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main():
    """Entry point for the quackpipe CLI.

    Builds the top-level argument parser, lets each command module register
    its own subparser, then dispatches to the handler the chosen subcommand
    attached to the parsed namespace.
    """
    parser = argparse.ArgumentParser(description="quackpipe: A DuckDB ETL Helper CLI.")
    subparsers = parser.add_subparsers(dest="command", required=True, help="Available commands")

    # Each command module wires up its own subparser and handler function.
    for command_module in (generate_sqlmesh_config, ui):
        command_module.register_command(subparsers)

    # `register_command` stores the handler as `func`; invoke it with the
    # parsed namespace.
    parsed_args = parser.parse_args()
    parsed_args.func(parsed_args)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
src/quackpipe/commands/common.py
|
|
3
|
+
|
|
4
|
+
This module contains common utilities shared across CLI command modules.
|
|
5
|
+
"""
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def setup_cli_logging(verbose_level: int = 0):
    """
    Configure the top-level 'quackpipe' logger so CLI output reaches stdout.

    Args:
        verbose_level (int): Verbosity from the CLI flags: 0 (default) maps
            to WARNING, 1 to INFO, and 2 or more to DEBUG.

    Returns:
        The configured 'quackpipe' logger.
    """
    # Translate the integer verbosity into a stdlib logging level; anything
    # below 1 stays at WARNING so normal runs are quiet.
    chosen_level = (
        logging.DEBUG if verbose_level >= 2
        else logging.INFO if verbose_level == 1
        else logging.WARNING
    )

    cli_logger = logging.getLogger("quackpipe")
    cli_logger.setLevel(chosen_level)

    # Route messages to the console via stdout with a timestamped format.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))

    # Drop any handlers attached by a previous call (e.g. when re-run in a
    # notebook) so log lines are not emitted twice.
    cli_logger.handlers.clear()
    cli_logger.addHandler(stdout_handler)

    return cli_logger
|