duckrun 0.0.0__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun-0.1.0/PKG-INFO +11 -0
- duckrun-0.1.0/README.md +162 -0
- duckrun-0.1.0/duckrun.egg-info/PKG-INFO +11 -0
- duckrun-0.1.0/pyproject.toml +17 -0
- duckrun-0.0.0/PKG-INFO +0 -5
- duckrun-0.0.0/README.md +0 -39
- duckrun-0.0.0/duckrun.egg-info/PKG-INFO +0 -5
- duckrun-0.0.0/pyproject.toml +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/LICENSE +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/duckrun/__init__.py +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/duckrun/core.py +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.0.0 → duckrun-0.1.0}/setup.cfg +0 -0
duckrun-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: duckrun
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
|
5
|
+
License-Expression: MIT
|
6
|
+
Project-URL: Homepage, https://github.com/djouallah/duckrun
|
7
|
+
Project-URL: Repository, https://github.com/djouallah/duckrun
|
8
|
+
Project-URL: Issues, https://github.com/djouallah/duckrun/issues
|
9
|
+
Requires-Python: >=3.9
|
10
|
+
License-File: LICENSE
|
11
|
+
Dynamic: license-file
|
duckrun-0.1.0/README.md
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
# 🦆 Duckrun
|
2
|
+
|
3
|
+
Simple lakehouse task runner for Microsoft Fabric, powered by DuckDB.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
```bash
|
8
|
+
pip install duckrun
|
9
|
+
```
|
10
|
+
|
11
|
+
For local development (enables Azure CLI authentication):
|
12
|
+
```bash
|
13
|
+
pip install duckrun[local]
|
14
|
+
```
|
15
|
+
|
16
|
+
## Quick Start
|
17
|
+
|
18
|
+
```python
|
19
|
+
import duckrun as dr
|
20
|
+
|
21
|
+
# Connect to your Fabric lakehouse
|
22
|
+
lakehouse = dr.connect(
|
23
|
+
workspace="my_workspace",
|
24
|
+
lakehouse_name="my_lakehouse",
|
25
|
+
schema="dbo",
|
26
|
+
sql_folder="./sql" # folder containing your .sql and .py files
|
27
|
+
)
|
28
|
+
|
29
|
+
# Define your pipeline
|
30
|
+
pipeline = [
|
31
|
+
('load_data', (url, path)), # Python task
|
32
|
+
('clean_data', 'overwrite'), # SQL task
|
33
|
+
('aggregate', 'append') # SQL task
|
34
|
+
]
|
35
|
+
|
36
|
+
# Run it
|
37
|
+
lakehouse.run(pipeline)
|
38
|
+
```
|
39
|
+
|
40
|
+
## How It Works
|
41
|
+
|
42
|
+
Duckrun runs two types of tasks:
|
43
|
+
|
44
|
+
### 1. Python Tasks
|
45
|
+
Format: `('function_name', (arg1, arg2, ...))`
|
46
|
+
|
47
|
+
Create a file `sql_folder/function_name.py` with a function matching the name:
|
48
|
+
|
49
|
+
```python
|
50
|
+
# sql_folder/load_data.py
|
51
|
+
def load_data(url, path):
|
52
|
+
    # your code here
|
53
|
+
    return result
|
54
|
+
```
|
55
|
+
|
56
|
+
### 2. SQL Tasks
|
57
|
+
Format: `('table_name', 'mode')` or `('table_name', 'mode', {params})`
|
58
|
+
|
59
|
+
Create a file `sql_folder/table_name.sql`:
|
60
|
+
|
61
|
+
```sql
|
62
|
+
-- sql_folder/clean_data.sql
|
63
|
+
SELECT
|
64
|
+
id,
|
65
|
+
TRIM(name) as name,
|
66
|
+
date
|
67
|
+
FROM raw_data
|
68
|
+
WHERE date >= '2024-01-01'
|
69
|
+
```
|
70
|
+
|
71
|
+
**Modes:**
|
72
|
+
- `overwrite` - Replace table completely
|
73
|
+
- `append` - Add to existing table
|
74
|
+
- `ignore` - Create the table only if it doesn't already exist
|
75
|
+
|
76
|
+
## SQL Parameters
|
77
|
+
|
78
|
+
Your SQL files automatically have access to:
|
79
|
+
- `$ws` - workspace name
|
80
|
+
- `$lh` - lakehouse name
|
81
|
+
- `$schema` - schema name
|
82
|
+
|
83
|
+
Pass custom parameters:
|
84
|
+
|
85
|
+
```python
|
86
|
+
pipeline = [
|
87
|
+
('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
|
88
|
+
]
|
89
|
+
```
|
90
|
+
|
91
|
+
```sql
|
92
|
+
-- sql_folder/sales.sql
|
93
|
+
SELECT * FROM transactions
|
94
|
+
WHERE date BETWEEN '$start_date' AND '$end_date'
|
95
|
+
```
|
96
|
+
|
97
|
+
## Table Name Convention
|
98
|
+
|
99
|
+
Use a `__` suffix in the task name to create variants of the same table; the part before `__` is the target table name:
|
100
|
+
|
101
|
+
```python
|
102
|
+
pipeline = [
|
103
|
+
('sales__initial', 'overwrite', {}), # writes to 'sales' table
|
104
|
+
('sales__incremental', 'append', {}), # appends to 'sales' table
|
105
|
+
]
|
106
|
+
```
|
107
|
+
|
108
|
+
Both write to the same `sales` table, but use different SQL files.
|
109
|
+
|
110
|
+
## Query Data
|
111
|
+
|
112
|
+
```python
|
113
|
+
# Run queries
|
114
|
+
lakehouse.sql("SELECT * FROM my_table LIMIT 10").show()
|
115
|
+
|
116
|
+
# Get as DataFrame
|
117
|
+
df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
|
118
|
+
```
|
119
|
+
|
120
|
+
## Real-World Example
|
121
|
+
|
122
|
+
```python
|
123
|
+
import duckrun as dr
|
124
|
+
|
125
|
+
lakehouse = dr.connect(
|
126
|
+
workspace="Analytics",
|
127
|
+
lakehouse_name="Sales",
|
128
|
+
schema="dbo",
|
129
|
+
sql_folder="./etl"
|
130
|
+
)
|
131
|
+
|
132
|
+
# Daily pipeline
|
133
|
+
daily = [
|
134
|
+
('download_files', (api_url, local_path)),
|
135
|
+
('staging_orders', 'overwrite', {'run_date': '2024-06-01'}),
|
136
|
+
('staging_customers', 'overwrite', {'run_date': '2024-06-01'}),
|
137
|
+
('fact_sales', 'append'),
|
138
|
+
('dim_customer', 'overwrite')
|
139
|
+
]
|
140
|
+
|
141
|
+
lakehouse.run(daily)
|
142
|
+
|
143
|
+
# Check results
|
144
|
+
lakehouse.sql("SELECT COUNT(*) FROM fact_sales").show()
|
145
|
+
```
|
146
|
+
|
147
|
+
## Remote SQL Files
|
148
|
+
|
149
|
+
You can load SQL/Python files from a URL:
|
150
|
+
|
151
|
+
```python
|
152
|
+
lakehouse = dr.connect(
|
153
|
+
workspace="Analytics",
|
154
|
+
lakehouse_name="Sales",
|
155
|
+
schema="dbo",
|
156
|
+
sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
|
157
|
+
)
|
158
|
+
```
|
159
|
+
|
160
|
+
## License
|
161
|
+
|
162
|
+
MIT
|
duckrun-0.1.0/duckrun.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: duckrun
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
|
5
|
+
License-Expression: MIT
|
6
|
+
Project-URL: Homepage, https://github.com/djouallah/duckrun
|
7
|
+
Project-URL: Repository, https://github.com/djouallah/duckrun
|
8
|
+
Project-URL: Issues, https://github.com/djouallah/duckrun/issues
|
9
|
+
Requires-Python: >=3.9
|
10
|
+
License-File: LICENSE
|
11
|
+
Dynamic: license-file
|
duckrun-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
[build-system]
|
3
|
+
requires = ["setuptools>=61.0", "wheel"]
|
4
|
+
build-backend = "setuptools.build_meta"
|
5
|
+
|
6
|
+
[project]
|
7
|
+
name = "duckrun"
|
8
|
+
version = "0.1.0"
|
9
|
+
description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
|
10
|
+
license = "MIT"
|
11
|
+
requires-python = ">=3.9"
|
12
|
+
|
13
|
+
[project.urls]
|
14
|
+
Homepage = "https://github.com/djouallah/duckrun"
|
15
|
+
Repository = "https://github.com/djouallah/duckrun"
|
16
|
+
Issues = "https://github.com/djouallah/duckrun/issues"
|
17
|
+
|
duckrun-0.0.0/PKG-INFO
DELETED
duckrun-0.0.0/README.md
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
# 🦆 Duckrun
|
2
|
-
|
3
|
-
Lakehouse task runner powered by DuckDB for Microsoft Fabric.
|
4
|
-
|
5
|
-
## Features
|
6
|
-
|
7
|
-
- 🦆 **DuckDB-powered**: Fast in-memory processing
|
8
|
-
- 📦 **Delta Lake**: Native Delta table support
|
9
|
-
- 🔄 **Simple API**: Clean tuple-based pipeline definition
|
10
|
-
- 🎯 **Fabric-native**: Built for Microsoft Fabric lakehouses
|
11
|
-
- 🐍 **Python + SQL**: Mix Python and SQL tasks seamlessly
|
12
|
-
|
13
|
-
## Installation
|
14
|
-
```bash
|
15
|
-
pip install duckrun
|
16
|
-
|
17
|
-
from duckrun import Duckrun
|
18
|
-
|
19
|
-
# Connect to your lakehouse
|
20
|
-
dr = Duckrun.connect(
|
21
|
-
workspace="your_workspace",
|
22
|
-
lakehouse_name="your_lakehouse",
|
23
|
-
schema="dbo",
|
24
|
-
sql_folder="./sql"
|
25
|
-
)
|
26
|
-
|
27
|
-
# Define pipeline
|
28
|
-
pipeline = [
|
29
|
-
('download', (urls, paths, depth)),
|
30
|
-
('staging', 'overwrite', {'run_date': '2024-06-01'}),
|
31
|
-
('transform', 'append'),
|
32
|
-
('fact_sales', 'append')
|
33
|
-
]
|
34
|
-
|
35
|
-
# Run it
|
36
|
-
dr.run(pipeline)
|
37
|
-
|
38
|
-
# Query directly
|
39
|
-
dr.sql("SELECT * FROM staging").show()
|
duckrun-0.0.0/pyproject.toml
DELETED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|