duckrun 0.0.0.tar.gz → 0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: duckrun
+ Version: 0.1.0
+ Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/djouallah/duckrun
+ Project-URL: Repository, https://github.com/djouallah/duckrun
+ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
+ Requires-Python: >=3.9
+ License-File: LICENSE
+ Dynamic: license-file
duckrun-0.1.0/README.md ADDED
@@ -0,0 +1,162 @@
+ # 🦆 Duckrun
+
+ Simple lakehouse task runner for Microsoft Fabric, powered by DuckDB.
+
+ ## Installation
+
+ ```bash
+ pip install duckrun
+ ```
+
+ For local development (enables Azure CLI authentication):
+
+ ```bash
+ pip install duckrun[local]
+ ```
+
+ ## Quick Start
+
+ ```python
+ import duckrun as dr
+
+ # Connect to your Fabric lakehouse
+ lakehouse = dr.connect(
+     workspace="my_workspace",
+     lakehouse_name="my_lakehouse",
+     schema="dbo",
+     sql_folder="./sql"  # folder containing your .sql and .py files
+ )
+
+ # Define your pipeline
+ pipeline = [
+     ('load_data', (url, path)),   # Python task
+     ('clean_data', 'overwrite'),  # SQL task
+     ('aggregate', 'append')       # SQL task
+ ]
+
+ # Run it
+ lakehouse.run(pipeline)
+ ```
+
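+ For the pipeline above, `sql_folder` holds one file per task, named after the task (the naming rules are covered under How It Works below). One possible layout:
+
+ ```
+ sql/
+ ├── load_data.py    # Python task: defines load_data(url, path)
+ ├── clean_data.sql  # SQL query that builds the clean_data table
+ └── aggregate.sql   # SQL query that builds the aggregate table
+ ```
+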
+ ## How It Works
+
+ Duckrun runs two types of tasks:
+
+ ### 1. Python Tasks
+
+ Format: `('function_name', (arg1, arg2, ...))`
+
+ Create a file `sql_folder/function_name.py` with a function matching the name:
+
+ ```python
+ # sql_folder/load_data.py
+ def load_data(url, path):
+     # your code here
+     return result
+ ```
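+
+ For instance, a task that downloads a file over HTTP could look like this (a minimal sketch using only the standard library; duckrun simply calls the function with the arguments from the tuple):
+
+ ```python
+ # sql_folder/load_data.py
+ import urllib.request
+
+ def load_data(url, path):
+     # Fetch the file and write it to the given local path
+     urllib.request.urlretrieve(url, path)
+     return path
+ ```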
+
+ ### 2. SQL Tasks
+
+ Format: `('table_name', 'mode')` or `('table_name', 'mode', {params})`
+
+ Create a file `sql_folder/table_name.sql`:
+
+ ```sql
+ -- sql_folder/clean_data.sql
+ SELECT
+     id,
+     TRIM(name) AS name,
+     date
+ FROM raw_data
+ WHERE date >= '2024-01-01'
+ ```
+
+ **Modes:**
+ - `overwrite` - Replace the table completely
+ - `append` - Add to the existing table
+ - `ignore` - Create the table only if it doesn't exist
+
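+ A single pipeline can mix all three modes; a sketch (table names are illustrative):
+
+ ```python
+ pipeline = [
+     ('calendar', 'ignore'),     # built once, then left alone
+     ('staging', 'overwrite'),   # rebuilt from scratch every run
+     ('fact_sales', 'append')    # grows with every run
+ ]
+ lakehouse.run(pipeline)
+ ```
+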
+ ## SQL Parameters
+
+ Your SQL files automatically have access to:
+ - `$ws` - workspace name
+ - `$lh` - lakehouse name
+ - `$schema` - schema name
+
+ Pass custom parameters:
+
+ ```python
+ pipeline = [
+     ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
+ ]
+ ```
+
+ ```sql
+ -- sql_folder/sales.sql
+ SELECT * FROM transactions
+ WHERE date BETWEEN '$start_date' AND '$end_date'
+ ```
+
+ ## Table Name Convention
+
+ Use `__` to create variants of the same table:
+
+ ```python
+ pipeline = [
+     ('sales__initial', 'overwrite', {}),    # writes to 'sales' table
+     ('sales__incremental', 'append', {}),   # appends to 'sales' table
+ ]
+ ```
+
+ Both write to the same `sales` table, but use different SQL files.
+
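+ Put differently, everything before the `__` names the target table, while the full task name selects which SQL file runs. Conceptually (a sketch of the convention, not duckrun internals):
+
+ ```python
+ def target_table(task_name: str) -> str:
+     # 'sales__initial' -> 'sales'; names without '__' map to themselves
+     return task_name.split('__')[0]
+ ```
+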
+ ## Query Data
+
+ ```python
+ # Run queries
+ lakehouse.sql("SELECT * FROM my_table LIMIT 10").show()
+
+ # Get as DataFrame
+ df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
+ ```
+
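+ Because results expose the DuckDB-style `.show()` and `.df()` methods, other DuckDB statements are likely to pass straight through as well; a sketch (assuming `sql` accepts arbitrary statements; table and column names are illustrative):
+
+ ```python
+ # List the tables duckrun can see
+ lakehouse.sql("SHOW TABLES").show()
+
+ # Aggregate into a pandas DataFrame for further analysis
+ top_customers = lakehouse.sql("""
+     SELECT customer_id, SUM(amount) AS total
+     FROM sales
+     GROUP BY customer_id
+     ORDER BY total DESC
+     LIMIT 5
+ """).df()
+ ```
+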
+ ## Real-World Example
+
+ ```python
+ import duckrun as dr
+
+ lakehouse = dr.connect(
+     workspace="Analytics",
+     lakehouse_name="Sales",
+     schema="dbo",
+     sql_folder="./etl"
+ )
+
+ # Daily pipeline (api_url and local_path are defined elsewhere)
+ daily = [
+     ('download_files', (api_url, local_path)),
+     ('staging_orders', 'overwrite', {'run_date': '2024-06-01'}),
+     ('staging_customers', 'overwrite', {'run_date': '2024-06-01'}),
+     ('fact_sales', 'append'),
+     ('dim_customer', 'overwrite')
+ ]
+
+ lakehouse.run(daily)
+
+ # Check results
+ lakehouse.sql("SELECT COUNT(*) FROM fact_sales").show()
+ ```
+
+ ## Remote SQL Files
+
+ You can load SQL/Python files from a URL; task files are then looked up under that URL instead of a local folder:
+
+ ```python
+ lakehouse = dr.connect(
+     workspace="Analytics",
+     lakehouse_name="Sales",
+     schema="dbo",
+     sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
+ )
+ ```
+
+ ## License
+
+ MIT
duckrun-0.1.0/duckrun.egg-info/PKG-INFO ADDED
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: duckrun
+ Version: 0.1.0
+ Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/djouallah/duckrun
+ Project-URL: Repository, https://github.com/djouallah/duckrun
+ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
+ Requires-Python: >=3.9
+ License-File: LICENSE
+ Dynamic: license-file
duckrun-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,17 @@
+
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "duckrun"
+ version = "0.1.0"
+ description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
+ license = "MIT"
+ requires-python = ">=3.9"
+
+ [project.urls]
+ Homepage = "https://github.com/djouallah/duckrun"
+ Repository = "https://github.com/djouallah/duckrun"
+ Issues = "https://github.com/djouallah/duckrun/issues"
+
duckrun-0.0.0/PKG-INFO DELETED
@@ -1,5 +0,0 @@
- Metadata-Version: 2.4
- Name: duckrun
- Version: 0.0.0
- License-File: LICENSE
- Dynamic: license-file
duckrun-0.0.0/README.md DELETED
@@ -1,39 +0,0 @@
- # 🦆 Duckrun
-
- Lakehouse task runner powered by DuckDB for Microsoft Fabric.
-
- ## Features
-
- - 🦆 **DuckDB-powered**: Fast in-memory processing
- - 📦 **Delta Lake**: Native Delta table support
- - 🔄 **Simple API**: Clean tuple-based pipeline definition
- - 🎯 **Fabric-native**: Built for Microsoft Fabric lakehouses
- - 🐍 **Python + SQL**: Mix Python and SQL tasks seamlessly
-
- ## Installation
-
- ```bash
- pip install duckrun
- ```
-
- ```python
- from duckrun import Duckrun
-
- # Connect to your lakehouse
- dr = Duckrun.connect(
-     workspace="your_workspace",
-     lakehouse_name="your_lakehouse",
-     schema="dbo",
-     sql_folder="./sql"
- )
-
- # Define pipeline
- pipeline = [
-     ('download', (urls, paths, depth)),
-     ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-     ('transform', 'append'),
-     ('fact_sales', 'append')
- ]
-
- # Run it
- dr.run(pipeline)
-
- # Query directly
- dr.sql("SELECT * FROM staging").show()
- ```
duckrun-0.0.0/duckrun.egg-info/PKG-INFO DELETED
@@ -1,5 +0,0 @@
- Metadata-Version: 2.4
- Name: duckrun
- Version: 0.0.0
- License-File: LICENSE
- Dynamic: license-file
5 files without changes