duckrun 0.2.5.dev2__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/PKG-INFO +29 -15
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/README.md +28 -14
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/core.py +30 -3
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/runner.py +29 -1
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun.egg-info/PKG-INFO +29 -15
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/pyproject.toml +1 -1
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/LICENSE +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/__init__.py +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/files.py +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/stats.py +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/writer.py +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.5.dev2 → duckrun-0.2.6}/setup.cfg +0 -0
{duckrun-0.2.5.dev2 → duckrun-0.2.6}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.5.dev2
+Version: 0.2.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
```

```diff
@@ -26,12 +26,10 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 
 **Requirements:**
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
-- Workspace and lakehouse names
+- **Workspace and lakehouse names with spaces are now fully supported!** ✅
 
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
-**Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
-
 ## What It Does
 
 It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.
```

````diff
@@ -52,20 +50,28 @@ pip install duckrun[local]
 ```python
 import duckrun
 
-#
-
+# 1. Workspace Management (list and create lakehouses)
+ws = duckrun.connect("My Workspace")
+lakehouses = ws.list_lakehouses() # Returns list of lakehouse names
+ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+# 2. Connect to lakehouse with a specific schema
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+# Works with workspace names containing spaces!
+con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
 
 # Schema defaults to 'dbo' if not specified (scans all schemas)
 # ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
-con = duckrun.connect("
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
 
-# Explore data
+# 3. Explore data
 con.sql("SELECT * FROM my_table LIMIT 10").show()
 
-# Write to Delta tables (Spark-style API)
+# 4. Write to Delta tables (Spark-style API)
 con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
 
-# Upload/download files to/from OneLake Files
+# 5. Upload/download files to/from OneLake Files
 con.copy("./local_folder", "target_folder") # Upload files
 con.download("target_folder", "./downloaded") # Download files
 ```
````

````diff
@@ -75,15 +81,23 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-#
-
+# Workspace management (list and create lakehouses)
+ws = duckrun.connect("My Workspace")
+ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
+ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+# Lakehouse connection with schema (recommended for best performance)
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+# Supports workspace names with spaces!
+con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
 
 # Without schema (defaults to 'dbo', scans all schemas)
 # ⚠️ This can be slow for large lakehouses!
-con = duckrun.connect("
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
 
-# With
-con = duckrun.connect("
+# With SQL folder for pipeline orchestration
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
 ### Multi-Schema Support
````
{duckrun-0.2.5.dev2 → duckrun-0.2.6}/README.md

```diff
@@ -6,12 +6,10 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 
 **Requirements:**
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
-- Workspace and lakehouse names
+- **Workspace and lakehouse names with spaces are now fully supported!** ✅
 
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
-**Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
-
 ## What It Does
 
 It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.
```

````diff
@@ -32,20 +30,28 @@ pip install duckrun[local]
 ```python
 import duckrun
 
-#
-
+# 1. Workspace Management (list and create lakehouses)
+ws = duckrun.connect("My Workspace")
+lakehouses = ws.list_lakehouses() # Returns list of lakehouse names
+ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+# 2. Connect to lakehouse with a specific schema
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+# Works with workspace names containing spaces!
+con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
 
 # Schema defaults to 'dbo' if not specified (scans all schemas)
 # ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
-con = duckrun.connect("
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
 
-# Explore data
+# 3. Explore data
 con.sql("SELECT * FROM my_table LIMIT 10").show()
 
-# Write to Delta tables (Spark-style API)
+# 4. Write to Delta tables (Spark-style API)
 con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
 
-# Upload/download files to/from OneLake Files
+# 5. Upload/download files to/from OneLake Files
 con.copy("./local_folder", "target_folder") # Upload files
 con.download("target_folder", "./downloaded") # Download files
 ```
````

````diff
@@ -55,15 +61,23 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-#
-
+# Workspace management (list and create lakehouses)
+ws = duckrun.connect("My Workspace")
+ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
+ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+# Lakehouse connection with schema (recommended for best performance)
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+# Supports workspace names with spaces!
+con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
 
 # Without schema (defaults to 'dbo', scans all schemas)
 # ⚠️ This can be slow for large lakehouses!
-con = duckrun.connect("
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
 
-# With
-con = duckrun.connect("
+# With SQL folder for pipeline orchestration
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
 ### Multi-Schema Support
````
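Taken together, the README additions describe a single workflow: connect at workspace level to manage lakehouses, then connect at lakehouse level to query and write. A minimal sketch of that flow under the 0.2.6 API as shown in the hunks above; the workspace, lakehouse, and table names are placeholders:

```python
import duckrun

# Workspace-level connection: list and create lakehouses (names may contain spaces).
ws = duckrun.connect("Data Analytics")           # placeholder workspace name
print(ws.list_lakehouses())                      # e.g. ['Sales Data', ...]
ws.create_lakehouse_if_not_exists("Sales Data")  # no-op if it already exists

# Lakehouse connection pinned to one schema (recommended; avoids scanning all schemas).
con = duckrun.connect("Data Analytics/Sales Data.lakehouse/dbo")

# Explore, then write back with the Spark-style writer.
con.sql("SELECT * FROM my_table LIMIT 10").show()
con.sql("SELECT * FROM my_table").write.mode("overwrite").saveAsTable("my_table_copy")
```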
{duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/core.py

```diff
@@ -441,6 +441,26 @@ class Duckrun:
             print(f"❌ Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
 
+    def get_workspace_id(self) -> str:
+        """
+        Get the workspace ID (GUID or name without spaces).
+        Use this when passing workspace parameter to Python functions.
+
+        Returns:
+            Workspace ID - either a GUID or workspace name without spaces
+        """
+        return self.workspace_id
+
+    def get_lakehouse_id(self) -> str:
+        """
+        Get the lakehouse ID (GUID or name).
+        Use this when passing lakehouse parameter to Python functions.
+
+        Returns:
+            Lakehouse ID - either a GUID or lakehouse name
+        """
+        return self.lakehouse_id
+
     def run(self, pipeline: List[Tuple]) -> bool:
         """
         Execute pipeline of tasks.
```

```diff
@@ -702,7 +722,11 @@ class WorkspaceConnection:
         try:
             import notebookutils  # type: ignore
             token = notebookutils.credentials.getToken("pbi")
-
+            # Always resolve workspace name to ID, even in notebook environment
+            workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
+            if not workspace_id:
+                print(f"Workspace '{self.workspace_name}' not found")
+                return []
         except ImportError:
             # Fallback to azure-identity
             print("Getting authentication token...")
```

```diff
@@ -727,7 +751,6 @@ class WorkspaceConnection:
             lakehouses = response.json().get("value", [])
             lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
 
-            print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
             return lakehouse_names
 
         except Exception as e:
```

```diff
@@ -749,7 +772,11 @@ class WorkspaceConnection:
         try:
             import notebookutils  # type: ignore
             token = notebookutils.credentials.getToken("pbi")
-
+            # Always resolve workspace name to ID, even in notebook environment
+            workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
+            if not workspace_id:
+                print(f"Workspace '{self.workspace_name}' not found")
+                return False
         except ImportError:
             # Fallback to azure-identity
             print("Getting authentication token...")
```
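Per their docstrings, the new `get_workspace_id()` / `get_lakehouse_id()` accessors exist so that Python tasks receive URL-safe IDs rather than display names that may contain spaces. A minimal sketch of the intended call pattern; `refresh_table` is a hypothetical user function, not part of duckrun:

```python
import duckrun

con = duckrun.connect("Data Analytics/Sales Data.lakehouse/dbo")  # placeholder names

# Hypothetical task that builds a REST URL and therefore needs IDs, not display names.
def refresh_table(workspace: str, lakehouse: str) -> None:
    print(f"calling /workspaces/{workspace}/lakehouses/{lakehouse}/refresh")

# Pass the resolved IDs (GUIDs, or space-free names) instead of the raw names.
refresh_table(con.get_workspace_id(), con.get_lakehouse_id())
```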
{duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun/runner.py

```diff
@@ -110,12 +110,40 @@ def run(duckrun_instance, pipeline: List[Tuple]) -> bool:
 
 
 def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
-    """
+    """
+    Execute Python task, return result.
+
+    Automatically substitutes workspace/lakehouse names in args with their resolved IDs
+    to prevent URL encoding issues with names containing spaces.
+    """
     duckrun_instance._create_onelake_secret()
     func = _load_py_function(duckrun_instance, name)
     if not func:
         raise RuntimeError(f"Python function '{name}' not found")
 
+    # Get original and resolved names
+    original_workspace = duckrun_instance.workspace
+    original_lakehouse = duckrun_instance.lakehouse_name
+    resolved_workspace = duckrun_instance.workspace_id
+    resolved_lakehouse = duckrun_instance.lakehouse_id
+
+    # Substitute workspace/lakehouse names in args if they differ
+    # This prevents URL encoding issues when names contain spaces
+    substituted_args = []
+    needs_substitution = (original_workspace != resolved_workspace or
+                          original_lakehouse != resolved_lakehouse)
+
+    if needs_substitution:
+        for arg in args:
+            if arg == original_workspace:
+                substituted_args.append(resolved_workspace)
+            elif arg == original_lakehouse:
+                substituted_args.append(resolved_lakehouse)
+            else:
+                substituted_args.append(arg)
+        args = tuple(substituted_args)
+        print(f"📝 Auto-substituted workspace/lakehouse names in args for URL compatibility")
+
     print(f"Running Python: {name}{args}")
     result = func(*args)
     print(f"✅ Python '{name}' completed")
```
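Note that the substitution in `_run_python` only rewrites arguments that are an exact string match for the workspace or lakehouse name; substrings (for example, a name embedded in an `abfss://` path) pass through untouched. A self-contained sketch of that matching rule, with illustrative names and a made-up GUID:

```python
def substitute_ids(args: tuple, original_workspace: str, resolved_workspace: str,
                   original_lakehouse: str, resolved_lakehouse: str) -> tuple:
    """Mirrors the _run_python logic above: exact-match replacement only."""
    if original_workspace == resolved_workspace and original_lakehouse == resolved_lakehouse:
        return args  # names already URL-safe; nothing to substitute
    out = []
    for arg in args:
        if arg == original_workspace:
            out.append(resolved_workspace)   # bare workspace name -> resolved ID
        elif arg == original_lakehouse:
            out.append(resolved_lakehouse)   # bare lakehouse name -> resolved ID
        else:
            out.append(arg)                  # anything else passes through unchanged
    return tuple(out)

# Only the bare name is replaced; the path that embeds it is not.
print(substitute_ids(
    ("Data Analytics", "abfss://Data Analytics@onelake.dfs.fabric.microsoft.com/x"),
    "Data Analytics", "11111111-2222-3333-4444-555555555555",
    "Sales Data", "Sales Data",
))
# -> ('11111111-2222-3333-4444-555555555555',
#     'abfss://Data Analytics@onelake.dfs.fabric.microsoft.com/x')
```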
{duckrun-0.2.5.dev2 → duckrun-0.2.6}/duckrun.egg-info/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.5.dev2
+Version: 0.2.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
```

```diff
@@ -26,12 +26,10 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 
 **Requirements:**
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
-- Workspace and lakehouse names
+- **Workspace and lakehouse names with spaces are now fully supported!** ✅
 
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
-**Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
-
 ## What It Does
 
 It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.
```

````diff
@@ -52,20 +50,28 @@ pip install duckrun[local]
 ```python
 import duckrun
 
-#
-
+# 1. Workspace Management (list and create lakehouses)
+ws = duckrun.connect("My Workspace")
+lakehouses = ws.list_lakehouses() # Returns list of lakehouse names
+ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+# 2. Connect to lakehouse with a specific schema
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+# Works with workspace names containing spaces!
+con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
 
 # Schema defaults to 'dbo' if not specified (scans all schemas)
 # ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
-con = duckrun.connect("
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
 
-# Explore data
+# 3. Explore data
 con.sql("SELECT * FROM my_table LIMIT 10").show()
 
-# Write to Delta tables (Spark-style API)
+# 4. Write to Delta tables (Spark-style API)
 con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
 
-# Upload/download files to/from OneLake Files
+# 5. Upload/download files to/from OneLake Files
 con.copy("./local_folder", "target_folder") # Upload files
 con.download("target_folder", "./downloaded") # Download files
 ```
````

````diff
@@ -75,15 +81,23 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-#
-
+# Workspace management (list and create lakehouses)
+ws = duckrun.connect("My Workspace")
+ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
+ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+# Lakehouse connection with schema (recommended for best performance)
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+# Supports workspace names with spaces!
+con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
 
 # Without schema (defaults to 'dbo', scans all schemas)
 # ⚠️ This can be slow for large lakehouses!
-con = duckrun.connect("
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
 
-# With
-con = duckrun.connect("
+# With SQL folder for pipeline orchestration
+con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
 ### Multi-Schema Support
````
|