duckrun 0.2.14.dev2__tar.gz → 0.2.14.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of duckrun might be problematic. Click here for more details.
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/PKG-INFO +2 -2
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/auth.py +12 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/core.py +304 -119
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/runner.py +14 -6
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun.egg-info/PKG-INFO +2 -2
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun.egg-info/requires.txt +1 -1
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/pyproject.toml +2 -2
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/LICENSE +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/README.md +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/__init__.py +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/files.py +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/stats.py +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun/writer.py +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.14.dev2 → duckrun-0.2.14.dev3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.2.14.dev2
|
|
3
|
+
Version: 0.2.14.dev3
|
|
4
4
|
Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
|
|
|
10
10
|
Requires-Python: >=3.9
|
|
11
11
|
Description-Content-Type: text/markdown
|
|
12
12
|
License-File: LICENSE
|
|
13
|
-
Requires-Dist: duckdb>=1.2.
|
|
13
|
+
Requires-Dist: duckdb>=1.2.0
|
|
14
14
|
Requires-Dist: deltalake<=0.18.2
|
|
15
15
|
Requires-Dist: requests>=2.28.0
|
|
16
16
|
Requires-Dist: obstore>=0.2.0
|
|
@@ -2,9 +2,21 @@
|
|
|
2
2
|
Enhanced authentication module for duckrun - supports multiple notebook environments
|
|
3
3
|
"""
|
|
4
4
|
import os
|
|
5
|
+
import sys
|
|
5
6
|
from typing import Optional, Tuple
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
def safe_print(message: str):
    """Print ``message``, degrading gracefully on consoles with a limited encoding.

    The message is printed unchanged first. If the active stdout cannot encode
    it (typical for emojis on legacy Windows code pages), the characters the
    stream cannot represent are dropped and the remainder is printed instead.

    Args:
        message: Text to print; may contain emojis or other non-ASCII characters.
    """
    try:
        print(message)
    except UnicodeEncodeError:
        # Drop only what this console cannot encode, so text such as accented
        # letters survives on code pages that support it. On an ASCII stream
        # this is equivalent to stripping every non-ASCII character.
        encoding = getattr(sys.stdout, "encoding", None) or "ascii"
        try:
            clean_message = message.encode(encoding, errors="ignore").decode(encoding)
        except LookupError:
            # The stream reported a codec name Python does not know: fall back
            # to plain ASCII, which every console can display.
            clean_message = message.encode("ascii", errors="ignore").decode("ascii")
        print(clean_message)
|
|
18
|
+
|
|
19
|
+
|
|
8
20
|
def get_token() -> Optional[str]:
|
|
9
21
|
"""
|
|
10
22
|
Smart authentication that works across multiple environments:
|
|
@@ -4,11 +4,8 @@ import os
|
|
|
4
4
|
import importlib.util
|
|
5
5
|
import json
|
|
6
6
|
import time
|
|
7
|
-
from deltalake import DeltaTable, write_deltalake
|
|
8
7
|
from typing import List, Tuple, Union, Optional, Callable, Dict, Any
|
|
9
8
|
from string import Template
|
|
10
|
-
import obstore as obs
|
|
11
|
-
from obstore.store import AzureStore
|
|
12
9
|
from datetime import datetime
|
|
13
10
|
from .stats import get_stats as _get_stats
|
|
14
11
|
from .runner import run as _run
|
|
@@ -17,7 +14,8 @@ from .writer import QueryResult
|
|
|
17
14
|
|
|
18
15
|
class Duckrun:
|
|
19
16
|
"""
|
|
20
|
-
|
|
17
|
+
OneLake task runner with clean tuple-based API.
|
|
18
|
+
Supports lakehouses, warehouses, databases, and other OneLake items.
|
|
21
19
|
Powered by DuckDB for fast data processing.
|
|
22
20
|
|
|
23
21
|
Task formats:
|
|
@@ -30,6 +28,10 @@ class Duckrun:
|
|
|
30
28
|
dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema, lists all tables
|
|
31
29
|
dr.run(pipeline)
|
|
32
30
|
|
|
31
|
+
# For other OneLake items:
|
|
32
|
+
dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
|
|
33
|
+
dr = Duckrun.connect("workspace/warehouse.Warehouse")
|
|
34
|
+
|
|
33
35
|
# For data exploration with Spark-style API:
|
|
34
36
|
dr = Duckrun.connect("workspace/lakehouse.lakehouse")
|
|
35
37
|
dr.sql("SELECT * FROM table").show()
|
|
@@ -65,24 +67,53 @@ class Duckrun:
|
|
|
65
67
|
self.storage_account = storage_account
|
|
66
68
|
self.token_only = token_only
|
|
67
69
|
|
|
68
|
-
#
|
|
70
|
+
# Store both full name (with .ItemType) and display name (without .ItemType) for backward compatibility
|
|
71
|
+
# lakehouse_id: Full name with suffix for API calls (e.g., "data.Lakehouse")
|
|
72
|
+
# lakehouse_display_name: Name only without suffix for user code/templates (e.g., "data")
|
|
73
|
+
self.lakehouse_id = lakehouse_id
|
|
74
|
+
|
|
75
|
+
# Extract display name (remove .ItemType suffix if present)
|
|
69
76
|
import re
|
|
77
|
+
# Check if lakehouse_id has .ItemType suffix
|
|
78
|
+
if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', lakehouse_id, re.IGNORECASE):
|
|
79
|
+
# Friendly name - extract base name without suffix
|
|
80
|
+
for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
|
|
81
|
+
if lakehouse_id.endswith(suffix):
|
|
82
|
+
self.lakehouse_display_name = lakehouse_id[:-len(suffix)]
|
|
83
|
+
break
|
|
84
|
+
else:
|
|
85
|
+
self.lakehouse_display_name = lakehouse_id
|
|
86
|
+
else:
|
|
87
|
+
# GUID - use as is
|
|
88
|
+
self.lakehouse_display_name = lakehouse_id
|
|
89
|
+
|
|
90
|
+
# Construct proper ABFSS URLs
|
|
91
|
+
# Format: abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{item}/Tables/
|
|
92
|
+
# where {workspace} and {item} can be:
|
|
93
|
+
# - Names with .lakehouse suffix (lakehouse optimization when no spaces in workspace)
|
|
94
|
+
# - GUIDs (when resolved via API for non-lakehouse items or items with spaces)
|
|
70
95
|
guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
|
|
71
|
-
|
|
96
|
+
|
|
97
|
+
# Determine the item URL part for ABFSS
|
|
72
98
|
if guid_pattern.match(lakehouse_id):
|
|
73
|
-
|
|
99
|
+
# Already a GUID - use as-is (from API resolution)
|
|
100
|
+
item_url_part = lakehouse_id
|
|
74
101
|
else:
|
|
75
|
-
#
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Tables/'
|
|
81
|
-
self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Files/'
|
|
102
|
+
# Friendly name - use as-is (already includes .ItemType suffix from connect())
|
|
103
|
+
item_url_part = lakehouse_id
|
|
104
|
+
|
|
105
|
+
self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Tables/'
|
|
106
|
+
self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Files/'
|
|
82
107
|
|
|
83
108
|
# Keep legacy properties for backward compatibility
|
|
84
109
|
self.workspace = workspace_id
|
|
85
|
-
self.lakehouse_name =
|
|
110
|
+
self.lakehouse_name = self.lakehouse_display_name # Use display name (without suffix) for backward compatibility
|
|
111
|
+
|
|
112
|
+
# Store display name without suffix for backward compatibility with user Python functions
|
|
113
|
+
# Extract base name by removing .ItemType suffix if present
|
|
114
|
+
import re
|
|
115
|
+
suffix_pattern = re.compile(r'\.(Lakehouse|Warehouse|Database|SnowflakeDatabase)$', re.IGNORECASE)
|
|
116
|
+
self.lakehouse_display_name = suffix_pattern.sub('', lakehouse_id)
|
|
86
117
|
|
|
87
118
|
self.con = duckdb.connect()
|
|
88
119
|
self.con.sql("SET preserve_insertion_order = false")
|
|
@@ -109,12 +140,15 @@ class Duckrun:
|
|
|
109
140
|
compaction_threshold: int = 100, storage_account: str = "onelake",
|
|
110
141
|
token_only: bool = False):
|
|
111
142
|
"""
|
|
112
|
-
Create and connect to lakehouse or workspace.
|
|
143
|
+
Create and connect to OneLake items (lakehouse, warehouse, database, etc.) or workspace.
|
|
113
144
|
|
|
114
145
|
Smart detection based on connection string format:
|
|
115
146
|
- "workspace" → workspace management only
|
|
116
|
-
- "ws/
|
|
117
|
-
- "ws/
|
|
147
|
+
- "ws/item.lakehouse/schema" → lakehouse connection with specific schema
|
|
148
|
+
- "ws/item.lakehouse" → lakehouse connection (defaults to dbo schema)
|
|
149
|
+
- "ws/item.warehouse" → warehouse connection
|
|
150
|
+
- "ws/item.database" → database connection
|
|
151
|
+
- "ws/item.snowflakedatabase" → Snowflake database connection
|
|
118
152
|
|
|
119
153
|
Args:
|
|
120
154
|
connection_string: OneLake path or workspace name
|
|
@@ -129,19 +163,26 @@ class Duckrun:
|
|
|
129
163
|
ws.list_lakehouses()
|
|
130
164
|
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
131
165
|
|
|
132
|
-
#
|
|
166
|
+
# Lakehouse connections (supports spaces in names)
|
|
133
167
|
dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
|
|
134
168
|
dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics") # spaces supported
|
|
135
169
|
dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse") # defaults to dbo schema
|
|
136
170
|
dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake") # custom storage
|
|
137
171
|
|
|
172
|
+
# Warehouse and database connections (always uses API to resolve GUIDs)
|
|
173
|
+
dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
|
|
174
|
+
dr = Duckrun.connect("My Workspace/My Warehouse.Warehouse")
|
|
175
|
+
dr = Duckrun.connect("workspace/database.Database")
|
|
176
|
+
|
|
138
177
|
# Fast connection without table listing (token only)
|
|
139
178
|
dr = Duckrun.connect("workspace/lakehouse.lakehouse", token_only=True)
|
|
140
179
|
|
|
141
180
|
Note:
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
181
|
+
- Lakehouse items without spaces in workspace name use optimization (no API calls)
|
|
182
|
+
- Non-lakehouse items always resolve to GUIDs via Fabric API
|
|
183
|
+
- Internally constructs proper ABFSS URLs:
|
|
184
|
+
"My Workspace/My Item.lakehouse/schema" →
|
|
185
|
+
"abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/item_guid/Tables/schema"
|
|
145
186
|
"""
|
|
146
187
|
|
|
147
188
|
# Check if it's a workspace-only connection (no "/" means workspace name only)
|
|
@@ -150,70 +191,94 @@ class Duckrun:
|
|
|
150
191
|
|
|
151
192
|
scan_all_schemas = False
|
|
152
193
|
|
|
153
|
-
# Parse
|
|
154
|
-
# Support workspace and
|
|
194
|
+
# Parse connection string: "ws/item_name.item_type/schema" or "ws/item_name.item_type"
|
|
195
|
+
# Support workspace and item names with spaces
|
|
196
|
+
# Item types: .lakehouse, .Lakehouse, .warehouse, .Warehouse, .database, .Database, .snowflakedatabase, .SnowflakeDatabase
|
|
155
197
|
parts = connection_string.split("/")
|
|
156
198
|
if len(parts) == 2:
|
|
157
|
-
workspace_name,
|
|
199
|
+
workspace_name, item_name_with_type = parts
|
|
158
200
|
scan_all_schemas = True
|
|
159
201
|
schema = "dbo"
|
|
160
202
|
elif len(parts) == 3:
|
|
161
|
-
workspace_name,
|
|
203
|
+
workspace_name, item_name_with_type, schema = parts
|
|
162
204
|
else:
|
|
163
205
|
raise ValueError(
|
|
164
206
|
f"Invalid connection string format: '{connection_string}'. "
|
|
165
207
|
"Expected formats:\n"
|
|
166
208
|
" 'workspace name' (workspace management only)\n"
|
|
167
|
-
" 'workspace name/
|
|
168
|
-
" 'workspace name/
|
|
209
|
+
" 'workspace name/item name.item_type' (item with dbo schema)\n"
|
|
210
|
+
" 'workspace name/item name.item_type/schema' (item with specific schema)\n"
|
|
211
|
+
"Supported item types: .lakehouse, .warehouse, .database, .snowflakedatabase (case-insensitive)"
|
|
169
212
|
)
|
|
170
213
|
|
|
171
|
-
|
|
172
|
-
|
|
214
|
+
# Extract item type and name
|
|
215
|
+
item_type = None
|
|
216
|
+
item_name = item_name_with_type
|
|
173
217
|
|
|
174
|
-
|
|
218
|
+
# Check for known item types (case-insensitive)
|
|
219
|
+
item_type_map = {
|
|
220
|
+
'.lakehouse': 'Lakehouse',
|
|
221
|
+
'.warehouse': 'Warehouse',
|
|
222
|
+
'.database': 'Database',
|
|
223
|
+
'.snowflakedatabase': 'SnowflakeDatabase'
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
# Parse item type and normalize the suffix to proper case
|
|
227
|
+
item_name_normalized = item_name_with_type
|
|
228
|
+
for suffix, mapped_type in item_type_map.items():
|
|
229
|
+
if item_name_with_type.lower().endswith(suffix):
|
|
230
|
+
item_type = mapped_type
|
|
231
|
+
item_name = item_name_with_type[:-len(suffix)]
|
|
232
|
+
# Normalize to proper case: ItemName.ItemType (e.g., data.Lakehouse)
|
|
233
|
+
item_name_normalized = f"{item_name}.{mapped_type}"
|
|
234
|
+
break
|
|
235
|
+
|
|
236
|
+
if not workspace_name or not item_name:
|
|
175
237
|
raise ValueError(
|
|
176
238
|
"Missing required parameters. Use one of these formats:\n"
|
|
177
239
|
" connect('workspace name') # workspace management\n"
|
|
178
|
-
" connect('workspace name/
|
|
179
|
-
" connect('workspace name/
|
|
240
|
+
" connect('workspace name/item name.item_type/schema') # full item connection\n"
|
|
241
|
+
" connect('workspace name/item name.item_type') # defaults to dbo"
|
|
180
242
|
)
|
|
181
243
|
|
|
182
|
-
#
|
|
183
|
-
|
|
244
|
+
# Per OneLake API docs: Can use friendly names if no spaces/special characters
|
|
245
|
+
# Otherwise must resolve to GUIDs
|
|
246
|
+
# Check for spaces or special characters that would require GUID resolution
|
|
247
|
+
has_special_chars = " " in workspace_name or " " in item_name
|
|
184
248
|
|
|
185
|
-
|
|
249
|
+
if has_special_chars:
|
|
250
|
+
# Names have spaces/special chars: resolve to GUIDs via API
|
|
251
|
+
workspace_id, item_id = cls._resolve_names_to_guids(workspace_name, item_name, item_type)
|
|
252
|
+
else:
|
|
253
|
+
# No spaces/special chars: use friendly names directly (works for all item types)
|
|
254
|
+
# Use normalized name with proper case for API compatibility
|
|
255
|
+
workspace_id = workspace_name
|
|
256
|
+
item_id = item_name_normalized # Use normalized with proper case
|
|
257
|
+
|
|
258
|
+
return cls(workspace_id, item_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account, token_only)
|
|
186
259
|
|
|
187
260
|
@classmethod
|
|
188
|
-
def _resolve_names_to_guids(cls, workspace_name: str,
|
|
261
|
+
def _resolve_names_to_guids(cls, workspace_name: str, item_name: str, item_type: Optional[str] = 'Lakehouse') -> tuple[str, str]:
|
|
189
262
|
"""
|
|
190
|
-
Resolve friendly workspace and
|
|
191
|
-
|
|
192
|
-
Optimization: If names don't contain spaces, use them directly (no API calls needed).
|
|
193
|
-
Only resolve to GUIDs when names contain spaces or are already GUIDs.
|
|
263
|
+
Resolve friendly workspace and item names to their GUIDs.
|
|
194
264
|
|
|
195
265
|
Args:
|
|
196
266
|
workspace_name: Display name of the workspace (can contain spaces)
|
|
197
|
-
|
|
267
|
+
item_name: Display name of the item (can contain spaces)
|
|
268
|
+
item_type: Type of item - 'Lakehouse', 'Warehouse', 'Database', 'SnowflakeDatabase', etc.
|
|
198
269
|
|
|
199
270
|
Returns:
|
|
200
|
-
Tuple of (workspace_id,
|
|
271
|
+
Tuple of (workspace_id, item_id) - resolved GUIDs
|
|
201
272
|
"""
|
|
202
273
|
|
|
203
274
|
# Check if names are already GUIDs first
|
|
204
275
|
import re
|
|
205
276
|
guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
|
|
206
277
|
|
|
207
|
-
if guid_pattern.match(workspace_name) and guid_pattern.match(
|
|
208
|
-
return workspace_name,
|
|
209
|
-
|
|
210
|
-
# Optimization: If workspace name has no spaces, use both names directly (old behavior)
|
|
211
|
-
# Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
|
|
212
|
-
if " " not in workspace_name:
|
|
213
|
-
return workspace_name, lakehouse_name
|
|
214
|
-
|
|
215
|
-
# Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
|
|
278
|
+
if guid_pattern.match(workspace_name) and guid_pattern.match(item_name):
|
|
279
|
+
return workspace_name, item_name
|
|
216
280
|
|
|
281
|
+
# Need to resolve to GUIDs via API
|
|
217
282
|
try:
|
|
218
283
|
# Get authentication token using enhanced auth system
|
|
219
284
|
from .auth import get_fabric_api_token
|
|
@@ -231,8 +296,7 @@ class Duckrun:
|
|
|
231
296
|
|
|
232
297
|
# Resolve workspace name to ID
|
|
233
298
|
if current_workspace_id:
|
|
234
|
-
# In notebook environment,
|
|
235
|
-
# but we should validate it matches the requested workspace name
|
|
299
|
+
# In notebook environment, validate it matches the requested workspace name
|
|
236
300
|
workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
|
|
237
301
|
if not workspace_id:
|
|
238
302
|
# Fallback to current workspace if name resolution fails
|
|
@@ -244,21 +308,26 @@ class Duckrun:
|
|
|
244
308
|
if not workspace_id:
|
|
245
309
|
raise ValueError(f"Workspace '{workspace_name}' not found")
|
|
246
310
|
|
|
247
|
-
# Resolve
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
311
|
+
# Resolve item name to ID based on item type
|
|
312
|
+
if item_type == 'Lakehouse':
|
|
313
|
+
item_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, item_name)
|
|
314
|
+
else:
|
|
315
|
+
# Use generic item resolver for non-lakehouse items
|
|
316
|
+
item_id = cls._resolve_item_id_by_name(token, workspace_id, item_name, item_type)
|
|
251
317
|
|
|
252
|
-
|
|
318
|
+
if not item_id:
|
|
319
|
+
raise ValueError(f"{item_type} '{item_name}' not found in workspace '{workspace_name}'")
|
|
320
|
+
|
|
321
|
+
return workspace_id, item_id
|
|
253
322
|
|
|
254
323
|
except Exception as e:
|
|
255
324
|
print(f"❌ Failed to resolve names to GUIDs: {e}")
|
|
256
|
-
print(f"❌ Cannot
|
|
257
|
-
print("❌ Microsoft Fabric requires actual workspace and
|
|
325
|
+
print(f"❌ Cannot resolve '{workspace_name}'/'{item_name}' ({item_type}) to GUIDs")
|
|
326
|
+
print("❌ Microsoft Fabric requires actual workspace and item GUIDs for ABFSS access")
|
|
258
327
|
raise ValueError(
|
|
259
|
-
f"Unable to resolve workspace '{workspace_name}' and
|
|
260
|
-
f"ABFSS URLs require actual GUIDs
|
|
261
|
-
f"Please ensure you have proper authentication and the workspace/
|
|
328
|
+
f"Unable to resolve workspace '{workspace_name}' and {item_type.lower()} '{item_name}' to GUIDs. "
|
|
329
|
+
f"ABFSS URLs require actual GUIDs. "
|
|
330
|
+
f"Please ensure you have proper authentication and the workspace/item names are correct."
|
|
262
331
|
)
|
|
263
332
|
|
|
264
333
|
@classmethod
|
|
@@ -300,6 +369,58 @@ class Duckrun:
|
|
|
300
369
|
return None
|
|
301
370
|
except Exception:
|
|
302
371
|
return None
|
|
372
|
+
|
|
373
|
+
@classmethod
|
|
374
|
+
def _resolve_item_id_by_name(cls, token: str, workspace_id: str, item_name: str, item_type: str) -> Optional[str]:
    """
    Get item ID from display name within a workspace using generic items API.
    Works for any item type: Warehouse, Database, SnowflakeDatabase, etc.

    All result pages are fetched (the API returns a ``continuationToken``
    while more items remain) so an item beyond the first page is still found.
    Every HTTP call is bounded by a timeout so a stalled connection cannot
    block the caller forever.

    Args:
        token: Fabric API authentication token
        workspace_id: Workspace GUID
        item_name: Display name of the item
        item_type: Type of item (e.g., 'Warehouse', 'Database', 'SnowflakeDatabase')

    Returns:
        Item GUID if found, None otherwise (including on any request error,
        which is printed rather than raised)
    """
    request_timeout = 30  # seconds per HTTP call

    try:
        import requests

        # Use generic items API with type filter
        url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items"
        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

        # Add type filter as query parameter
        params = {"type": item_type}

        print(f" Searching for {item_type} '{item_name}' in workspace {workspace_id}")
        print(f" API URL: {url}?type={item_type}")

        items = []
        while True:
            response = requests.get(url, headers=headers, params=params, timeout=request_timeout)
            response.raise_for_status()

            result = response.json()
            items.extend(result.get("value", []))

            # A continuation token means more pages remain for this query.
            continuation_token = result.get("continuationToken")
            if not continuation_token:
                break
            params = {"type": item_type, "continuationToken": continuation_token}

        print(f" Found {len(items)} items of type {item_type}")
        if items:
            print(f" Available items: {[item.get('displayName') for item in items]}")

        for item in items:
            if item.get("displayName") == item_name:
                item_id = item.get("id")
                print(f" Found matching item: {item_name} -> {item_id}")
                return item_id

        print(f" Item '{item_name}' not found in the list")
        return None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller decide
        # how to handle a missing ID.
        print(f" Error resolving {item_type} item: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f" Response status: {e.response.status_code}")
            print(f" Response body: {e.response.text}")
        return None
|
|
303
424
|
|
|
304
425
|
@classmethod
|
|
305
426
|
def connect_workspace(cls, workspace_name: str):
|
|
@@ -341,77 +462,138 @@ class Duckrun:
|
|
|
341
462
|
|
|
342
463
|
def _discover_tables_fast(self) -> List[Tuple[str, str]]:
    """
    Fast table discovery using OneLake Delta Table API (Unity Catalog compatible).
    Uses: https://learn.microsoft.com/en-us/fabric/onelake/table-apis/delta-table-apis-overview

    When ``self.scan_all_schemas`` is set, every schema reported by the API is
    listed; otherwise only ``self.schema`` is queried. Failures are printed
    and result in an empty (or partial) list rather than an exception.

    Returns:
        List of tuples: [(schema, table_name), ...]
    """
    request_timeout = 30  # seconds per HTTP call, so a stalled request cannot hang discovery

    try:
        # Get storage token for OneLake
        token = self._get_storage_token()
        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
            print("Authenticating with Azure for table discovery...")
            from .auth import get_token
            token = get_token()
            if not token:
                print("❌ Failed to authenticate for table discovery")
                return []

        # OneLake Delta Table API endpoint (Unity Catalog compatible)
        base_url = "https://onelake.table.fabric.microsoft.com/delta"

        # The same identifiers are used whether the instance holds GUIDs or
        # friendly names (lakehouse_id already carries the .ItemType suffix
        # in the friendly-name case), so no GUID detection is needed here.
        workspace_identifier = self.workspace_id
        item_identifier = self.lakehouse_id
        catalog_name = self.lakehouse_id

        api_root = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog"
        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

        print("🔍 Discovering tables via OneLake Delta Table API...")
        print(f" Using identifier: {workspace_identifier}/{item_identifier}")

        def request_tables(schema_name):
            # Issue the table-listing call for one schema and return the raw response.
            return requests.get(
                f"{api_root}/tables",
                headers=headers,
                params={"catalog_name": catalog_name, "schema_name": schema_name},
                timeout=request_timeout,
            )

        tables_found = []

        if self.scan_all_schemas:
            # First, list all schemas
            schemas_response = requests.get(
                f"{api_root}/schemas",
                headers=headers,
                params={"catalog_name": catalog_name},
                timeout=request_timeout,
            )

            if schemas_response.status_code == 200:
                schemas = schemas_response.json().get("schemas", [])
                schema_names = [s.get("name") for s in schemas if s.get("name")]

                print(f" Found {len(schema_names)} schemas: {schema_names}")

                # Get tables from each schema
                for schema_name in schema_names:
                    tables_response = request_tables(schema_name)

                    if tables_response.status_code == 200:
                        tables = tables_response.json().get("tables", [])

                        for table in tables:
                            table_name = table.get("name", "")
                            if table_name:
                                tables_found.append((schema_name, table_name))

                        if tables:
                            print(f" Schema '{schema_name}': {len(tables)} tables")
                    else:
                        # Report the skipped schema so missing tables can be
                        # traced back to the failing request.
                        print(f" Failed to list tables in schema '{schema_name}': {tables_response.status_code}")
            else:
                print(f" Failed to list schemas: {schemas_response.status_code}")
                if schemas_response.status_code != 404:
                    print(f" Response: {schemas_response.text[:300]}")
        else:
            # Single schema mode - list tables in specific schema
            print(f" Listing tables in schema: {self.schema}")
            tables_response = request_tables(self.schema)

            if tables_response.status_code == 200:
                tables = tables_response.json().get("tables", [])

                for table in tables:
                    table_name = table.get("name", "")
                    if table_name:
                        tables_found.append((self.schema, table_name))

                print(f" Found {len(tables)} tables")
            elif tables_response.status_code == 404:
                print(f" Schema '{self.schema}' not found or has no tables")
            else:
                print(f" Failed to list tables: {tables_response.status_code}")
                print(f" Response: {tables_response.text[:300]}")

        return tables_found

    except Exception as e:
        # Discovery is best-effort: log the full traceback for diagnosis and
        # report "no tables" instead of propagating the error.
        print(f"❌ Error during table discovery: {e}")
        import traceback
        traceback.print_exc()
        return []
|
|
406
583
|
|
|
407
584
|
def _attach_lakehouse(self):
|
|
408
585
|
"""Attach lakehouse tables as DuckDB views using fast discovery"""
|
|
586
|
+
print(f"🔌 Attaching tables from schema: {self.schema if not self.scan_all_schemas else 'all schemas'}")
|
|
409
587
|
self._create_onelake_secret()
|
|
410
588
|
|
|
411
589
|
try:
|
|
412
590
|
tables = self._discover_tables_fast()
|
|
413
591
|
|
|
414
592
|
if not tables:
|
|
593
|
+
if self.scan_all_schemas:
|
|
594
|
+
print(f"⚠️ No tables found in any schema")
|
|
595
|
+
else:
|
|
596
|
+
print(f"⚠️ No tables found in {self.schema} schema")
|
|
415
597
|
return
|
|
416
598
|
|
|
417
599
|
# Collect table names for display
|
|
@@ -434,6 +616,7 @@ class Duckrun:
|
|
|
434
616
|
AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
|
|
435
617
|
""")
|
|
436
618
|
except Exception as e:
|
|
619
|
+
print(f"⚠️ Failed to attach table {schema_name}.{table_name}: {e}")
|
|
437
620
|
continue
|
|
438
621
|
|
|
439
622
|
# Print discovered tables as comma-separated list
|
|
@@ -442,6 +625,8 @@ class Duckrun:
|
|
|
442
625
|
|
|
443
626
|
except Exception as e:
|
|
444
627
|
print(f"❌ Error attaching lakehouse: {e}")
|
|
628
|
+
import traceback
|
|
629
|
+
traceback.print_exc()
|
|
445
630
|
|
|
446
631
|
def _register_lookup_functions(self):
|
|
447
632
|
"""
|
|
@@ -130,9 +130,12 @@ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
|
|
|
130
130
|
|
|
131
131
|
# Get original and resolved names
|
|
132
132
|
original_workspace = duckrun_instance.workspace
|
|
133
|
-
original_lakehouse = duckrun_instance.
|
|
133
|
+
original_lakehouse = duckrun_instance.lakehouse_display_name # Base name without suffix (e.g., "data")
|
|
134
134
|
resolved_workspace = duckrun_instance.workspace_id
|
|
135
|
-
|
|
135
|
+
|
|
136
|
+
# Always pass base lakehouse name (without .Lakehouse suffix) to user functions
|
|
137
|
+
# User functions expect just the name like "data", not "data.Lakehouse"
|
|
138
|
+
resolved_lakehouse = duckrun_instance.lakehouse_display_name
|
|
136
139
|
|
|
137
140
|
# Substitute workspace/lakehouse names in args if they differ
|
|
138
141
|
# This prevents URL encoding issues when names contain spaces
|
|
@@ -149,7 +152,7 @@ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
|
|
|
149
152
|
else:
|
|
150
153
|
substituted_args.append(arg)
|
|
151
154
|
args = tuple(substituted_args)
|
|
152
|
-
print(f"📝 Auto-substituted workspace/lakehouse names in args
|
|
155
|
+
print(f"📝 Auto-substituted workspace/lakehouse names in args")
|
|
153
156
|
|
|
154
157
|
print(f"Running Python: {name}{args}")
|
|
155
158
|
result = func(*args)
|
|
@@ -282,12 +285,17 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
|
|
|
282
285
|
# If GUID, use just the GUID
|
|
283
286
|
content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
|
|
284
287
|
else:
|
|
285
|
-
# If not GUID,
|
|
286
|
-
|
|
288
|
+
# If not GUID, check if lakehouse_name already has .ItemType suffix
|
|
289
|
+
if duckrun_instance.lakehouse_name.endswith(('.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase')):
|
|
290
|
+
# Already has suffix - use as is
|
|
291
|
+
content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
|
|
292
|
+
else:
|
|
293
|
+
# No suffix - add .Lakehouse for legacy format
|
|
294
|
+
content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
|
|
287
295
|
|
|
288
296
|
full_params = {
|
|
289
297
|
'ws': duckrun_instance.workspace,
|
|
290
|
-
'lh': duckrun_instance.
|
|
298
|
+
'lh': duckrun_instance.lakehouse_display_name, # Use display name (without suffix) for backward compat
|
|
291
299
|
'schema': duckrun_instance.schema,
|
|
292
300
|
'storage_account': duckrun_instance.storage_account,
|
|
293
301
|
'tables_url': duckrun_instance.table_base_url,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.2.14.dev2
|
|
3
|
+
Version: 0.2.14.dev3
|
|
4
4
|
Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
|
|
|
10
10
|
Requires-Python: >=3.9
|
|
11
11
|
Description-Content-Type: text/markdown
|
|
12
12
|
License-File: LICENSE
|
|
13
|
-
Requires-Dist: duckdb>=1.2.
|
|
13
|
+
Requires-Dist: duckdb>=1.2.0
|
|
14
14
|
Requires-Dist: deltalake<=0.18.2
|
|
15
15
|
Requires-Dist: requests>=2.28.0
|
|
16
16
|
Requires-Dist: obstore>=0.2.0
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.14.dev2"
|
|
7
|
+
version = "0.2.14.dev3"
|
|
8
8
|
description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -13,7 +13,7 @@ authors = [
|
|
|
13
13
|
]
|
|
14
14
|
requires-python = ">=3.9"
|
|
15
15
|
dependencies = [
|
|
16
|
-
"duckdb>=1.2.
|
|
16
|
+
"duckdb>=1.2.0",
|
|
17
17
|
"deltalake<=0.18.2",
|
|
18
18
|
"requests>=2.28.0",
|
|
19
19
|
"obstore>=0.2.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|