duckrun 0.2.16.dev2__tar.gz → 0.2.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckrun might be problematic.
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/PKG-INFO +2 -2
- duckrun-0.2.18/duckrun/__init__.py +11 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/core.py +87 -50
- duckrun-0.2.18/duckrun/notebook.py +324 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/semantic_model.py +143 -17
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/stats.py +104 -16
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun.egg-info/PKG-INFO +2 -2
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun.egg-info/SOURCES.txt +1 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/pyproject.toml +2 -2
- duckrun-0.2.16.dev2/duckrun/__init__.py +0 -10
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/LICENSE +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/README.md +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/auth.py +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/files.py +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/runner.py +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun/writer.py +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.16.dev2 → duckrun-0.2.18}/setup.cfg +0 -0

--- duckrun-0.2.16.dev2/PKG-INFO
+++ duckrun-0.2.18/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.16.dev2
-Summary:
+Version: 0.2.18
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun

--- /dev/null
+++ duckrun-0.2.18/duckrun/__init__.py
@@ -0,0 +1,11 @@
+"""Duckrun - Lakehouse task runner powered by DuckDB"""
+
+from duckrun.core import Duckrun
+from duckrun.notebook import import_notebook_from_web, import_notebook
+
+__version__ = "0.2.18"
+
+# Expose unified connect method at module level
+connect = Duckrun.connect
+
+__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
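
A minimal usage sketch of the new module-level API (not taken from the diff itself; the workspace and lakehouse names are placeholders, and duckrun 0.2.18 is assumed to be installed):

    import duckrun

    # duckrun.connect is now an alias for Duckrun.connect
    con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # the notebook helpers are importable straight from the package root
    duckrun.import_notebook_from_web(
        url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
        workspace_name="My Workspace",
    )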

--- duckrun-0.2.16.dev2/duckrun/core.py
+++ duckrun-0.2.18/duckrun/core.py
@@ -12,7 +12,71 @@ from .runner import run as _run
 from .files import copy as _copy, download as _download
 from .writer import QueryResult
 
-class Duckrun:
+
+class WorkspaceOperationsMixin:
+    """
+    Mixin class for workspace-level operations that work for both
+    full Duckrun connections and workspace-only connections.
+    """
+
+    def import_notebook_from_web(self, url: str,
+                                 notebook_name: Optional[str] = None,
+                                 overwrite: bool = False) -> dict:
+        """
+        Import a Jupyter notebook from a web URL into the workspace.
+
+        Args:
+            url: URL to the notebook file (e.g., GitHub raw URL). Required.
+            notebook_name: Name for the imported notebook. Optional - derived from URL if not provided.
+            overwrite: Whether to overwrite if notebook already exists (default: False)
+
+        Returns:
+            Dictionary with import result
+
+        Examples:
+            con = duckrun.connect("workspace/lakehouse.lakehouse")
+            result = con.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+
+            ws = duckrun.connect("workspace")
+            result = ws.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+        """
+        from .notebook import import_notebook_from_web as _import_notebook_from_web
+
+        # Get workspace name from either self.workspace or self.workspace_name
+        workspace_name = getattr(self, 'workspace', None) or getattr(self, 'workspace_name', None)
+
+        return _import_notebook_from_web(
+            url=url,
+            notebook_name=notebook_name,
+            overwrite=overwrite,
+            workspace_name=workspace_name
+        )
+
+    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+        """Helper method to get workspace ID from name"""
+        try:
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+
+        except Exception:
+            return None
+
+
+class Duckrun(WorkspaceOperationsMixin):
     """
     OneLake task runner with clean tuple-based API.
     Supports lakehouses, warehouses, databases, and other OneLake items.
@@ -971,12 +1035,13 @@ class Duckrun:
         """Get underlying DuckDB connection"""
         return self.con
 
-    def get_stats(self, source: str):
+    def get_stats(self, source: str = None):
         """
         Get comprehensive statistics for Delta Lake tables.
 
         Args:
-            source: Can be one of:
+            source: Optional. Can be one of:
+                - None: Use all tables in the connection's schema (default)
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
@@ -988,6 +1053,9 @@ class Duckrun:
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")
 
+            # All tables in current schema (aemo)
+            stats = con.get_stats()
+
             # Single table in current schema
             stats = con.get_stats('price')
 
@@ -1111,7 +1179,7 @@ class Duckrun:
             return False
 
     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-               wait_seconds: int = 5) -> int:
+               wait_seconds: int = 5, refresh: str = "full") -> int:
         """
         Deploy a semantic model from a BIM file using DirectLake mode.
 
@@ -1120,8 +1188,11 @@ class Duckrun:
                 - URL: "https://raw.githubusercontent.com/.../model.bim"
                 - Local file: "model.bim"
                 - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default:
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
+            refresh: Refresh strategy:
+                - "full": Clear values and process full refresh (default)
+                - "ignore": Skip refresh entirely
 
         Returns:
             1 for success, 0 for failure
@@ -1129,14 +1200,17 @@ class Duckrun:
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
 
+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
+
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model") # Creates "Source Model"
 
             # Deploy with custom name
-            dr.deploy("
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
 
-            # Deploy
-            dr.deploy("https://
+            # Deploy without refresh
+            dr.deploy("https://github.com/.../model.bim", refresh="ignore")
         """
         from .semantic_model import deploy_semantic_model
 
@@ -1148,9 +1222,9 @@ class Duckrun:
             if len(parts) == 2:
                 dataset_name = parts[1] # Use the model name
             else:
-                dataset_name =
+                dataset_name = self.schema # Use schema name
         else:
-            dataset_name =
+            dataset_name = self.schema # Use schema name
 
         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
@@ -1159,28 +1233,10 @@ class Duckrun:
             schema_name=self.schema,
             dataset_name=dataset_name,
             bim_url_or_path=bim_url,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )
 
-    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-        """Helper method to get workspace ID from name"""
-        try:
-            url = "https://api.fabric.microsoft.com/v1/workspaces"
-            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-
-            workspaces = response.json().get("value", [])
-            for workspace in workspaces:
-                if workspace.get("displayName") == workspace_name:
-                    return workspace.get("id")
-
-            return None
-
-        except Exception:
-            return None
-
     def close(self):
         """Close DuckDB connection"""
         if self.con:
@@ -1188,7 +1244,7 @@ class Duckrun:
             print("Connection closed")
 
 
-class WorkspaceConnection:
+class WorkspaceConnection(WorkspaceOperationsMixin):
     """
     Simple workspace connection for lakehouse management operations.
     """
@@ -1428,23 +1484,4 @@ class WorkspaceConnection:
             print(f"❌ Error downloading semantic model: {e}")
             import traceback
             traceback.print_exc()
-            return None
-
-    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-        """Helper method to get workspace ID from name"""
-        try:
-            url = "https://api.fabric.microsoft.com/v1/workspaces"
-            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-
-            workspaces = response.json().get("value", [])
-            for workspace in workspaces:
-                if workspace.get("displayName") == workspace_name:
-                    return workspace.get("id")
-
-            return None
-
-        except Exception:
             return None
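
The `_get_workspace_id_by_name` helper that used to be duplicated on `Duckrun` and `WorkspaceConnection` now lives once on `WorkspaceOperationsMixin`, which both classes inherit. A short sketch of what that buys the caller (the connection strings are placeholders; this snippet is not part of the package):

    import duckrun

    lakehouse_con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")  # full Duckrun connection
    workspace_con = duckrun.connect("My Workspace")                         # workspace-only connection

    # both objects expose the same mixin method
    for c in (lakehouse_con, workspace_con):
        c.import_notebook_from_web(
            url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
        )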

--- /dev/null
+++ duckrun-0.2.18/duckrun/notebook.py
@@ -0,0 +1,324 @@
+"""
+Notebook operations functionality for duckrun - Import notebooks from web using Fabric REST API
+"""
+import requests
+import base64
+from typing import Optional
+
+
+def import_notebook_from_web(
+    url: str,
+    notebook_name: Optional[str] = None,
+    overwrite: bool = False,
+    workspace_name: Optional[str] = None
+) -> dict:
+    """
+    Import a Jupyter notebook from a web URL into Microsoft Fabric workspace using REST API only.
+    Uses duckrun.connect context by default or explicit workspace name.
+
+    Args:
+        url: URL to the notebook file (e.g., GitHub raw URL). Required.
+        notebook_name: Name for the imported notebook in Fabric. Optional - will use filename from URL if not provided.
+        overwrite: Whether to overwrite if notebook already exists (default: False)
+        workspace_name: Target workspace name. Optional - will use current workspace from duckrun context if available.
+
+    Returns:
+        Dictionary with import result:
+        {
+            "success": bool,
+            "message": str,
+            "notebook": dict (if successful),
+            "overwritten": bool
+        }
+
+    Examples:
+        # Basic usage with duckrun context
+        import duckrun
+        dr = duckrun.connect("MyWorkspace/MyLakehouse.lakehouse")
+        from duckrun.notebook import import_notebook_from_web
+
+        result = import_notebook_from_web(
+            url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
+            notebook_name="MyNotebook"
+        )
+
+        # With explicit workspace
+        result = import_notebook_from_web(
+            url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
+            notebook_name="MyNotebook",
+            workspace_name="Analytics Workspace",
+            overwrite=True
+        )
+
+        # Minimal usage - derives name from URL
+        result = import_notebook_from_web(
+            url="https://raw.githubusercontent.com/user/repo/main/RunPerfScenario.ipynb"
+        )
+    """
+    try:
+        # Get authentication token
+        from duckrun.auth import get_fabric_api_token
+        token = get_fabric_api_token()
+        if not token:
+            return {
+                "success": False,
+                "message": "Failed to get authentication token",
+                "notebook": None,
+                "overwritten": False
+            }
+
+        base_url = "https://api.fabric.microsoft.com/v1"
+        headers = {
+            "Authorization": f"Bearer {token}",
+            "Content-Type": "application/json"
+        }
+
+        # Determine workspace ID
+        workspace_id = None
+
+        # Try to get from duckrun context if not provided
+        if not workspace_name:
+            try:
+                # Try to get from notebook context first
+                import notebookutils # type: ignore
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+                print("📓 Using current workspace from Fabric notebook context")
+            except (ImportError, Exception):
+                # Not in notebook, try to get from environment/last connection
+                pass
+
+        # If still no workspace_id, resolve from workspace_name
+        if not workspace_id:
+            if not workspace_name:
+                return {
+                    "success": False,
+                    "message": "workspace_name must be provided when not in Fabric notebook context",
+                    "notebook": None,
+                    "overwritten": False
+                }
+
+            # Get workspace ID by name
+            print(f"🔍 Resolving workspace: {workspace_name}")
+            ws_url = f"{base_url}/workspaces"
+            response = requests.get(ws_url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            workspace = next((ws for ws in workspaces if ws.get("displayName") == workspace_name), None)
+
+            if not workspace:
+                return {
+                    "success": False,
+                    "message": f"Workspace '{workspace_name}' not found",
+                    "notebook": None,
+                    "overwritten": False
+                }
+
+            workspace_id = workspace.get("id")
+            print(f"✓ Found workspace: {workspace_name}")
+
+        # Derive notebook name from URL if not provided
+        if not notebook_name:
+            # Extract filename from URL
+            notebook_name = url.split("/")[-1]
+            if notebook_name.endswith(".ipynb"):
+                notebook_name = notebook_name[:-6] # Remove .ipynb extension
+            print(f"📝 Using notebook name from URL: {notebook_name}")
+
+        # Check if notebook already exists
+        notebooks_url = f"{base_url}/workspaces/{workspace_id}/notebooks"
+        response = requests.get(notebooks_url, headers=headers)
+        response.raise_for_status()
+
+        notebooks = response.json().get("value", [])
+        existing_notebook = next((nb for nb in notebooks if nb.get("displayName") == notebook_name), None)
+
+        if existing_notebook and not overwrite:
+            return {
+                "success": True,
+                "message": f"Notebook '{notebook_name}' already exists (use overwrite=True to replace)",
+                "notebook": existing_notebook,
+                "overwritten": False
+            }
+
+        # Download notebook content from URL
+        print(f"⬇️ Downloading notebook from: {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        notebook_content = response.text
+        print(f"✓ Notebook downloaded successfully")
+
+        # Convert notebook content to base64
+        notebook_base64 = base64.b64encode(notebook_content.encode('utf-8')).decode('utf-8')
+
+        # Prepare the payload for creating/updating the notebook
+        if existing_notebook and overwrite:
+            # Update existing notebook
+            notebook_id = existing_notebook.get("id")
+            print(f"🔄 Updating existing notebook: {notebook_name}")
+
+            update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
+            payload = {
+                "definition": {
+                    "format": "ipynb",
+                    "parts": [
+                        {
+                            "path": "notebook-content.py",
+                            "payload": notebook_base64,
+                            "payloadType": "InlineBase64"
+                        }
+                    ]
+                }
+            }
+
+            response = requests.post(update_url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            # Handle long-running operation
+            if response.status_code == 202:
+                operation_id = response.headers.get('x-ms-operation-id')
+                if operation_id:
+                    _wait_for_operation(operation_id, headers)
+
+            return {
+                "success": True,
+                "message": f"Notebook '{notebook_name}' updated successfully",
+                "notebook": existing_notebook,
+                "overwritten": True
+            }
+        else:
+            # Create new notebook
+            print(f"➕ Creating new notebook: {notebook_name}")
+
+            payload = {
+                "displayName": notebook_name,
+                "definition": {
+                    "format": "ipynb",
+                    "parts": [
+                        {
+                            "path": "notebook-content.py",
+                            "payload": notebook_base64,
+                            "payloadType": "InlineBase64"
+                        }
+                    ]
+                }
+            }
+
+            response = requests.post(notebooks_url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            # Handle long-running operation
+            if response.status_code == 202:
+                operation_id = response.headers.get('x-ms-operation-id')
+                if operation_id:
+                    _wait_for_operation(operation_id, headers)
+
+            created_notebook = response.json()
+
+            return {
+                "success": True,
+                "message": f"Notebook '{notebook_name}' created successfully",
+                "notebook": created_notebook,
+                "overwritten": False
+            }
+
+    except requests.exceptions.RequestException as e:
+        return {
+            "success": False,
+            "message": f"HTTP Error: {str(e)}",
+            "notebook": None,
+            "overwritten": False
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "message": f"Error: {str(e)}",
+            "notebook": None,
+            "overwritten": False
+        }
+
+
+def _wait_for_operation(operation_id: str, headers: dict, max_attempts: int = 30) -> bool:
+    """
+    Wait for a long-running Fabric API operation to complete.
+
+    Args:
+        operation_id: The operation ID to monitor
+        headers: Request headers with authentication
+        max_attempts: Maximum number of polling attempts (default: 30)
+
+    Returns:
+        True if operation succeeded, False otherwise
+    """
+    import time
+
+    status_url = f"https://api.fabric.microsoft.com/v1/operations/{operation_id}"
+
+    for attempt in range(max_attempts):
+        time.sleep(2)
+
+        try:
+            response = requests.get(status_url, headers=headers)
+            response.raise_for_status()
+
+            status_data = response.json()
+            status = status_data.get('status')
+
+            if status == 'Succeeded':
+                print(f"✓ Operation completed successfully")
+                return True
+            elif status == 'Failed':
+                error = status_data.get('error', {})
+                print(f"❌ Operation failed: {error.get('message', 'Unknown error')}")
+                return False
+            else:
+                print(f"⏳ Operation in progress... ({status})")
+
+        except Exception as e:
+            print(f"⚠️ Error checking operation status: {e}")
+            return False
+
+    print(f"⚠️ Operation timed out after {max_attempts} attempts")
+    return False
+
+
+# Convenience wrapper for the try-except pattern mentioned in the request
+def import_notebook(
+    url: str,
+    notebook_name: Optional[str] = None,
+    overwrite: bool = False,
+    workspace_name: Optional[str] = None
+) -> None:
+    """
+    Convenience wrapper that prints results and handles errors.
+
+    Args:
+        url: URL to the notebook file
+        notebook_name: Name for the imported notebook
+        overwrite: Whether to overwrite if exists
+        workspace_name: Target workspace name
+
+    Examples:
+        from duckrun.notebook import import_notebook
+
+        import_notebook(
+            url="https://raw.githubusercontent.com/djouallah/fabric_demo/refs/heads/main/Benchmark/RunPerfScenario.ipynb",
+            notebook_name="RunPerfScenario",
+            overwrite=False
+        )
+    """
+    try:
+        result = import_notebook_from_web(
+            url=url,
+            notebook_name=notebook_name,
+            overwrite=overwrite,
+            workspace_name=workspace_name
+        )
+
+        if result["success"]:
+            print(f"✅ {result['message']}")
+        else:
+            print(f"❌ {result['message']}")
+
+    except Exception as e:
+        print(f"Error: {e}")
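
For interactive use, the `import_notebook` wrapper prints the outcome instead of returning the result dictionary. A hedged usage sketch (the workspace name is a placeholder; when run inside a Fabric notebook the workspace can be omitted, since it is picked up from notebookutils):

    from duckrun.notebook import import_notebook

    import_notebook(
        url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
        notebook_name="MyNotebook",
        workspace_name="Analytics Workspace",
        overwrite=False,
    )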

--- duckrun-0.2.16.dev2/duckrun/semantic_model.py
+++ duckrun-0.2.18/duckrun/semantic_model.py
@@ -129,29 +129,136 @@ def check_dataset_exists(dataset_name, workspace_id, client):
     return False
 
 
-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """
+
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print(" Ignoring refresh - skipping refresh")
+        return
 
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
 
-
-
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print(" Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f" Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f" ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f" ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f" Still waiting... (status: {current_status})")
+                    else:
+                        print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f" ⚠️ Could not check refresh status: {e}")
+        print(f" Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print(" Step 1: Clearing values from memory...")
+    clearvalues_payload = {
+        "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
         "retryCount": 2,
         "objects": []
     }
 
-
-    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
-    headers = client._get_headers()
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
 
-    response
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print(" ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f" ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f" Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print(" ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print(" Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
+
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)
 
     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f" ✓ Refresh initiated")
 
         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
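
The new refresh path issues two POSTs against the Power BI refreshes endpoint: a clearValues request that purges the model from memory, then a full refresh that reframes it from the Delta tables (each call is polled for completion in the code above). A stripped-down sketch of just those two calls, with a placeholder dataset id and token:

    import requests

    dataset_id = "00000000-0000-0000-0000-000000000000"      # placeholder
    headers = {"Authorization": "Bearer <power-bi-token>"}    # placeholder token

    refreshes_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
    common = {"commitMode": "transactional", "maxParallelism": 10, "retryCount": 2, "objects": []}

    # Step 1: purge the DirectLake model from memory
    requests.post(refreshes_url, headers=headers, json={"type": "clearValues", **common})
    # Step 2: full refresh to reframe the data from the Delta tables
    requests.post(refreshes_url, headers=headers, json={"type": "full", **common})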
@@ -183,7 +290,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
 
                     raise Exception(f"Refresh timed out")
     else:
-
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
 
 
 def download_bim_from_github(url_or_path):
@@ -431,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
 
 
 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                          bim_url_or_path, wait_seconds=5):
+                          bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.
 
@@ -442,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -454,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
@@ -471,14 +590,14 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
 
     if dataset_exists:
-        print(f"
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")
 
         if wait_seconds > 0:
             print(f" Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)
 
-        print("\n[Step
-        refresh_dataset(dataset_name, workspace_id, client)
+        print("\n[Step 3/3] Refreshing existing semantic model...")
+        refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)
 
         print("\n" + "=" * 70)
         print("🎉 Refresh Completed!")
@@ -510,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
 
     # Step 6: Refresh using the dataset ID returned from creation
     print("\n[Step 6/6] Refreshing semantic model...")
-    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)
 
     print("\n" + "=" * 70)
     print("🎉 Deployment Completed!")
@@ -537,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         return 0
 
 
-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.
 
@@ -550,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -562,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")
 
+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -688,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         schema_name=schema,
         dataset_name=new_model_name,
         bim_url_or_path=temp_bim_path,
-        wait_seconds=wait_seconds
+        wait_seconds=wait_seconds,
+        refresh=refresh
     )
 
     # Clean up temp file

--- duckrun-0.2.16.dev2/duckrun/stats.py
+++ duckrun-0.2.18/duckrun/stats.py
@@ -60,16 +60,61 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
 
 
-def get_stats(duckrun_instance, source: str):
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+              AND table_schema != 'information_schema'
+              AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+
+        matched = {}
+
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+
+        return matched
+    except:
+        return {}
+
+
+def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.
 
     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
             - Schema only: 'schema' (all tables in schema, if multi-schema)
+            - Wildcard pattern: '*.summary' (matches tables across all schemas)
 
     Returns:
         Arrow table with statistics including total rows, file count, row groups,
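
`_match_tables_by_pattern` leans on fnmatch-style globbing; when the pattern contains a dot it is split once into a schema pattern and a table pattern. A tiny illustration of how '*.summary' is evaluated (standard-library behaviour, not duckrun code):

    import fnmatch

    schema_pattern, table_pattern = "*.summary".split(".", 1)
    print(fnmatch.fnmatch("aemo", schema_pattern))     # True - any schema matches '*'
    print(fnmatch.fnmatch("summary", table_pattern))   # True
    print(fnmatch.fnmatch("price", table_pattern))     # False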
@@ -78,6 +123,9 @@ def get_stats(duckrun_instance, source: str):
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")
 
+        # All tables in the connection's schema
+        stats = con.get_stats()
+
         # Single table in main schema (DuckDB uses 'main', not 'test')
         stats = con.get_stats('price_today')
 
@@ -86,6 +134,9 @@ def get_stats(duckrun_instance, source: str):
 
         # All tables in a schema (only if multi-schema enabled)
         stats = con.get_stats('aemo')
+
+        # Wildcard pattern across all schemas (only if multi-schema enabled)
+        stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
@@ -93,8 +144,31 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema # This is from the connection URL path
 
+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
     # Parse the source and validate existence
-    if '.' in source:
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)
 
@@ -105,46 +179,45 @@ def get_stats(duckrun_instance, source: str):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
 
-
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
            # Multi-schema mode: DuckDB has actual schemas
            # First check if it's a table in main schema
            if _table_exists(duckrun_instance, duckdb_schema, source):
-
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
            # Otherwise, check if it's a schema name
            elif _schema_exists(duckrun_instance, source):
                schema_name = source
                list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                if not list_tables:
                    raise ValueError(f"Schema '{source}' exists but contains no tables")
+                tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
            else:
                raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
        else:
            # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
            if _table_exists(duckrun_instance, duckdb_schema, source):
                # It's a table name
-
-                schema_name = url_schema # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
            elif source == url_schema:
                # Special case: user asked for stats on the URL schema name - list all tables
                list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema # Use URL schema for file path construction
                if not list_tables:
                    raise ValueError(f"No tables found in schema '{url_schema}'")
+                tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
            else:
                raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
 
     # Use the existing connection
     con = duckrun_instance.con
 
-    print(f"Processing {len(
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")
 
     successful_tables = []
-    for idx, tbl in enumerate(
-        print(f"[{idx+1}/{len(
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
 
@@ -171,8 +244,18 @@ def get_stats(duckrun_instance, source: str):
                 print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
                 xx = {}
 
-        # Check if VORDER exists
-
+        # Check if VORDER exists - handle both formats:
+        # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+        # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+        vorder = False
+        if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+            vorder = True
+        elif 'tags' in xx.keys() and xx['tags']:
+            # Check nested tags dictionary (tags is a list of dicts, one per file)
+            for tag_dict in xx['tags']:
+                if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                    vorder = True
+                    break
 
         # Calculate total size
         total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
@@ -187,6 +270,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -202,6 +286,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -237,6 +322,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -264,6 +350,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -291,7 +378,7 @@ def get_stats(duckrun_instance, source: str):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
+        return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
                                      'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
 
     # Union all successfully processed temp tables
@@ -301,6 +388,7 @@ def get_stats(duckrun_instance, source: str):
     # Generate final summary
     final_result = con.execute(f'''
         SELECT
+            schema,
             tbl,
             SUM(num_rows) as total_rows,
             COUNT(*) as num_files,
@@ -312,7 +400,7 @@ def get_stats(duckrun_instance, source: str):
             ANY_VALUE(timestamp) as timestamp
         FROM ({union_query})
         WHERE tbl IS NOT NULL
-        GROUP BY tbl
+        GROUP BY schema, tbl
         ORDER BY total_rows DESC
     ''').df()
 
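
Because every per-table temp table now carries a schema column and the final aggregation groups by schema and tbl, the returned DataFrame can distinguish same-named tables in different schemas. A quick check, assuming a multi-schema connection named con (placeholder):

    stats = con.get_stats("*.summary")          # wildcard across all schemas
    print(stats[["schema", "tbl", "total_rows", "num_files"]])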

--- duckrun-0.2.16.dev2/duckrun.egg-info/PKG-INFO
+++ duckrun-0.2.18/duckrun.egg-info/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.16.dev2
-Summary:
+Version: 0.2.18
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun

--- duckrun-0.2.16.dev2/pyproject.toml
+++ duckrun-0.2.18/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.16.dev2"
-description = "
+version = "0.2.18"
+description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}
 authors = [

The remaining files (LICENSE, README.md, duckrun/auth.py, duckrun/files.py, duckrun/lakehouse.py, duckrun/runner.py, duckrun/writer.py, duckrun.egg-info/dependency_links.txt, duckrun.egg-info/requires.txt, duckrun.egg-info/top_level.txt, setup.cfg) are unchanged between the two versions.