duckrun 0.2.15.tar.gz → 0.2.19.dev8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/PKG-INFO +2 -2
- duckrun-0.2.19.dev8/duckrun/__init__.py +12 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/core.py +292 -47
- duckrun-0.2.19.dev8/duckrun/notebook.py +324 -0
- duckrun-0.2.19.dev8/duckrun/rle.py +940 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/runner.py +1 -39
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/semantic_model.py +143 -17
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/stats.py +206 -62
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/writer.py +35 -6
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/PKG-INFO +2 -2
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/SOURCES.txt +5 -1
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/pyproject.toml +2 -2
- duckrun-0.2.19.dev8/tests/test_register.py +275 -0
- duckrun-0.2.19.dev8/tests/test_rle.py +16 -0
- duckrun-0.2.15/duckrun/__init__.py +0 -10
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/LICENSE +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/README.md +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/auth.py +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/files.py +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.15 → duckrun-0.2.19.dev8}/setup.cfg +0 -0
--- duckrun-0.2.15/PKG-INFO
+++ duckrun-0.2.19.dev8/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.15
-Summary:
+Version: 0.2.19.dev8
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
--- /dev/null
+++ duckrun-0.2.19.dev8/duckrun/__init__.py
@@ -0,0 +1,12 @@
+"""Duckrun - Lakehouse task runner powered by DuckDB"""
+
+from duckrun.core import Duckrun
+from duckrun.notebook import import_notebook_from_web, import_notebook
+from duckrun import rle
+
+__version__ = "0.2.18"
+
+# Expose unified connect method at module level
+connect = Duckrun.connect
+
+__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
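For illustration, a minimal usage sketch of the new module-level exports; the workspace, lakehouse, and schema names below are placeholders:

    import duckrun

    # connect is now exposed at module level as an alias for Duckrun.connect
    con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # the notebook helpers and the rle analysis module are re-exported as well
    from duckrun import import_notebook_from_web, import_notebook, rle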
--- duckrun-0.2.15/duckrun/core.py
+++ duckrun-0.2.19.dev8/duckrun/core.py
@@ -12,7 +12,71 @@ from .runner import run as _run
 from .files import copy as _copy, download as _download
 from .writer import QueryResult
 
-class Duckrun:
+
+class WorkspaceOperationsMixin:
+    """
+    Mixin class for workspace-level operations that work for both
+    full Duckrun connections and workspace-only connections.
+    """
+
+    def import_notebook_from_web(self, url: str,
+                                 notebook_name: Optional[str] = None,
+                                 overwrite: bool = False) -> dict:
+        """
+        Import a Jupyter notebook from a web URL into the workspace.
+
+        Args:
+            url: URL to the notebook file (e.g., GitHub raw URL). Required.
+            notebook_name: Name for the imported notebook. Optional - derived from URL if not provided.
+            overwrite: Whether to overwrite if notebook already exists (default: False)
+
+        Returns:
+            Dictionary with import result
+
+        Examples:
+            con = duckrun.connect("workspace/lakehouse.lakehouse")
+            result = con.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+
+            ws = duckrun.connect("workspace")
+            result = ws.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+        """
+        from .notebook import import_notebook_from_web as _import_notebook_from_web
+
+        # Get workspace name from either self.workspace or self.workspace_name
+        workspace_name = getattr(self, 'workspace', None) or getattr(self, 'workspace_name', None)
+
+        return _import_notebook_from_web(
+            url=url,
+            notebook_name=notebook_name,
+            overwrite=overwrite,
+            workspace_name=workspace_name
+        )
+
+    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+        """Helper method to get workspace ID from name"""
+        try:
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+
+        except Exception:
+            return None
+
+
+class Duckrun(WorkspaceOperationsMixin):
     """
     OneLake task runner with clean tuple-based API.
     Supports lakehouses, warehouses, databases, and other OneLake items.
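Because WorkspaceConnection also gains this mixin (see the change near the end of the core.py hunks), the same notebook-import call is available on both a full lakehouse connection and a workspace-only connection. A hedged sketch, reusing the placeholder values from the docstring above:

    import duckrun

    # full lakehouse connection: the workspace comes from the connection itself
    con = duckrun.connect("workspace/lakehouse.lakehouse")
    con.import_notebook_from_web(
        url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
    )

    # workspace-only connection: same method, provided by the mixin
    ws = duckrun.connect("workspace")
    ws.import_notebook_from_web(
        url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
        overwrite=True  # replace an existing notebook of the same name
    )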
@@ -971,33 +1035,59 @@ class Duckrun:
         """Get underlying DuckDB connection"""
         return self.con
 
-    def
+    def register(self, name: str, df):
+        """
+        Register a pandas DataFrame as a virtual table in DuckDB.
+
+        Args:
+            name: Name for the virtual table
+            df: pandas DataFrame to register
+
+        Example:
+            con = duckrun.connect("workspace/lakehouse.lakehouse")
+            con.register("tb", df)
+            con.sql("SELECT * FROM tb").show()
+        """
+        self.con.register(name, df)
+
+    def get_stats(self, source: str = None, detailed = False):
         """
         Get comprehensive statistics for Delta Lake tables.
 
         Args:
-            source: Can be one of:
+            source: Optional. Can be one of:
+                - None: Use all tables in the connection's schema (default)
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
+            detailed: Optional. Controls the level of detail in statistics:
+                - False (default): Aggregated table-level stats
+                - True: Row group level statistics with compression details
 
         Returns:
-
-
+            DataFrame with statistics based on detailed parameter:
+            - If detailed=False: Aggregated table-level summary
+            - If detailed=True: Granular file and row group level stats
 
         Examples:
            con = duckrun.connect("tmp/data.lakehouse/aemo")
 
-            #
+            # All tables in current schema (aemo) - aggregated
+            stats = con.get_stats()
+
+            # Single table in current schema - aggregated
            stats = con.get_stats('price')
 
+            # Single table with detailed row group statistics
+            stats_detailed = con.get_stats('price', detailed=True)
+
            # Specific table in different schema
            stats = con.get_stats('aemo.price')
 
            # All tables in a schema
            stats = con.get_stats('aemo')
        """
-        return _get_stats(self, source)
+        return _get_stats(self, source, detailed)
 
     def list_lakehouses(self) -> List[str]:
         """
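The two additions work together: a DataFrame registered with register() can be queried next to lakehouse tables, and get_stats() now accepts a detailed flag for row-group level output. A minimal sketch, with placeholder table and column names:

    import pandas as pd
    import duckrun

    con = duckrun.connect("tmp/data.lakehouse/aemo")

    # register an in-memory DataFrame as a virtual DuckDB table
    df = pd.DataFrame({"duid": ["A", "B"], "price": [42.0, 37.5]})
    con.register("tb", df)
    con.sql("SELECT * FROM tb").show()

    # aggregated, table-level statistics (default)
    stats = con.get_stats("price")

    # row-group level statistics with compression details
    stats_detailed = con.get_stats("price", detailed=True)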
@@ -1111,7 +1201,7 @@ class Duckrun:
             return False
 
     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-               wait_seconds: int = 5) -> int:
+               wait_seconds: int = 5, refresh: str = "full") -> int:
         """
         Deploy a semantic model from a BIM file using DirectLake mode.
 
@@ -1120,8 +1210,11 @@
                 - URL: "https://raw.githubusercontent.com/.../model.bim"
                 - Local file: "model.bim"
                 - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default:
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
+            refresh: Refresh strategy:
+                - "full": Clear values and process full refresh (default)
+                - "ignore": Skip refresh entirely
 
         Returns:
             1 for success, 0 for failure
@@ -1129,14 +1222,17 @@
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
 
+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
+
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model")  # Creates "Source Model"
 
             # Deploy with custom name
-            dr.deploy("
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
 
-            # Deploy
-            dr.deploy("https://
+            # Deploy without refresh
+            dr.deploy("https://github.com/.../model.bim", refresh="ignore")
         """
         from .semantic_model import deploy_semantic_model
 
@@ -1148,9 +1244,9 @@
             if len(parts) == 2:
                 dataset_name = parts[1]  # Use the model name
             else:
-                dataset_name =
+                dataset_name = self.schema  # Use schema name
         else:
-            dataset_name =
+            dataset_name = self.schema  # Use schema name
 
         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
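Taken together, the deploy changes mean that when no dataset_name is given (and the source is not a workspace/model pair) the connection's schema becomes the dataset name, and the new refresh argument is passed through to deploy_semantic_model. A hedged sketch with a placeholder BIM URL:

    dr = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # dataset name defaults to the schema ("dbo"); full refresh after deployment
    dr.deploy("https://raw.githubusercontent.com/user/repo/main/model.bim")

    # explicit name, and skip the refresh step entirely
    dr.deploy(
        "https://raw.githubusercontent.com/user/repo/main/model.bim",
        dataset_name="Sales Model",
        refresh="ignore",
    )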
@@ -1159,36 +1255,204 @@
             schema_name=self.schema,
             dataset_name=dataset_name,
             bim_url_or_path=bim_url,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )
 
-    def
-
-
-
-
+    def rle(self, table_name: str = None, mode = "natural",
+            min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
+            max_ordering_depth: int = 3, limit: int = None):
+        """
+        Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
+
+        Args:
+            table_name: Name of the table to analyze. Can be:
+                - 'table_name' (uses current schema)
+                - 'schema.table_name' (specific schema)
+            mode: Analysis mode or column ordering:
+                - "natural": Calculate RLE for natural order only (fastest)
+                - "auto": Natural order + cardinality-based ordering (recommended)
+                - "advanced": Natural + cardinality + greedy incremental search (most thorough)
+                - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
+            min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
+            max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
+            max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
+            limit: Optional row limit for testing/development (default: None, analyzes all rows)
+
+        Returns:
+            DataFrame with RLE analysis results
+
+        Examples:
+            # Natural order only (baseline)
+            con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
+            con.rle("mytable")  # same as con.rle("mytable", "natural")
 
-
-
+            # Auto optimization (natural + cardinality-based)
+            con.rle("mytable", "auto")
 
-
-
-
-
+            # Advanced optimization (greedy incremental search)
+            con.rle("mytable", "advanced")
+
+            # Test specific column ordering
+            con.rle("mytable", ["date", "duid"])
+            con.rle("mytable", ["cutoff", "time", "DUID", "date"])
+
+            # Advanced with custom depth
+            con.rle("mytable", "advanced", max_ordering_depth=4)
 
+            # Analyze table from different schema
+            con.rle("otherschema.mytable", "auto")
+
+            # Custom thresholds for small tables
+            con.rle("mytable", "auto", max_cardinality_pct=0.05)
+
+            # Limit rows for testing
+            con.rle("mytable", "auto", limit=10000)
+        """
+        from .rle import (
+            calculate_cardinality_ratio,
+            test_column_orderings_smart,
+            calculate_rle_for_columns
+        )
+        from deltalake import DeltaTable
+
+        # Parse table name and construct path
+        if table_name is None:
+            if mode != "summary":
+                print("⚠️ Table name is required for 'smart' and 'full' modes")
+                return None
+            # TODO: Implement all-tables summary
+            print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
             return None
+
+        # Parse schema.table or just table
+        if '.' in table_name:
+            schema_name, tbl = table_name.split('.', 1)
+        else:
+            schema_name = self.schema
+            tbl = table_name
+
+        # Construct the full table path using the same logic as get_stats
+        table_path = f"{self.table_base_url}{schema_name}/{tbl}"
+
+        # Verify table exists and is not empty
+        print(f"📊 Analyzing table: {schema_name}.{tbl}")
+
+        try:
+            dt = DeltaTable(table_path)
+            delta_files = dt.files()
 
-
+            if not delta_files:
+                print("⚠️ Table is empty (no files)")
+                return None
+
+        except Exception as e:
+            print(f"❌ Error accessing Delta table: {e}")
             return None
+
+        # Check if mode is a list of columns (custom ordering)
+        if isinstance(mode, list):
+            # User wants to test a specific column ordering
+            print(f"Testing custom column ordering: {', '.join(mode)}")
+
+            # Calculate cardinality for NDV values
+            card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
+
+            # Calculate RLE for the specified ordering
+            rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
+
+            total_rle_all = sum(rle_counts.values())
+
+            print(f"\nResults:")
+            print(f"  Custom ordering: [{', '.join(mode)}]")
+            print(f"  Total RLE (all columns): {total_rle_all:,} runs")
+
+            # Return as DataFrame for consistency
+            import pandas as pd
+            results = [{
+                'schema': schema_name,
+                'table': tbl,
+                'sort_order': 'custom',
+                'columns_used': ', '.join(mode),
+                'total_rle_all': total_rle_all,
+                **rle_counts
+            }]
+
+            df = pd.DataFrame(results)
+
+            # Transform to long format
+            long_format_results = []
+
+            for _, row in df.iterrows():
+                schema_val = row['schema']
+                table_val = row['table']
+                sort_order = row['sort_order']
+                columns_used = row['columns_used']
+                total_rle_all_val = row['total_rle_all']
+
+                # Get all column names except metadata columns
+                metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
+                data_columns = [col for col in df.columns if col not in metadata_cols]
+
+                # Get total rows from card_stats if available
+                total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
+
+                # Parse the columns_used to get ordering
+                sort_columns_list = [c.strip() for c in columns_used.split(',')]
+
+                # Create one row per data column
+                for col in data_columns:
+                    rle_value = row[col]
+
+                    # Get NDV from card_stats
+                    ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
+
+                    # Determine if column was included in the sort and its position
+                    is_in_sort = col in sort_columns_list
+                    order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
+                    comment = '' if is_in_sort else 'not included in the sort'
+
+                    long_format_results.append({
+                        'schema': schema_val,
+                        'table': table_val,
+                        'sort_type': sort_order,
+                        'column': col,
+                        'order': order_position,
+                        'RLE': rle_value,
+                        'NDV': ndv_value,
+                        'total_rows': total_rows,
+                        'total_RLE': total_rle_all_val,
+                        'comments': comment
+                    })
+
+            long_df = pd.DataFrame(long_format_results)
+
+            return long_df
+
+        # All modes now use test_column_orderings_smart with the mode parameter
+        return test_column_orderings_smart(
+            self.con,
+            table_path,
+            table_name=table_name,  # Pass table name for cardinality calculation on full dataset
+            mode=mode,
+            limit=limit,
+            min_distinct_threshold=min_distinct_threshold,
+            max_cardinality_pct=max_cardinality_pct,
+            max_ordering_depth=max_ordering_depth,
+            schema_name=schema_name,
+            table_display_name=tbl,
+            duckrun_instance=self  # Pass duckrun instance for detailed parquet stats
+        )
 
     def close(self):
         """Close DuckDB connection"""
+
         if self.con:
             self.con.close()
         print("Connection closed")
 
 
-class WorkspaceConnection:
+class WorkspaceConnection(WorkspaceOperationsMixin):
     """
     Simple workspace connection for lakehouse management operations.
     """
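Only the custom-ordering branch shown above returns the long-format DataFrame directly (one row per column with column, order, RLE, NDV, total_rows, total_RLE and comments fields); the string modes delegate to test_column_orderings_smart. A minimal sketch with placeholder table and column names:

    con = duckrun.connect("workspace/lakehouse.lakehouse/schema")

    # baseline: natural order only
    natural = con.rle("mytable")

    # test an explicit ordering and inspect the per-column run counts
    custom = con.rle("mytable", ["date", "duid"])
    print(custom[["column", "order", "RLE", "NDV", "total_RLE"]])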
@@ -1428,23 +1692,4 @@ class WorkspaceConnection:
             print(f"❌ Error downloading semantic model: {e}")
             import traceback
             traceback.print_exc()
-            return None
-
-    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-        """Helper method to get workspace ID from name"""
-        try:
-            url = "https://api.fabric.microsoft.com/v1/workspaces"
-            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-
-            workspaces = response.json().get("value", [])
-            for workspace in workspaces:
-                if workspace.get("displayName") == workspace_name:
-                    return workspace.get("id")
-
-            return None
-
-        except Exception:
             return None