duckrun 0.2.15__tar.gz → 0.2.19.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/PKG-INFO +2 -2
  2. duckrun-0.2.19.dev8/duckrun/__init__.py +12 -0
  3. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/core.py +292 -47
  4. duckrun-0.2.19.dev8/duckrun/notebook.py +324 -0
  5. duckrun-0.2.19.dev8/duckrun/rle.py +940 -0
  6. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/runner.py +1 -39
  7. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/semantic_model.py +143 -17
  8. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/stats.py +206 -62
  9. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/writer.py +35 -6
  10. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/PKG-INFO +2 -2
  11. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/SOURCES.txt +5 -1
  12. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/pyproject.toml +2 -2
  13. duckrun-0.2.19.dev8/tests/test_register.py +275 -0
  14. duckrun-0.2.19.dev8/tests/test_rle.py +16 -0
  15. duckrun-0.2.15/duckrun/__init__.py +0 -10
  16. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/LICENSE +0 -0
  17. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/README.md +0 -0
  18. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/auth.py +0 -0
  19. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/files.py +0 -0
  20. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun/lakehouse.py +0 -0
  21. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/dependency_links.txt +0 -0
  22. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/requires.txt +0 -0
  23. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/duckrun.egg-info/top_level.txt +0 -0
  24. {duckrun-0.2.15 → duckrun-0.2.19.dev8}/setup.cfg +0 -0
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.15
- Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
+ Version: 0.2.19.dev8
+ Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
  Author: mim
  License: MIT
  Project-URL: Homepage, https://github.com/djouallah/duckrun
@@ -0,0 +1,12 @@
+ """Duckrun - Lakehouse task runner powered by DuckDB"""
+
+ from duckrun.core import Duckrun
+ from duckrun.notebook import import_notebook_from_web, import_notebook
+ from duckrun import rle
+
+ __version__ = "0.2.18"
+
+ # Expose unified connect method at module level
+ connect = Duckrun.connect
+
+ __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
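
For orientation, a minimal sketch of how the re-exported API above is meant to be used (the workspace and lakehouse names are placeholders; the calls mirror docstring examples later in this diff):

    import duckrun

    # duckrun.connect is simply Duckrun.connect exposed at module level
    con = duckrun.connect("workspace/lakehouse.lakehouse")
    con.sql("SELECT 1").show()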
@@ -12,7 +12,71 @@ from .runner import run as _run
  from .files import copy as _copy, download as _download
  from .writer import QueryResult
 
- class Duckrun:
+
+ class WorkspaceOperationsMixin:
+     """
+     Mixin class for workspace-level operations that work for both
+     full Duckrun connections and workspace-only connections.
+     """
+
+     def import_notebook_from_web(self, url: str,
+                                  notebook_name: Optional[str] = None,
+                                  overwrite: bool = False) -> dict:
+         """
+         Import a Jupyter notebook from a web URL into the workspace.
+
+         Args:
+             url: URL to the notebook file (e.g., GitHub raw URL). Required.
+             notebook_name: Name for the imported notebook. Optional - derived from URL if not provided.
+             overwrite: Whether to overwrite if notebook already exists (default: False)
+
+         Returns:
+             Dictionary with import result
+
+         Examples:
+             con = duckrun.connect("workspace/lakehouse.lakehouse")
+             result = con.import_notebook_from_web(
+                 url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+             )
+
+             ws = duckrun.connect("workspace")
+             result = ws.import_notebook_from_web(
+                 url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+             )
+         """
+         from .notebook import import_notebook_from_web as _import_notebook_from_web
+
+         # Get workspace name from either self.workspace or self.workspace_name
+         workspace_name = getattr(self, 'workspace', None) or getattr(self, 'workspace_name', None)
+
+         return _import_notebook_from_web(
+             url=url,
+             notebook_name=notebook_name,
+             overwrite=overwrite,
+             workspace_name=workspace_name
+         )
+
+     def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+         """Helper method to get workspace ID from name"""
+         try:
+             url = "https://api.fabric.microsoft.com/v1/workspaces"
+             headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+             response = requests.get(url, headers=headers)
+             response.raise_for_status()
+
+             workspaces = response.json().get("value", [])
+             for workspace in workspaces:
+                 if workspace.get("displayName") == workspace_name:
+                     return workspace.get("id")
+
+             return None
+
+         except Exception:
+             return None
+
+
+ class Duckrun(WorkspaceOperationsMixin):
      """
      OneLake task runner with clean tuple-based API.
      Supports lakehouses, warehouses, databases, and other OneLake items.
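
The point of the mixin is that notebook import now works on both connection flavours. A hedged sketch assembled from the docstring examples above (URL, workspace and lakehouse names are placeholders):

    import duckrun

    con = duckrun.connect("workspace/lakehouse.lakehouse")   # full lakehouse connection
    ws = duckrun.connect("workspace")                        # workspace-only connection

    # Both objects inherit WorkspaceOperationsMixin, so the same call works on either
    for target in (con, ws):
        target.import_notebook_from_web(
            url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
            overwrite=True,
        )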
@@ -971,33 +1035,59 @@ class Duckrun:
          """Get underlying DuckDB connection"""
          return self.con
 
-     def get_stats(self, source: str):
+     def register(self, name: str, df):
+         """
+         Register a pandas DataFrame as a virtual table in DuckDB.
+
+         Args:
+             name: Name for the virtual table
+             df: pandas DataFrame to register
+
+         Example:
+             con = duckrun.connect("workspace/lakehouse.lakehouse")
+             con.register("tb", df)
+             con.sql("SELECT * FROM tb").show()
+         """
+         self.con.register(name, df)
+
+     def get_stats(self, source: str = None, detailed = False):
          """
          Get comprehensive statistics for Delta Lake tables.
 
          Args:
-             source: Can be one of:
+             source: Optional. Can be one of:
+                 - None: Use all tables in the connection's schema (default)
                  - Table name: 'table_name' (uses current schema)
                  - Schema.table: 'schema.table_name' (specific table in schema)
                  - Schema only: 'schema' (all tables in schema)
+             detailed: Optional. Controls the level of detail in statistics:
+                 - False (default): Aggregated table-level stats
+                 - True: Row group level statistics with compression details
 
          Returns:
-             Arrow table with statistics including total rows, file count, row groups,
-             average row group size, file sizes, VORDER status, and timestamp
+             DataFrame with statistics based on detailed parameter:
+             - If detailed=False: Aggregated table-level summary
+             - If detailed=True: Granular file and row group level stats
 
          Examples:
              con = duckrun.connect("tmp/data.lakehouse/aemo")
 
-             # Single table in current schema
+             # All tables in current schema (aemo) - aggregated
+             stats = con.get_stats()
+
+             # Single table in current schema - aggregated
              stats = con.get_stats('price')
 
+             # Single table with detailed row group statistics
+             stats_detailed = con.get_stats('price', detailed=True)
+
              # Specific table in different schema
              stats = con.get_stats('aemo.price')
 
              # All tables in a schema
              stats = con.get_stats('aemo')
          """
-         return _get_stats(self, source)
+         return _get_stats(self, source, detailed)
 
      def list_lakehouses(self) -> List[str]:
          """
@@ -1111,7 +1201,7 @@ class Duckrun:
              return False
 
      def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-                wait_seconds: int = 5) -> int:
+                wait_seconds: int = 5, refresh: str = "full") -> int:
          """
          Deploy a semantic model from a BIM file using DirectLake mode.
 
@@ -1120,8 +1210,11 @@
                  - URL: "https://raw.githubusercontent.com/.../model.bim"
                  - Local file: "model.bim"
                  - Workspace/Model: "workspace_name/model_name"
-             dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
+             dataset_name: Name for the semantic model (default: schema name)
              wait_seconds: Seconds to wait for permission propagation (default: 5)
+             refresh: Refresh strategy:
+                 - "full": Clear values and process full refresh (default)
+                 - "ignore": Skip refresh entirely
 
          Returns:
              1 for success, 0 for failure
@@ -1129,14 +1222,17 @@
          Examples:
              dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
 
+             # Deploy with schema name as dataset name (dbo)
+             dr.deploy("https://github.com/.../model.bim")
+
              # Deploy from workspace/model (uses same name by default)
              dr.deploy("Source Workspace/Source Model") # Creates "Source Model"
 
              # Deploy with custom name
-             dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
+             dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
 
-             # Deploy from URL or local file
-             dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+             # Deploy without refresh
+             dr.deploy("https://github.com/.../model.bim", refresh="ignore")
 
          """
          from .semantic_model import deploy_semantic_model
@@ -1148,9 +1244,9 @@
                  if len(parts) == 2:
                      dataset_name = parts[1] # Use the model name
                  else:
-                     dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                     dataset_name = self.schema # Use schema name
              else:
-                 dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                 dataset_name = self.schema # Use schema name
 
          # Call the deployment function (DirectLake only)
          return deploy_semantic_model(
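
In practice the fallback above resolves the dataset name as follows (a sketch; the workspace, lakehouse and file names are placeholders):

    import duckrun

    dr = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    dr.deploy("model.bim")                              # dataset is named "dbo" (the schema)
    dr.deploy("Source Workspace/Source Model")          # dataset keeps the name "Source Model"
    dr.deploy("model.bim", dataset_name="Sales Model")  # an explicit name always wins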
@@ -1159,36 +1255,204 @@ class Duckrun:
              schema_name=self.schema,
              dataset_name=dataset_name,
              bim_url_or_path=bim_url,
-             wait_seconds=wait_seconds
+             wait_seconds=wait_seconds,
+             refresh=refresh
          )
 
-     def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-         """Helper method to get workspace ID from name"""
-         try:
-             url = "https://api.fabric.microsoft.com/v1/workspaces"
-             headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+     def rle(self, table_name: str = None, mode = "natural",
+             min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
+             max_ordering_depth: int = 3, limit: int = None):
+         """
+         Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
+
+         Args:
+             table_name: Name of the table to analyze. Can be:
+                 - 'table_name' (uses current schema)
+                 - 'schema.table_name' (specific schema)
+             mode: Analysis mode or column ordering:
+                 - "natural": Calculate RLE for natural order only (fastest)
+                 - "auto": Natural order + cardinality-based ordering (recommended)
+                 - "advanced": Natural + cardinality + greedy incremental search (most thorough)
+                 - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
+             min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
+             max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
+             max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
+             limit: Optional row limit for testing/development (default: None, analyzes all rows)
+
+         Returns:
+             DataFrame with RLE analysis results
+
+         Examples:
+             # Natural order only (baseline)
+             con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
+             con.rle("mytable") # same as con.rle("mytable", "natural")
 
-             response = requests.get(url, headers=headers)
-             response.raise_for_status()
+             # Auto optimization (natural + cardinality-based)
+             con.rle("mytable", "auto")
 
-             workspaces = response.json().get("value", [])
-             for workspace in workspaces:
-                 if workspace.get("displayName") == workspace_name:
-                     return workspace.get("id")
+             # Advanced optimization (greedy incremental search)
+             con.rle("mytable", "advanced")
+
+             # Test specific column ordering
+             con.rle("mytable", ["date", "duid"])
+             con.rle("mytable", ["cutoff", "time", "DUID", "date"])
+
+             # Advanced with custom depth
+             con.rle("mytable", "advanced", max_ordering_depth=4)
 
+             # Analyze table from different schema
+             con.rle("otherschema.mytable", "auto")
+
+             # Custom thresholds for small tables
+             con.rle("mytable", "auto", max_cardinality_pct=0.05)
+
+             # Limit rows for testing
+             con.rle("mytable", "auto", limit=10000)
+         """
+         from .rle import (
+             calculate_cardinality_ratio,
+             test_column_orderings_smart,
+             calculate_rle_for_columns
+         )
+         from deltalake import DeltaTable
+
+         # Parse table name and construct path
+         if table_name is None:
+             if mode != "summary":
+                 print("⚠️ Table name is required for 'smart' and 'full' modes")
+                 return None
+             # TODO: Implement all-tables summary
+             print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
              return None
+
+         # Parse schema.table or just table
+         if '.' in table_name:
+             schema_name, tbl = table_name.split('.', 1)
+         else:
+             schema_name = self.schema
+             tbl = table_name
+
+         # Construct the full table path using the same logic as get_stats
+         table_path = f"{self.table_base_url}{schema_name}/{tbl}"
+
+         # Verify table exists and is not empty
+         print(f"📊 Analyzing table: {schema_name}.{tbl}")
+
+         try:
+             dt = DeltaTable(table_path)
+             delta_files = dt.files()
 
-         except Exception:
+             if not delta_files:
+                 print("⚠️ Table is empty (no files)")
+                 return None
+
+         except Exception as e:
+             print(f"❌ Error accessing Delta table: {e}")
              return None
+
+         # Check if mode is a list of columns (custom ordering)
+         if isinstance(mode, list):
+             # User wants to test a specific column ordering
+             print(f"Testing custom column ordering: {', '.join(mode)}")
+
+             # Calculate cardinality for NDV values
+             card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
+
+             # Calculate RLE for the specified ordering
+             rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
+
+             total_rle_all = sum(rle_counts.values())
+
+             print(f"\nResults:")
+             print(f" Custom ordering: [{', '.join(mode)}]")
+             print(f" Total RLE (all columns): {total_rle_all:,} runs")
+
+             # Return as DataFrame for consistency
+             import pandas as pd
+             results = [{
+                 'schema': schema_name,
+                 'table': tbl,
+                 'sort_order': 'custom',
+                 'columns_used': ', '.join(mode),
+                 'total_rle_all': total_rle_all,
+                 **rle_counts
+             }]
+
+             df = pd.DataFrame(results)
+
+             # Transform to long format
+             long_format_results = []
+
+             for _, row in df.iterrows():
+                 schema_val = row['schema']
+                 table_val = row['table']
+                 sort_order = row['sort_order']
+                 columns_used = row['columns_used']
+                 total_rle_all_val = row['total_rle_all']
+
+                 # Get all column names except metadata columns
+                 metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
+                 data_columns = [col for col in df.columns if col not in metadata_cols]
+
+                 # Get total rows from card_stats if available
+                 total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
+
+                 # Parse the columns_used to get ordering
+                 sort_columns_list = [c.strip() for c in columns_used.split(',')]
+
+                 # Create one row per data column
+                 for col in data_columns:
+                     rle_value = row[col]
+
+                     # Get NDV from card_stats
+                     ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
+
+                     # Determine if column was included in the sort and its position
+                     is_in_sort = col in sort_columns_list
+                     order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
+                     comment = '' if is_in_sort else 'not included in the sort'
+
+                     long_format_results.append({
+                         'schema': schema_val,
+                         'table': table_val,
+                         'sort_type': sort_order,
+                         'column': col,
+                         'order': order_position,
+                         'RLE': rle_value,
+                         'NDV': ndv_value,
+                         'total_rows': total_rows,
+                         'total_RLE': total_rle_all_val,
+                         'comments': comment
+                     })
+
+             long_df = pd.DataFrame(long_format_results)
+
+             return long_df
+
+         # All modes now use test_column_orderings_smart with the mode parameter
+         return test_column_orderings_smart(
+             self.con,
+             table_path,
+             table_name=table_name, # Pass table name for cardinality calculation on full dataset
+             mode=mode,
+             limit=limit,
+             min_distinct_threshold=min_distinct_threshold,
+             max_cardinality_pct=max_cardinality_pct,
+             max_ordering_depth=max_ordering_depth,
+             schema_name=schema_name,
+             table_display_name=tbl,
+             duckrun_instance=self # Pass duckrun instance for detailed parquet stats
+         )
 
      def close(self):
          """Close DuckDB connection"""
+
          if self.con:
              self.con.close()
              print("Connection closed")
 
 
- class WorkspaceConnection:
+ class WorkspaceConnection(WorkspaceOperationsMixin):
      """
      Simple workspace connection for lakehouse management operations.
      """
@@ -1428,23 +1692,4 @@ class WorkspaceConnection:
              print(f"❌ Error downloading semantic model: {e}")
              import traceback
              traceback.print_exc()
-             return None
-
-     def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-         """Helper method to get workspace ID from name"""
-         try:
-             url = "https://api.fabric.microsoft.com/v1/workspaces"
-             headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-             response = requests.get(url, headers=headers)
-             response.raise_for_status()
-
-             workspaces = response.json().get("value", [])
-             for workspace in workspaces:
-                 if workspace.get("displayName") == workspace_name:
-                     return workspace.get("id")
-
-             return None
-
-         except Exception:
              return None