duckrun 0.2.18.dev1-py3-none-any.whl → 0.2.19.dev0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckrun might be problematic.

duckrun/__init__.py CHANGED
@@ -3,7 +3,7 @@
3
3
  from duckrun.core import Duckrun
4
4
  from duckrun.notebook import import_notebook_from_web, import_notebook
5
5
 
6
- __version__ = "0.2.18.dev1"
6
+ __version__ = "0.2.18"
7
7
 
8
8
  # Expose unified connect method at module level
9
9
  connect = Duckrun.connect
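
The hunk above keeps connect exposed at module level, so everything else in this diff is reached through duckrun.connect(). A minimal usage sketch (the workspace/lakehouse/schema path is a placeholder):

import duckrun

# Connect to a lakehouse schema; the path below is illustrative only
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

# The returned Duckrun instance is what the rest of this diff extends:
# get_stats() gains a detailed flag, deploy() gains a refresh strategy.
print(type(con))
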
duckrun/core.py CHANGED
@@ -1035,33 +1035,44 @@ class Duckrun(WorkspaceOperationsMixin):
1035
1035
  """Get underlying DuckDB connection"""
1036
1036
  return self.con
1037
1037
 
1038
- def get_stats(self, source: str):
1038
+ def get_stats(self, source: str = None, detailed = False):
1039
1039
  """
1040
1040
  Get comprehensive statistics for Delta Lake tables.
1041
1041
 
1042
1042
  Args:
1043
- source: Can be one of:
1043
+ source: Optional. Can be one of:
1044
+ - None: Use all tables in the connection's schema (default)
1044
1045
  - Table name: 'table_name' (uses current schema)
1045
1046
  - Schema.table: 'schema.table_name' (specific table in schema)
1046
1047
  - Schema only: 'schema' (all tables in schema)
1048
+ detailed: Optional. Controls the level of detail in statistics:
1049
+ - False (default): Aggregated table-level stats
1050
+ - True: Row group level statistics with compression details
1047
1051
 
1048
1052
  Returns:
1049
- Arrow table with statistics including total rows, file count, row groups,
1050
- average row group size, file sizes, VORDER status, and timestamp
1053
+ DataFrame with statistics based on detailed parameter:
1054
+ - If detailed=False: Aggregated table-level summary
1055
+ - If detailed=True: Granular file and row group level stats
1051
1056
 
1052
1057
  Examples:
1053
1058
  con = duckrun.connect("tmp/data.lakehouse/aemo")
1054
1059
 
1055
- # Single table in current schema
1060
+ # All tables in current schema (aemo) - aggregated
1061
+ stats = con.get_stats()
1062
+
1063
+ # Single table in current schema - aggregated
1056
1064
  stats = con.get_stats('price')
1057
1065
 
1066
+ # Single table with detailed row group statistics
1067
+ stats_detailed = con.get_stats('price', detailed=True)
1068
+
1058
1069
  # Specific table in different schema
1059
1070
  stats = con.get_stats('aemo.price')
1060
1071
 
1061
1072
  # All tables in a schema
1062
1073
  stats = con.get_stats('aemo')
1063
1074
  """
1064
- return _get_stats(self, source)
1075
+ return _get_stats(self, source, detailed)
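
Since get_stats now delegates to _get_stats(self, source, detailed) and the stats module (below) returns a pandas DataFrame via .df(), the aggregated output can be filtered directly. A small sketch, assuming the aggregated column names defined in duckrun/stats.py (schema, tbl, total_rows, average_row_group, vorder); the 1M-row threshold is an arbitrary example:

import duckrun

con = duckrun.connect("tmp/data.lakehouse/aemo")
stats = con.get_stats()  # all tables in the connection's schema, aggregated

# Flag tables that are not V-Ordered or have small average row groups
needs_attention = stats[(~stats["vorder"]) | (stats["average_row_group"] < 1_000_000)]
print(needs_attention[["schema", "tbl", "total_rows", "average_row_group", "vorder"]])
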
1065
1076
 
1066
1077
  def list_lakehouses(self) -> List[str]:
1067
1078
  """
@@ -1175,7 +1186,7 @@ class Duckrun(WorkspaceOperationsMixin):
1175
1186
  return False
1176
1187
 
1177
1188
  def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
1178
- wait_seconds: int = 5) -> int:
1189
+ wait_seconds: int = 5, refresh: str = "full") -> int:
1179
1190
  """
1180
1191
  Deploy a semantic model from a BIM file using DirectLake mode.
1181
1192
 
@@ -1184,8 +1195,11 @@ class Duckrun(WorkspaceOperationsMixin):
1184
1195
  - URL: "https://raw.githubusercontent.com/.../model.bim"
1185
1196
  - Local file: "model.bim"
1186
1197
  - Workspace/Model: "workspace_name/model_name"
1187
- dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
1198
+ dataset_name: Name for the semantic model (default: schema name)
1188
1199
  wait_seconds: Seconds to wait for permission propagation (default: 5)
1200
+ refresh: Refresh strategy:
1201
+ - "full": Clear values and process full refresh (default)
1202
+ - "ignore": Skip refresh entirely
1189
1203
 
1190
1204
  Returns:
1191
1205
  1 for success, 0 for failure
@@ -1193,14 +1207,17 @@ class Duckrun(WorkspaceOperationsMixin):
1193
1207
  Examples:
1194
1208
  dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
1195
1209
 
1210
+ # Deploy with schema name as dataset name (dbo)
1211
+ dr.deploy("https://github.com/.../model.bim")
1212
+
1196
1213
  # Deploy from workspace/model (uses same name by default)
1197
1214
  dr.deploy("Source Workspace/Source Model") # Creates "Source Model"
1198
1215
 
1199
1216
  # Deploy with custom name
1200
- dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
1217
+ dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
1201
1218
 
1202
- # Deploy from URL or local file
1203
- dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
1219
+ # Deploy without refresh
1220
+ dr.deploy("https://github.com/.../model.bim", refresh="ignore")
1204
1221
  """
1205
1222
  from .semantic_model import deploy_semantic_model
1206
1223
 
@@ -1212,9 +1229,9 @@ class Duckrun(WorkspaceOperationsMixin):
1212
1229
  if len(parts) == 2:
1213
1230
  dataset_name = parts[1] # Use the model name
1214
1231
  else:
1215
- dataset_name = f"{self.lakehouse_name}_{self.schema}"
1232
+ dataset_name = self.schema # Use schema name
1216
1233
  else:
1217
- dataset_name = f"{self.lakehouse_name}_{self.schema}"
1234
+ dataset_name = self.schema # Use schema name
1218
1235
 
1219
1236
  # Call the deployment function (DirectLake only)
1220
1237
  return deploy_semantic_model(
@@ -1223,7 +1240,8 @@ class Duckrun(WorkspaceOperationsMixin):
1223
1240
  schema_name=self.schema,
1224
1241
  dataset_name=dataset_name,
1225
1242
  bim_url_or_path=bim_url,
1226
- wait_seconds=wait_seconds
1243
+ wait_seconds=wait_seconds,
1244
+ refresh=refresh
1227
1245
  )
1228
1246
 
1229
1247
  def close(self):
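
Taken together, the deploy changes above default the dataset name to the connection's schema and thread a refresh strategy through to deploy_semantic_model. A hedged usage sketch (URLs and names are placeholders):

import duckrun

dr = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

# Default dataset name is now the schema ("dbo"), not "<lakehouse>_<schema>"
dr.deploy("https://raw.githubusercontent.com/org/repo/main/model.bim")

# Custom name, and skip the clearValues + full refresh cycle entirely
dr.deploy("https://raw.githubusercontent.com/org/repo/main/model.bim",
          dataset_name="Sales Model", refresh="ignore")
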
duckrun/notebook.py CHANGED
@@ -160,6 +160,7 @@ def import_notebook_from_web(
160
160
  update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
161
161
  payload = {
162
162
  "definition": {
163
+ "format": "ipynb",
163
164
  "parts": [
164
165
  {
165
166
  "path": "notebook-content.py",
@@ -192,6 +193,7 @@ def import_notebook_from_web(
192
193
  payload = {
193
194
  "displayName": notebook_name,
194
195
  "definition": {
196
+ "format": "ipynb",
195
197
  "parts": [
196
198
  {
197
199
  "path": "notebook-content.py",
duckrun/semantic_model.py CHANGED
@@ -129,14 +129,72 @@ def check_dataset_exists(dataset_name, workspace_id, client):
129
129
  return False
130
130
 
131
131
 
132
- def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
133
- """Refresh a dataset and monitor progress using Power BI API"""
132
+ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
133
+ """Refresh a dataset and monitor progress using Power BI API
134
+
135
+ For DirectLake models, performs refresh based on refresh parameter:
136
+ - refresh="full": Two-step refresh (clearValues + full reframe)
137
+ - refresh="ignore": Skip refresh entirely
138
+
139
+ If a refresh is already in progress, waits for it to complete before starting a new one.
140
+ """
141
+
142
+ # Skip refresh entirely if refresh is "ignore"
143
+ if refresh == "ignore":
144
+ print(" Ignoring refresh - skipping refresh")
145
+ return
134
146
 
135
147
  # If dataset_id not provided, look it up by name
136
148
  if not dataset_id:
137
149
  dataset_id = get_dataset_id(dataset_name, workspace_id, client)
138
150
 
139
- payload = {
151
+ # Use Power BI API for refresh (not Fabric API)
152
+ powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
153
+ headers = client._get_headers()
154
+
155
+ # Check for in-progress refreshes
156
+ print(" Checking for in-progress refreshes...")
157
+ try:
158
+ status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
159
+ if status_response.status_code == 200:
160
+ refreshes = status_response.json().get('value', [])
161
+ if refreshes:
162
+ latest_refresh = refreshes[0]
163
+ status = latest_refresh.get('status')
164
+ if status in ['InProgress', 'Unknown']:
165
+ refresh_id = latest_refresh.get('requestId')
166
+ print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
167
+ print(f" Waiting for current refresh to complete...")
168
+
169
+ # Wait for the in-progress refresh to complete
170
+ max_wait_attempts = 60
171
+ for attempt in range(max_wait_attempts):
172
+ time.sleep(5)
173
+ check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
174
+ if check_response.status_code == 200:
175
+ current_status = check_response.json().get('status')
176
+
177
+ if current_status == 'Completed':
178
+ print(f" ✓ Previous refresh completed")
179
+ break
180
+ elif current_status == 'Failed':
181
+ print(f" ⚠️ Previous refresh failed, continuing with new refresh")
182
+ break
183
+ elif current_status == 'Cancelled':
184
+ print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
185
+ break
186
+
187
+ if attempt % 6 == 0:
188
+ print(f" Still waiting... (status: {current_status})")
189
+ else:
190
+ print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
191
+ except Exception as e:
192
+ print(f" ⚠️ Could not check refresh status: {e}")
193
+ print(f" Continuing with refresh attempt...")
194
+
195
+ # Step 1: clearValues - Purge data from memory
196
+ print(" Step 1: Clearing values from memory...")
197
+ clearvalues_payload = {
140
198
  "type": "clearValues",
141
199
  "commitMode": "transactional",
142
200
  "maxParallelism": 10,
@@ -144,14 +202,63 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
144
202
  "objects": []
145
203
  }
146
204
 
147
- # Use Power BI API for refresh (not Fabric API)
148
- powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
149
- headers = client._get_headers()
205
+ response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
150
206
 
151
- response = requests.post(powerbi_url, headers=headers, json=payload)
207
+ if response.status_code in [200, 202]:
208
+ # For 202, monitor the clearValues operation
209
+ if response.status_code == 202:
210
+ location = response.headers.get('Location')
211
+ if location:
212
+ clear_refresh_id = location.split('/')[-1]
213
+ print(" ✓ Clear values initiated, monitoring progress...")
214
+
215
+ max_attempts = 60
216
+ for attempt in range(max_attempts):
217
+ time.sleep(2)
218
+
219
+ status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
220
+ status_response = requests.get(status_url, headers=headers)
221
+ status_response.raise_for_status()
222
+ status = status_response.json().get('status')
223
+
224
+ if status == 'Completed':
225
+ print(f" ✓ Clear values completed")
226
+ break
227
+ elif status == 'Failed':
228
+ error = status_response.json().get('serviceExceptionJson', '')
229
+ raise Exception(f"Clear values failed: {error}")
230
+ elif status == 'Cancelled':
231
+ raise Exception("Clear values was cancelled")
232
+
233
+ if attempt % 10 == 0 and attempt > 0:
234
+ print(f" Clear values status: {status}...")
235
+ else:
236
+ raise Exception(f"Clear values timed out")
237
+ else:
238
+ print(" ✓ Clear values completed")
239
+ else:
240
+ # Provide detailed error message
241
+ try:
242
+ error_details = response.json()
243
+ error_message = error_details.get('error', {}).get('message', response.text)
244
+ raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
245
+ except (json.JSONDecodeError, ValueError):
246
+ response.raise_for_status()
247
+
248
+ # Step 2: full refresh - Reframe data from Delta tables
249
+ print(" Step 2: Full refresh to reframe data...")
250
+ full_payload = {
251
+ "type": "full",
252
+ "commitMode": "transactional",
253
+ "maxParallelism": 10,
254
+ "retryCount": 2,
255
+ "objects": []
256
+ }
257
+
258
+ response = requests.post(powerbi_url, headers=headers, json=full_payload)
152
259
 
153
260
  if response.status_code in [200, 202]:
154
- print(f"✓ Refresh initiated")
261
+ print(f" ✓ Refresh initiated")
155
262
 
156
263
  # For 202, get the refresh_id from the Location header
157
264
  if response.status_code == 202:
@@ -183,7 +290,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
183
290
 
184
291
  raise Exception(f"Refresh timed out")
185
292
  else:
186
- response.raise_for_status()
293
+ # Provide detailed error message
294
+ try:
295
+ error_details = response.json()
296
+ error_message = error_details.get('error', {}).get('message', response.text)
297
+ raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
298
+ except (json.JSONDecodeError, ValueError):
299
+ response.raise_for_status()
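
The same "detailed error message" pattern now appears after both the clearValues request and the full refresh request. A small helper sketch of that pattern (hypothetical, not part of the package):

import json
import requests

def _describe_refresh_error(response: requests.Response) -> str:
    """Pull the Power BI error message out of a failed refresh response, if present."""
    try:
        details = response.json()
        return details.get("error", {}).get("message", response.text)
    except (json.JSONDecodeError, ValueError):
        return response.text

# Usage sketch:
# if response.status_code not in (200, 202):
#     raise Exception(f"Refresh failed with status {response.status_code}: "
#                     f"{_describe_refresh_error(response)}")
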
187
300
 
188
301
 
189
302
  def download_bim_from_github(url_or_path):
@@ -431,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
431
544
 
432
545
 
433
546
  def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
434
- bim_url_or_path, wait_seconds=5):
547
+ bim_url_or_path, wait_seconds=5, refresh="full"):
435
548
  """
436
549
  Deploy a semantic model using DirectLake mode.
437
550
 
@@ -442,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
442
555
  dataset_name: Name for the semantic model
443
556
  bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
444
557
  wait_seconds: Seconds to wait before refresh (default: 5)
558
+ refresh: Refresh strategy (default: "full")
559
+ - "full": Clear values and process full refresh
560
+ - "ignore": Skip refresh entirely
445
561
 
446
562
  Returns:
447
563
  1 for success, 0 for failure
@@ -454,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
454
570
  # Using a local file
455
571
  dr.deploy("./my_model.bim")
456
572
  dr.deploy("C:/path/to/model.bim")
573
+
574
+ # Deploy without refresh
575
+ dr.deploy("./my_model.bim", refresh="ignore")
457
576
  """
458
577
  print("=" * 70)
459
578
  print("Semantic Model Deployment (DirectLake)")
@@ -471,14 +590,14 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
471
590
  dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
472
591
 
473
592
  if dataset_exists:
474
- print(f"\n✓ Dataset exists - refreshing...")
593
+ print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")
475
594
 
476
595
  if wait_seconds > 0:
477
596
  print(f" Waiting {wait_seconds} seconds...")
478
597
  time.sleep(wait_seconds)
479
598
 
480
- print("\n[Step 6/6] Refreshing semantic model...")
481
- refresh_dataset(dataset_name, workspace_id, client)
599
+ print("\n[Step 3/3] Refreshing existing semantic model...")
600
+ refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)
482
601
 
483
602
  print("\n" + "=" * 70)
484
603
  print("🎉 Refresh Completed!")
@@ -510,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
510
629
 
511
630
  # Step 6: Refresh using the dataset ID returned from creation
512
631
  print("\n[Step 6/6] Refreshing semantic model...")
513
- refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
632
+ refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)
514
633
 
515
634
  print("\n" + "=" * 70)
516
635
  print("🎉 Deployment Completed!")
@@ -537,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
537
656
  return 0
538
657
 
539
658
 
540
- def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
659
+ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
541
660
  """
542
661
  Copy a semantic model from one workspace to another.
543
662
 
@@ -550,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
550
669
  destination: Destination in format "workspace/lakehouse.lakehouse/schema"
551
670
  new_model_name: Name for the new semantic model (default: same as source)
552
671
  wait_seconds: Seconds to wait before refresh (default: 5)
672
+ refresh: Refresh strategy (default: "full")
673
+ - "full": Clear values and process full refresh
674
+ - "ignore": Skip refresh entirely
553
675
 
554
676
  Returns:
555
677
  1 for success, 0 for failure
@@ -562,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
562
684
  copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
563
685
  new_model_name="Production Model - Copy")
564
686
 
687
+ # Copy without refresh
688
+ copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
689
+
565
690
  # Using the connect pattern
566
691
  import duckrun
567
692
  duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
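
copy_model forwards the new refresh argument straight through to deploy_semantic_model. A usage sketch combining a rename with a skipped refresh (workspace and lakehouse names are placeholders):

from duckrun.semantic_model import copy_model

copy_model(
    "Source WS", "Production Model",
    "Target WS/Data Lake.lakehouse/analytics",
    new_model_name="Production Model - Copy",
    refresh="ignore",   # copy the definition now, reframe the data later
)
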
@@ -688,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
688
813
  schema_name=schema,
689
814
  dataset_name=new_model_name,
690
815
  bim_url_or_path=temp_bim_path,
691
- wait_seconds=wait_seconds
816
+ wait_seconds=wait_seconds,
817
+ refresh=refresh
692
818
  )
693
819
 
694
820
  # Clean up temp file
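
For reference, the monitoring loops added in this file all follow the same Power BI enhanced-refresh polling shape. A standalone sketch of that pattern, assuming headers is a dict carrying a valid bearer token (hypothetical helper, not part of the package):

import time
import requests

def wait_for_refresh(dataset_id: str, refresh_id: str, headers: dict,
                     interval: int = 5, max_attempts: int = 60) -> str:
    """Poll a Power BI refresh until it reaches a terminal state, mirroring the loops above."""
    url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{refresh_id}"
    status = "Unknown"
    for attempt in range(max_attempts):
        time.sleep(interval)
        status = requests.get(url, headers=headers).json().get("status")
        if status in ("Completed", "Failed", "Cancelled"):
            return status
        if attempt % 10 == 0:
            print(f"  Still waiting... (status: {status})")
    return status
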
duckrun/stats.py CHANGED
@@ -60,32 +60,89 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
60
60
  return []
61
61
 
62
62
 
63
- def get_stats(duckrun_instance, source: str):
63
+ def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
64
+ """Match tables across all schemas using a wildcard pattern.
65
+ Pattern can be:
66
+ - '*.summary' - matches 'summary' table in all schemas
67
+ - '*summary' - matches any table ending with 'summary'
68
+ - 'schema.*' - matches all tables in 'schema'
69
+ Returns a dict mapping schema names to lists of matching table names."""
70
+ import fnmatch
71
+
72
+ try:
73
+ # Query all schemas and tables in one go
74
+ query = """
75
+ SELECT table_schema, table_name
76
+ FROM information_schema.tables
77
+ WHERE table_schema NOT LIKE 'pg_%'
78
+ AND table_schema != 'information_schema'
79
+ AND table_name NOT LIKE 'tbl_%'
80
+ """
81
+ result = duckrun_instance.con.execute(query).fetchall()
82
+
83
+ matched = {}
84
+
85
+ # Check if pattern contains a dot (schema.table pattern)
86
+ if '.' in pattern:
87
+ schema_pattern, table_pattern = pattern.split('.', 1)
88
+ for schema, table in result:
89
+ if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
90
+ if schema not in matched:
91
+ matched[schema] = []
92
+ matched[schema].append(table)
93
+ else:
94
+ # Pattern matches only table names
95
+ for schema, table in result:
96
+ if fnmatch.fnmatch(table, pattern):
97
+ if schema not in matched:
98
+ matched[schema] = []
99
+ matched[schema].append(table)
100
+
101
+ return matched
102
+ except:
103
+ return {}
104
+
105
+
106
+ def get_stats(duckrun_instance, source: str = None, detailed = False):
64
107
  """
65
108
  Get comprehensive statistics for Delta Lake tables.
66
109
 
67
110
  Args:
68
111
  duckrun_instance: The Duckrun connection instance
69
- source: Can be one of:
112
+ source: Optional. Can be one of:
113
+ - None: Use all tables in the connection's schema (default)
70
114
  - Table name: 'table_name' (uses main schema in DuckDB)
71
115
  - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
72
116
  - Schema only: 'schema' (all tables in schema, if multi-schema)
117
+ - Wildcard pattern: '*.summary' (matches tables across all schemas)
118
+ detailed: Optional. Controls the level of detail in statistics:
119
+ - False (default): Aggregated table-level stats (total rows, file count,
120
+ row groups, average row group size, file sizes, VORDER status)
121
+ - True: Row group level statistics with compression details, row group sizes,
122
+ and parquet metadata
73
123
 
74
124
  Returns:
75
- Arrow table with statistics including total rows, file count, row groups,
76
- average row group size, file sizes, VORDER status, and timestamp
125
+ DataFrame with statistics based on detailed parameter:
126
+ - If detailed=False: Aggregated table-level summary
127
+ - If detailed=True: Granular file and row group level stats
77
128
 
78
129
  Examples:
79
130
  con = duckrun.connect("tmp/data.lakehouse/test")
80
131
 
81
- # Single table in main schema (DuckDB uses 'main', not 'test')
82
- stats = con.get_stats('price_today')
132
+ # All tables in the connection's schema (aggregated)
133
+ stats = con.get_stats()
134
+
135
+ # Single table with detailed row group statistics
136
+ stats_detailed = con.get_stats('price_today', detailed=True)
83
137
 
84
138
  # Specific table in different schema (only if multi-schema enabled)
85
139
  stats = con.get_stats('aemo.price')
86
140
 
87
141
  # All tables in a schema (only if multi-schema enabled)
88
142
  stats = con.get_stats('aemo')
143
+
144
+ # Wildcard pattern across all schemas (only if multi-schema enabled)
145
+ stats = con.get_stats('*.summary')
89
146
  """
90
147
  timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
91
148
 
@@ -93,8 +150,31 @@ def get_stats(duckrun_instance, source: str):
93
150
  duckdb_schema = "main"
94
151
  url_schema = duckrun_instance.schema # This is from the connection URL path
95
152
 
153
+ # If source is not provided, default to all tables in the connection's schema
154
+ if source is None:
155
+ source = url_schema
156
+
157
+ # Check if source contains wildcard characters
158
+ if '*' in source or '?' in source:
159
+ # Wildcard pattern mode - only valid if multi-schema is enabled
160
+ if not duckrun_instance.scan_all_schemas:
161
+ raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
162
+
163
+ matched_tables = _match_tables_by_pattern(duckrun_instance, source)
164
+
165
+ if not matched_tables:
166
+ raise ValueError(f"No tables found matching pattern '{source}'")
167
+
168
+ # Flatten the matched tables into a list with schema info
169
+ tables_with_schemas = []
170
+ for schema, tables in matched_tables.items():
171
+ for table in tables:
172
+ tables_with_schemas.append((schema, table))
173
+
174
+ print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
175
+
96
176
  # Parse the source and validate existence
97
- if '.' in source:
177
+ elif '.' in source:
98
178
  # Format: schema.table - only valid if multi-schema is enabled
99
179
  schema_name, table_name = source.split('.', 1)
100
180
 
@@ -105,46 +185,45 @@ def get_stats(duckrun_instance, source: str):
105
185
  if not _table_exists(duckrun_instance, schema_name, table_name):
106
186
  raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
107
187
 
108
- list_tables = [table_name]
188
+ tables_with_schemas = [(schema_name, table_name)]
109
189
  else:
110
190
  # Could be just table name or schema name
111
191
  if duckrun_instance.scan_all_schemas:
112
192
  # Multi-schema mode: DuckDB has actual schemas
113
193
  # First check if it's a table in main schema
114
194
  if _table_exists(duckrun_instance, duckdb_schema, source):
115
- list_tables = [source]
116
- schema_name = duckdb_schema
195
+ tables_with_schemas = [(duckdb_schema, source)]
117
196
  # Otherwise, check if it's a schema name
118
197
  elif _schema_exists(duckrun_instance, source):
119
198
  schema_name = source
120
199
  list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
121
200
  if not list_tables:
122
201
  raise ValueError(f"Schema '{source}' exists but contains no tables")
202
+ tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
123
203
  else:
124
204
  raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
125
205
  else:
126
206
  # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
127
207
  if _table_exists(duckrun_instance, duckdb_schema, source):
128
208
  # It's a table name
129
- list_tables = [source]
130
- schema_name = url_schema # Use URL schema for file path construction
209
+ tables_with_schemas = [(url_schema, source)]
131
210
  elif source == url_schema:
132
211
  # Special case: user asked for stats on the URL schema name - list all tables
133
212
  list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
134
- schema_name = url_schema # Use URL schema for file path construction
135
213
  if not list_tables:
136
214
  raise ValueError(f"No tables found in schema '{url_schema}'")
215
+ tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
137
216
  else:
138
217
  raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
139
218
 
140
219
  # Use the existing connection
141
220
  con = duckrun_instance.con
142
221
 
143
- print(f"Processing {len(list_tables)} tables: {list_tables}")
222
+ print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")
144
223
 
145
224
  successful_tables = []
146
- for idx, tbl in enumerate(list_tables):
147
- print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
225
+ for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
226
+ print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
148
227
  # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
149
228
  table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
150
229
 
@@ -171,8 +250,18 @@ def get_stats(duckrun_instance, source: str):
171
250
  print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
172
251
  xx = {}
173
252
 
174
- # Check if VORDER exists
175
- vorder = 'tags.VORDER' in xx.keys()
253
+ # Check if VORDER exists - handle both formats:
254
+ # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
255
+ # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
256
+ vorder = False
257
+ if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
258
+ vorder = True
259
+ elif 'tags' in xx.keys() and xx['tags']:
260
+ # Check nested tags dictionary (tags is a list of dicts, one per file)
261
+ for tag_dict in xx['tags']:
262
+ if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
263
+ vorder = True
264
+ break
176
265
 
177
266
  # Calculate total size
178
267
  total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
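
The reworked VORDER check above covers two shapes the get_add_actions metadata can take. Purely illustrative dictionaries (values are hypothetical) showing what each branch looks for:

# Flattened form: the tag arrives as a 'tags.VORDER' / 'tags.vorder' key
xx_flat = {"size_bytes": [1048576], "tags.VORDER": ["true"]}

# Nested form: 'tags' is a list with one dict (or None) per file
xx_nested = {"size_bytes": [1048576], "tags": [{"VORDER": "true"}, None]}

def has_vorder(xx: dict) -> bool:
    if "tags.VORDER" in xx or "tags.vorder" in xx:
        return True
    return any(tag and ("VORDER" in tag or "vorder" in tag) for tag in xx.get("tags") or [])

assert has_vorder(xx_flat) and has_vorder(xx_nested)
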
@@ -187,6 +276,7 @@ def get_stats(duckrun_instance, source: str):
187
276
  con.execute(f'''
188
277
  CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
189
278
  SELECT
279
+ '{schema_name}' as schema,
190
280
  '{tbl}' as tbl,
191
281
  'empty' as file_name,
192
282
  0 as num_rows,
@@ -199,21 +289,45 @@ def get_stats(duckrun_instance, source: str):
199
289
  ''')
200
290
  else:
201
291
  # Get parquet metadata and create temp table with compression info
202
- con.execute(f'''
203
- CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
204
- SELECT
205
- '{tbl}' as tbl,
206
- fm.file_name,
207
- fm.num_rows,
208
- fm.num_row_groups,
209
- CEIL({total_size}/(1024*1024)) as size,
210
- {vorder} as vorder,
211
- COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
212
- '{timestamp}' as timestamp
213
- FROM parquet_file_metadata({delta}) fm
214
- LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
215
- GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
216
- ''')
292
+ if detailed == True:
293
+ # Detailed mode: Include row group level statistics
294
+ con.execute(f'''
295
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
296
+ SELECT
297
+ '{schema_name}' as schema,
298
+ '{tbl}' as tbl,
299
+ pm.file_name,
300
+ pm.row_group_id,
301
+ pm.row_group_num_rows,
302
+ pm.row_group_num_columns,
303
+ pm.row_group_bytes,
304
+ {vorder} as vorder,
305
+ pm.compression,
306
+ pm.total_compressed_size,
307
+ pm.total_uncompressed_size,
308
+ ROUND(pm.total_compressed_size::DOUBLE / NULLIF(pm.total_uncompressed_size, 0), 4) as compression_ratio,
309
+ '{timestamp}' as timestamp
310
+ FROM parquet_metadata({delta}) pm
311
+ WHERE pm.column_id = 0 -- Only include first column to avoid duplication per column
312
+ ''')
313
+ else:
314
+ # Aggregated mode: Original summary statistics
315
+ con.execute(f'''
316
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
317
+ SELECT
318
+ '{schema_name}' as schema,
319
+ '{tbl}' as tbl,
320
+ fm.file_name,
321
+ fm.num_rows,
322
+ fm.num_row_groups,
323
+ CEIL({total_size}/(1024*1024)) as size,
324
+ {vorder} as vorder,
325
+ COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
326
+ '{timestamp}' as timestamp
327
+ FROM parquet_file_metadata({delta}) fm
328
+ LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
329
+ GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
330
+ ''')
217
331
 
218
332
  except Exception as e:
219
333
  error_msg = str(e)
@@ -237,6 +351,7 @@ def get_stats(duckrun_instance, source: str):
237
351
  con.execute(f'''
238
352
  CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
239
353
  SELECT
354
+ '{schema_name}' as schema,
240
355
  '{tbl}' as tbl,
241
356
  'empty' as file_name,
242
357
  0 as num_rows,
@@ -261,21 +376,45 @@ def get_stats(duckrun_instance, source: str):
261
376
  filenames.append(table_path + "/" + filename)
262
377
 
263
378
  # Use parquet_file_metadata to get actual parquet stats with compression
264
- con.execute(f'''
265
- CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
266
- SELECT
267
- '{tbl}' as tbl,
268
- fm.file_name,
269
- fm.num_rows,
270
- fm.num_row_groups,
271
- 0 as size,
272
- false as vorder,
273
- COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
274
- '{timestamp}' as timestamp
275
- FROM parquet_file_metadata({filenames}) fm
276
- LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
277
- GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
278
- ''')
379
+ if detailed == True:
380
+ # Detailed mode: Include row group level statistics
381
+ con.execute(f'''
382
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
383
+ SELECT
384
+ '{schema_name}' as schema,
385
+ '{tbl}' as tbl,
386
+ pm.file_name,
387
+ pm.row_group_id,
388
+ pm.row_group_num_rows,
389
+ pm.row_group_num_columns,
390
+ pm.row_group_bytes,
391
+ false as vorder,
392
+ pm.compression,
393
+ pm.total_compressed_size,
394
+ pm.total_uncompressed_size,
395
+ ROUND(pm.total_compressed_size::DOUBLE / NULLIF(pm.total_uncompressed_size, 0), 4) as compression_ratio,
396
+ '{timestamp}' as timestamp
397
+ FROM parquet_metadata({filenames}) pm
398
+ WHERE pm.column_id = 0 -- Only include first column to avoid duplication per column
399
+ ''')
400
+ else:
401
+ # Aggregated mode: Original summary statistics
402
+ con.execute(f'''
403
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
404
+ SELECT
405
+ '{schema_name}' as schema,
406
+ '{tbl}' as tbl,
407
+ fm.file_name,
408
+ fm.num_rows,
409
+ fm.num_row_groups,
410
+ 0 as size,
411
+ false as vorder,
412
+ COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
413
+ '{timestamp}' as timestamp
414
+ FROM parquet_file_metadata({filenames}) fm
415
+ LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
416
+ GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
417
+ ''')
279
418
 
280
419
  print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
281
420
  except Exception as fallback_error:
@@ -291,30 +430,59 @@ def get_stats(duckrun_instance, source: str):
291
430
  # No tables were processed successfully - return empty dataframe
292
431
  print("⚠️ No tables could be processed successfully")
293
432
  import pandas as pd
294
- return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
295
- 'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
433
+ if detailed == True:
434
+ return pd.DataFrame(columns=['schema', 'tbl', 'file_name', 'row_group_id', 'row_group_num_rows',
435
+ 'row_group_num_columns', 'row_group_bytes', 'vorder', 'compression',
436
+ 'total_compressed_size', 'total_uncompressed_size', 'compression_ratio', 'timestamp'])
437
+ else:
438
+ return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
439
+ 'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
296
440
 
297
441
  # Union all successfully processed temp tables
298
442
  union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
299
443
  union_query = ' UNION ALL '.join(union_parts)
300
444
 
301
- # Generate final summary
302
- final_result = con.execute(f'''
303
- SELECT
304
- tbl,
305
- SUM(num_rows) as total_rows,
306
- COUNT(*) as num_files,
307
- SUM(num_row_groups) as num_row_group,
308
- CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
309
- MIN(size) as file_size_MB,
310
- ANY_VALUE(vorder) as vorder,
311
- STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
312
- ANY_VALUE(timestamp) as timestamp
313
- FROM ({union_query})
314
- WHERE tbl IS NOT NULL
315
- GROUP BY tbl
316
- ORDER BY total_rows DESC
317
- ''').df()
445
+ # Generate final summary based on detailed flag
446
+ if detailed == True:
447
+ # Detailed mode: Return row group level data without aggregation
448
+ final_result = con.execute(f'''
449
+ SELECT
450
+ schema,
451
+ tbl,
452
+ file_name,
453
+ row_group_id,
454
+ row_group_num_rows,
455
+ row_group_num_columns,
456
+ row_group_bytes,
457
+ vorder,
458
+ compression,
459
+ total_compressed_size,
460
+ total_uncompressed_size,
461
+ compression_ratio,
462
+ timestamp
463
+ FROM ({union_query})
464
+ WHERE tbl IS NOT NULL
465
+ ORDER BY schema, tbl, file_name, row_group_id
466
+ ''').df()
467
+ else:
468
+ # Aggregated mode: Original summary statistics
469
+ final_result = con.execute(f'''
470
+ SELECT
471
+ schema,
472
+ tbl,
473
+ SUM(num_rows) as total_rows,
474
+ COUNT(*) as num_files,
475
+ SUM(num_row_groups) as num_row_group,
476
+ CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
477
+ MIN(size) as file_size_MB,
478
+ ANY_VALUE(vorder) as vorder,
479
+ STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
480
+ ANY_VALUE(timestamp) as timestamp
481
+ FROM ({union_query})
482
+ WHERE tbl IS NOT NULL
483
+ GROUP BY schema, tbl
484
+ ORDER BY total_rows DESC
485
+ ''').df()
318
486
 
319
487
  return final_result
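
Because both branches end in .df(), the detailed output is a pandas DataFrame with one row per row group, which makes compression analysis straightforward. A sketch, assuming the detailed-mode column names defined above:

import duckrun

con = duckrun.connect("tmp/data.lakehouse/aemo")
detail = con.get_stats("price", detailed=True)

# Average row-group size and compression ratio per file
per_file = (detail.groupby(["schema", "tbl", "file_name"])
                  .agg(row_groups=("row_group_id", "count"),
                       avg_rows=("row_group_num_rows", "mean"),
                       avg_compression_ratio=("compression_ratio", "mean")))
print(per_file.sort_values("avg_compression_ratio"))
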
320
488
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.18.dev1
3
+ Version: 0.2.19.dev0
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -0,0 +1,15 @@
1
+ duckrun/__init__.py,sha256=-DPOb_ETaBC0M7YqXj482FE1aZ-SxJeSeY6KB6hPgWU,350
2
+ duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
3
+ duckrun/core.py,sha256=jpg1okp6-Y4HubTJmSjyT9uhUc5pFr4A0tcNxNujSig,69086
4
+ duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
+ duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
+ duckrun/notebook.py,sha256=lzDRBoWZ_lePF-_5BbA1_42BImLZC5yrq6nzlmlKglM,12183
7
+ duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
8
+ duckrun/semantic_model.py,sha256=shRPBN1II60K_PH8JOqke-_3hAwLspcx4Add0VJRwwU,35913
9
+ duckrun/stats.py,sha256=HyzfDUGvYIxJ9QM8gbT_ISmVrVeEhhbxpxg1VLAgaRQ,23862
10
+ duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
11
+ duckrun-0.2.19.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
12
+ duckrun-0.2.19.dev0.dist-info/METADATA,sha256=I2EXHQLP-Gr_O2Y3yYiAb7el4OTeuutB5P-SvisnO4g,20807
13
+ duckrun-0.2.19.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ duckrun-0.2.19.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
15
+ duckrun-0.2.19.dev0.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- duckrun/__init__.py,sha256=0bJaY3gWsTwGcQS1P9KfaVOH9f8O-_CHXJzVbvqeOzA,355
2
- duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
3
- duckrun/core.py,sha256=DvxCBTob_OWOZAzcVqhoz5w95pxyH4sfoSmXMzG2BbY,68168
4
- duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
- duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
- duckrun/notebook.py,sha256=SzdKTpvzHiWMrvg7mCd3DN6R4gU_6Gm7gfkuETzylaE,12103
7
- duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
8
- duckrun/semantic_model.py,sha256=mkgAdi2hfJ1lkKhNo1vnPBNOFybFIxL34-zbP-71kAU,29516
9
- duckrun/stats.py,sha256=EqrCN1xwGo5nZgwezBvb6RepXT6b8H7xgK0yJJGFLfE,15155
10
- duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
11
- duckrun-0.2.18.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
12
- duckrun-0.2.18.dev1.dist-info/METADATA,sha256=fUEehSe7mTzCQuZmyoxbysSFih3x8XfcnLMHv-h3ues,20807
13
- duckrun-0.2.18.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- duckrun-0.2.18.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
15
- duckrun-0.2.18.dev1.dist-info/RECORD,,