duckrun 0.2.16.dev2-py3-none-any.whl → 0.2.19.dev5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/__init__.py +4 -2
- duckrun/core.py +276 -47
- duckrun/notebook.py +324 -0
- duckrun/rle.py +860 -0
- duckrun/semantic_model.py +143 -17
- duckrun/stats.py +202 -67
- {duckrun-0.2.16.dev2.dist-info → duckrun-0.2.19.dev5.dist-info}/METADATA +2 -2
- duckrun-0.2.19.dev5.dist-info/RECORD +16 -0
- duckrun-0.2.16.dev2.dist-info/RECORD +0 -14
- {duckrun-0.2.16.dev2.dist-info → duckrun-0.2.19.dev5.dist-info}/WHEEL +0 -0
- {duckrun-0.2.16.dev2.dist-info → duckrun-0.2.19.dev5.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.16.dev2.dist-info → duckrun-0.2.19.dev5.dist-info}/top_level.txt +0 -0
duckrun/semantic_model.py
CHANGED
@@ -129,29 +129,136 @@ def check_dataset_exists(dataset_name, workspace_id, client):
     return False


-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """
+
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print(" Ignoring refresh - skipping refresh")
+        return

     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)

-
-
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print(" Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f" Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f" ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f" ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f" Still waiting... (status: {current_status})")
+                    else:
+                        print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f" ⚠️ Could not check refresh status: {e}")
+        print(f" Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print(" Step 1: Clearing values from memory...")
+    clearvalues_payload = {
+        "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
         "retryCount": 2,
         "objects": []
     }

-
-    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
-    headers = client._get_headers()
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)

-    response
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print(" ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f" ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f" Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print(" ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print(" Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
+
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)

     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f" ✓ Refresh initiated")

         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
@@ -183,7 +290,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):

                     raise Exception(f"Refresh timed out")
     else:
-
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()


 def download_bim_from_github(url_or_path):
@@ -431,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):


 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                          bim_url_or_path, wait_seconds=5):
+                          bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.

@@ -442,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely

     Returns:
         1 for success, 0 for failure
@@ -454,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
@@ -471,14 +590,14 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)

     if dataset_exists:
-        print(f"
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")

         if wait_seconds > 0:
             print(f" Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)

-        print("\n[Step
-        refresh_dataset(dataset_name, workspace_id, client)
+        print("\n[Step 3/3] Refreshing existing semantic model...")
+        refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)

         print("\n" + "=" * 70)
         print("🎉 Refresh Completed!")
@@ -510,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam

     # Step 6: Refresh using the dataset ID returned from creation
     print("\n[Step 6/6] Refreshing semantic model...")
-    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)

     print("\n" + "=" * 70)
     print("🎉 Deployment Completed!")
@@ -537,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     return 0


-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.

@@ -550,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely

     Returns:
         1 for success, 0 for failure
@@ -562,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")

+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -688,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         schema_name=schema,
         dataset_name=new_model_name,
         bim_url_or_path=temp_bim_path,
-        wait_seconds=wait_seconds
+        wait_seconds=wait_seconds,
+        refresh=refresh
     )

     # Clean up temp file
duckrun/stats.py
CHANGED
@@ -60,32 +60,89 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []


-def
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+              AND table_schema != 'information_schema'
+              AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+
+        matched = {}
+
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+
+        return matched
+    except:
+        return {}
+
+
+def get_stats(duckrun_instance, source: str = None, detailed = False):
     """
     Get comprehensive statistics for Delta Lake tables.

     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
             - Schema only: 'schema' (all tables in schema, if multi-schema)
+            - Wildcard pattern: '*.summary' (matches tables across all schemas)
+        detailed: Optional. Controls the level of detail in statistics:
+            - False (default): Aggregated table-level stats (total rows, file count,
+              row groups, average row group size, file sizes, VORDER status)
+            - True: Row group level statistics with compression details, row group sizes,
+              and parquet metadata

     Returns:
-
-
+        DataFrame with statistics based on detailed parameter:
+        - If detailed=False: Aggregated table-level summary
+        - If detailed=True: Granular file and row group level stats

     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")

-        #
-        stats = con.get_stats(
+        # All tables in the connection's schema (aggregated)
+        stats = con.get_stats()
+
+        # Single table with detailed row group statistics
+        stats_detailed = con.get_stats('price_today', detailed=True)

         # Specific table in different schema (only if multi-schema enabled)
         stats = con.get_stats('aemo.price')

         # All tables in a schema (only if multi-schema enabled)
         stats = con.get_stats('aemo')
+
+        # Wildcard pattern across all schemas (only if multi-schema enabled)
+        stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

@@ -93,8 +150,31 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema # This is from the connection URL path

+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
     # Parse the source and validate existence
-
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)

@@ -105,46 +185,45 @@ def get_stats(duckrun_instance, source: str):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")

-
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
             # Multi-schema mode: DuckDB has actual schemas
             # First check if it's a table in main schema
             if _table_exists(duckrun_instance, duckdb_schema, source):
-
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
             # Otherwise, check if it's a schema name
             elif _schema_exists(duckrun_instance, source):
                 schema_name = source
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                 if not list_tables:
                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+                tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
         else:
             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
             if _table_exists(duckrun_instance, duckdb_schema, source):
                 # It's a table name
-
-                schema_name = url_schema # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
             elif source == url_schema:
                 # Special case: user asked for stats on the URL schema name - list all tables
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema # Use URL schema for file path construction
                 if not list_tables:
                     raise ValueError(f"No tables found in schema '{url_schema}'")
+                tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")

     # Use the existing connection
     con = duckrun_instance.con

-    print(f"Processing {len(
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")

     successful_tables = []
-    for idx, tbl in enumerate(
-    print(f"[{idx+1}/{len(
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"

@@ -171,8 +250,18 @@ def get_stats(duckrun_instance, source: str):
             print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
             xx = {}

-        # Check if VORDER exists
-
+        # Check if VORDER exists - handle both formats:
+        # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+        # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+        vorder = False
+        if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+            vorder = True
+        elif 'tags' in xx.keys() and xx['tags']:
+            # Check nested tags dictionary (tags is a list of dicts, one per file)
+            for tag_dict in xx['tags']:
+                if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                    vorder = True
+                    break

         # Calculate total size
         total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
@@ -187,6 +276,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -199,21 +289,36 @@ def get_stats(duckrun_instance, source: str):
             ''')
         else:
             # Get parquet metadata and create temp table with compression info
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if detailed == True:
+                # Detailed mode: Include ALL parquet_metadata columns
+                con.execute(f'''
+                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                    SELECT
+                        '{schema_name}' as schema,
+                        '{tbl}' as tbl,
+                        {vorder} as vorder,
+                        pm.*,
+                        '{timestamp}' as timestamp
+                    FROM parquet_metadata({delta}) pm
+                ''')
+            else:
+                # Aggregated mode: Original summary statistics
+                con.execute(f'''
+                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                    SELECT
+                        '{schema_name}' as schema,
+                        '{tbl}' as tbl,
+                        fm.file_name,
+                        fm.num_rows,
+                        fm.num_row_groups,
+                        CEIL({total_size}/(1024*1024)) as size,
+                        {vorder} as vorder,
+                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                        '{timestamp}' as timestamp
+                    FROM parquet_file_metadata({delta}) fm
+                    LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                ''')

         except Exception as e:
             error_msg = str(e)
@@ -237,6 +342,7 @@ def get_stats(duckrun_instance, source: str):
                 con.execute(f'''
                     CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                     SELECT
+                        '{schema_name}' as schema,
                         '{tbl}' as tbl,
                         'empty' as file_name,
                         0 as num_rows,
@@ -261,21 +367,36 @@ def get_stats(duckrun_instance, source: str):
                     filenames.append(table_path + "/" + filename)

                 # Use parquet_file_metadata to get actual parquet stats with compression
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if detailed == True:
+                    # Detailed mode: Include ALL parquet_metadata columns
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            false as vorder,
+                            pm.*,
+                            '{timestamp}' as timestamp
+                        FROM parquet_metadata({filenames}) pm
+                    ''')
+                else:
+                    # Aggregated mode: Original summary statistics
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            fm.file_name,
+                            fm.num_rows,
+                            fm.num_row_groups,
+                            0 as size,
+                            false as vorder,
+                            COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                            '{timestamp}' as timestamp
+                        FROM parquet_file_metadata({filenames}) fm
+                        LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+                        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                    ''')

                 print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
             except Exception as fallback_error:
@@ -291,30 +412,44 @@ def get_stats(duckrun_instance, source: str):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-
-
+        if detailed == True:
+            return pd.DataFrame(columns=['schema', 'tbl', 'vorder', 'timestamp'])
+        else:
+            return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
+                                         'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])

     # Union all successfully processed temp tables
     union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
     union_query = ' UNION ALL '.join(union_parts)

-    # Generate final summary
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Generate final summary based on detailed flag
+    if detailed == True:
+        # Detailed mode: Return ALL parquet_metadata columns
+        final_result = con.execute(f'''
+            SELECT *
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            ORDER BY schema, tbl, file_name, row_group_id, column_id
+        ''').df()
+    else:
+        # Aggregated mode: Original summary statistics
+        final_result = con.execute(f'''
+            SELECT
+                schema,
+                tbl,
+                SUM(num_rows) as total_rows,
+                COUNT(*) as num_files,
+                SUM(num_row_groups) as num_row_group,
+                CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+                MIN(size) as file_size_MB,
+                ANY_VALUE(vorder) as vorder,
+                STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
+                ANY_VALUE(timestamp) as timestamp
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            GROUP BY schema, tbl
+            ORDER BY total_rows DESC
+        ''').df()

     return final_result

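The wildcard support added to get_stats is delegated to _match_tables_by_pattern, which is plain fnmatch matching over the (table_schema, table_name) pairs read from information_schema. A small standalone sketch of that matching logic, with made-up sample rows standing in for the live DuckDB catalog:

    import fnmatch

    # Hypothetical (schema, table) pairs; in the library these come from information_schema.tables
    rows = [("dbo", "summary"), ("aemo", "summary"), ("aemo", "price"), ("dbo", "price_today")]

    def match(pattern, rows):
        matched = {}
        if '.' in pattern:
            # schema.table pattern: split once, match each part independently
            schema_pat, table_pat = pattern.split('.', 1)
            hits = [(s, t) for s, t in rows
                    if fnmatch.fnmatch(s, schema_pat) and fnmatch.fnmatch(t, table_pat)]
        else:
            # no dot: match against table names only
            hits = [(s, t) for s, t in rows if fnmatch.fnmatch(t, pattern)]
        for s, t in hits:
            matched.setdefault(s, []).append(t)
        return matched

    print(match("*.summary", rows))   # {'dbo': ['summary'], 'aemo': ['summary']}
    print(match("price*", rows))      # {'aemo': ['price'], 'dbo': ['price_today']}

A pattern containing a dot is split once into a schema pattern and a table pattern; anything else is matched against table names only, which is why '*.summary' and '*summary' behave differently.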
{duckrun-0.2.16.dev2.dist-info → duckrun-0.2.19.dev5.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.
-Summary:
+Version: 0.2.19.dev5
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun