duckrun-0.2.18.dev5.tar.gz → duckrun-0.2.19.dev1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/PKG-INFO +1 -1
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/__init__.py +1 -1
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/core.py +13 -6
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/notebook.py +2 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/stats.py +105 -58
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/pyproject.toml +1 -1
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/LICENSE +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/README.md +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/auth.py +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/files.py +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/runner.py +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun/writer.py +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.18.dev5 → duckrun-0.2.19.dev1}/setup.cfg +0 -0
duckrun/core.py

```diff
@@ -1035,7 +1035,7 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con
 
-    def get_stats(self, source: str = None):
+    def get_stats(self, source: str = None, detailed = False):
         """
         Get comprehensive statistics for Delta Lake tables.
 
@@ -1045,27 +1045,34 @@
             Table name: 'table_name' (uses current schema)
             Schema.table: 'schema.table_name' (specific table in schema)
             Schema only: 'schema' (all tables in schema)
+            detailed: Optional. Controls the level of detail in statistics:
+                - False (default): Aggregated table-level stats
+                - True: Row group level statistics with compression details
 
         Returns:
-
-
+            DataFrame with statistics based on detailed parameter:
+            - If detailed=False: Aggregated table-level summary
+            - If detailed=True: Granular file and row group level stats
 
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")
 
-            # All tables in current schema (aemo)
+            # All tables in current schema (aemo) - aggregated
            stats = con.get_stats()
 
-            # Single table in current schema
+            # Single table in current schema - aggregated
            stats = con.get_stats('price')
 
+            # Single table with detailed row group statistics
+            stats_detailed = con.get_stats('price', detailed=True)
+
            # Specific table in different schema
            stats = con.get_stats('aemo.price')
 
            # All tables in a schema
            stats = con.get_stats('aemo')
        """
-        return _get_stats(self, source)
+        return _get_stats(self, source, detailed)
 
    def list_lakehouses(self) -> List[str]:
        """
```
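For orientation, a minimal usage sketch of the new `detailed` flag, based on the docstring above (the lakehouse path and table name are illustrative):

```python
import duckrun

con = duckrun.connect("tmp/data.lakehouse/aemo")  # illustrative path

# Aggregated mode (default): one summary row per table
agg = con.get_stats("price")

# Detailed mode: one row per parquet_metadata entry (row group x column chunk)
detail = con.get_stats("price", detailed=True)

# The two modes return different column sets
print(agg.columns.tolist())     # schema, tbl, total_rows, num_files, ...
print(detail.columns.tolist())  # schema, tbl, vorder, plus parquet metadata columns
```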
duckrun/notebook.py

```diff
@@ -160,6 +160,7 @@ def import_notebook_from_web(
     update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
     payload = {
         "definition": {
+            "format": "ipynb",
             "parts": [
                 {
                     "path": "notebook-content.py",
@@ -192,6 +193,7 @@ def import_notebook_from_web(
     payload = {
         "displayName": notebook_name,
         "definition": {
+            "format": "ipynb",
             "parts": [
                 {
                     "path": "notebook-content.py",
```
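Both hunks insert `"format": "ipynb"` into the notebook definition payload sent to the Fabric REST API, presumably so the service treats the payload as Jupyter notebook JSON rather than the default source format. A sketch of the resulting payload shape; the encoding step and the `payloadType` value are assumptions, since they fall outside these hunks:

```python
import base64

notebook_json = '{"cells": []}'  # hypothetical notebook content

payload = {
    "displayName": "my_notebook",  # present on the create path (second hunk)
    "definition": {
        "format": "ipynb",  # the field added in this release
        "parts": [
            {
                "path": "notebook-content.py",
                # assumed: inline base64 payload, as is typical for this API
                "payload": base64.b64encode(notebook_json.encode()).decode(),
                "payloadType": "InlineBase64",
            }
        ],
    },
}
```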
duckrun/stats.py

```diff
@@ -103,7 +103,7 @@ def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
         return {}
 
 
-def get_stats(duckrun_instance, source: str = None):
+def get_stats(duckrun_instance, source: str = None, detailed = False):
     """
     Get comprehensive statistics for Delta Lake tables.
 
@@ -115,19 +115,25 @@ def get_stats(duckrun_instance, source: str = None):
         Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
         Schema only: 'schema' (all tables in schema, if multi-schema)
         Wildcard pattern: '*.summary' (matches tables across all schemas)
+        detailed: Optional. Controls the level of detail in statistics:
+            - False (default): Aggregated table-level stats (total rows, file count,
+              row groups, average row group size, file sizes, VORDER status)
+            - True: Row group level statistics with compression details, row group sizes,
+              and parquet metadata
 
     Returns:
-
-
+        DataFrame with statistics based on detailed parameter:
+        - If detailed=False: Aggregated table-level summary
+        - If detailed=True: Granular file and row group level stats
 
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")
 
-        # All tables in the connection's schema
+        # All tables in the connection's schema (aggregated)
         stats = con.get_stats()
 
-        # Single table
-
+        # Single table with detailed row group statistics
+        stats_detailed = con.get_stats('price_today', detailed=True)
 
         # Specific table in different schema (only if multi-schema enabled)
         stats = con.get_stats('aemo.price')
```
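The wildcard form is worth a one-line illustration, since it has no equivalent in the `Duckrun.get_stats` docstring above (a multi-schema lakehouse and the `con` from the example are assumed):

```python
# Matches every table named 'summary' across all schemas, aggregated by default
summaries = con.get_stats('*.summary')
summaries_detailed = con.get_stats('*.summary', detailed=True)
```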
```diff
@@ -283,22 +289,36 @@ def get_stats(duckrun_instance, source: str = None):
             ''')
         else:
             # Get parquet metadata and create temp table with compression info
-            con.execute(f'''
-                CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                SELECT
-                    '{schema_name}' as schema,
-                    '{tbl}' as tbl,
-                    fm.file_name,
-                    fm.num_rows,
-                    fm.num_row_groups,
-                    CEIL({total_size}/(1024*1024)) as size,
-                    {vorder} as vorder,
-                    COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
-                    '{timestamp}' as timestamp
-                FROM parquet_file_metadata({delta}) fm
-                LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
-                GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
-            ''')
+            if detailed == True:
+                # Detailed mode: Include ALL parquet_metadata columns
+                con.execute(f'''
+                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                    SELECT
+                        '{schema_name}' as schema,
+                        '{tbl}' as tbl,
+                        {vorder} as vorder,
+                        pm.*,
+                        '{timestamp}' as timestamp
+                    FROM parquet_metadata({delta}) pm
+                ''')
+            else:
+                # Aggregated mode: Original summary statistics
+                con.execute(f'''
+                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                    SELECT
+                        '{schema_name}' as schema,
+                        '{tbl}' as tbl,
+                        fm.file_name,
+                        fm.num_rows,
+                        fm.num_row_groups,
+                        CEIL({total_size}/(1024*1024)) as size,
+                        {vorder} as vorder,
+                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                        '{timestamp}' as timestamp
+                    FROM parquet_file_metadata({delta}) fm
+                    LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                ''')
 
         except Exception as e:
             error_msg = str(e)
```
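Both branches build on DuckDB's built-in Parquet metadata table functions. A standalone sketch of what each one returns, independent of duckrun (the file glob is illustrative):

```python
import duckdb

con = duckdb.connect()

# parquet_file_metadata: one row per file (num_rows, num_row_groups, ...)
files = con.execute("""
    SELECT file_name, num_rows, num_row_groups
    FROM parquet_file_metadata('Tables/price/*.parquet')
""").df()

# parquet_metadata: one row per column chunk per row group, including the
# compression codec -- this is what detailed=True surfaces via pm.*
chunks = con.execute("""
    SELECT file_name, row_group_id, column_id, compression, total_compressed_size
    FROM parquet_metadata('Tables/price/*.parquet')
""").df()
```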
```diff
@@ -347,22 +367,36 @@ def get_stats(duckrun_instance, source: str = None):
                     filenames.append(table_path + "/" + filename)
 
                 # Use parquet_file_metadata to get actual parquet stats with compression
-                con.execute(f'''
-                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                    SELECT
-                        '{schema_name}' as schema,
-                        '{tbl}' as tbl,
-                        fm.file_name,
-                        fm.num_rows,
-                        fm.num_row_groups,
-                        0 as size,
-                        false as vorder,
-                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
-                        '{timestamp}' as timestamp
-                    FROM parquet_file_metadata({filenames}) fm
-                    LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
-                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
-                ''')
+                if detailed == True:
+                    # Detailed mode: Include ALL parquet_metadata columns
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            false as vorder,
+                            pm.*,
+                            '{timestamp}' as timestamp
+                        FROM parquet_metadata({filenames}) pm
+                    ''')
+                else:
+                    # Aggregated mode: Original summary statistics
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            fm.file_name,
+                            fm.num_rows,
+                            fm.num_row_groups,
+                            0 as size,
+                            false as vorder,
+                            COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                            '{timestamp}' as timestamp
+                        FROM parquet_file_metadata({filenames}) fm
+                        LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+                        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                    ''')
 
                 print(f"  ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
             except Exception as fallback_error:
```
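Note that `{filenames}` interpolates a Python list into the SQL text; a list repr like `['a.parquet', 'b.parquet']` happens to be valid DuckDB list syntax, which is what makes this fallback work. A sketch with illustrative OneLake URLs:

```python
filenames = [
    "abfss://ws@onelake.dfs.fabric.microsoft.com/lh.Lakehouse/Tables/price/part-0000.parquet",
    "abfss://ws@onelake.dfs.fabric.microsoft.com/lh.Lakehouse/Tables/price/part-0001.parquet",
]

# con is the DuckDB connection from the surrounding code
df = con.execute(
    f"SELECT file_name, num_rows FROM parquet_file_metadata({filenames})"
).df()
```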
```diff
@@ -378,31 +412,44 @@ def get_stats(duckrun_instance, source: str = None):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
-                                     'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
+        if detailed == True:
+            return pd.DataFrame(columns=['schema', 'tbl', 'vorder', 'timestamp'])
+        else:
+            return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
+                                         'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
 
     # Union all successfully processed temp tables
     union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
     union_query = ' UNION ALL '.join(union_parts)
 
-    # Generate final summary
-    final_result = con.execute(f'''
-        SELECT
-            schema,
-            tbl,
-            SUM(num_rows) as total_rows,
-            COUNT(*) as num_files,
-            SUM(num_row_groups) as num_row_group,
-            CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
-            MIN(size) as file_size_MB,
-            ANY_VALUE(vorder) as vorder,
-            STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
-            ANY_VALUE(timestamp) as timestamp
-        FROM ({union_query})
-        WHERE tbl IS NOT NULL
-        GROUP BY schema, tbl
-        ORDER BY total_rows DESC
-    ''').df()
+    # Generate final summary based on detailed flag
+    if detailed == True:
+        # Detailed mode: Return ALL parquet_metadata columns
+        final_result = con.execute(f'''
+            SELECT *
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            ORDER BY schema, tbl, file_name, row_group_id, column_id
+        ''').df()
+    else:
+        # Aggregated mode: Original summary statistics
+        final_result = con.execute(f'''
+            SELECT
+                schema,
+                tbl,
+                SUM(num_rows) as total_rows,
+                COUNT(*) as num_files,
+                SUM(num_row_groups) as num_row_group,
+                CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+                MIN(size) as file_size_MB,
+                ANY_VALUE(vorder) as vorder,
+                STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
+                ANY_VALUE(timestamp) as timestamp
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            GROUP BY schema, tbl
+            ORDER BY total_rows DESC
+        ''').df()
 
     return final_result
 
```
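One detail worth calling out in the aggregated query: `NULLIF(SUM(num_row_groups), 0)` turns a zero row-group count into NULL so the average-row-group calculation never divides by zero. A self-contained sketch with invented numbers:

```python
import duckdb

con = duckdb.connect()
con.execute("""
    CREATE TEMP TABLE per_file AS
    SELECT * FROM (VALUES
        ('price', 1000000, 2),
        ('price',  500000, 1),
        ('scada',       0, 0)   -- empty file: zero row groups
    ) t(tbl, num_rows, num_row_groups)
""")
print(con.execute("""
    SELECT tbl,
           SUM(num_rows) AS total_rows,
           COUNT(*) AS num_files,
           -- NULLIF yields NULL (not an error) for the zero-row-group case
           CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER)
               AS average_row_group
    FROM per_file
    GROUP BY tbl
    ORDER BY total_rows DESC
""").df())
```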
pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.18.dev5"
+version = "0.2.19.dev1"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}
```