duckrun 0.2.18.dev5.tar.gz → 0.2.19.dev1.tar.gz

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev5
+Version: 0.2.19.dev1
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -3,7 +3,7 @@
 from duckrun.core import Duckrun
 from duckrun.notebook import import_notebook_from_web, import_notebook
 
-__version__ = "0.2.18.dev2"
+__version__ = "0.2.18"
 
 # Expose unified connect method at module level
 connect = Duckrun.connect
@@ -1035,7 +1035,7 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con
 
-    def get_stats(self, source: str = None):
+    def get_stats(self, source: str = None, detailed = False):
         """
         Get comprehensive statistics for Delta Lake tables.
 
@@ -1045,27 +1045,34 @@ class Duckrun(WorkspaceOperationsMixin):
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
+            detailed: Optional. Controls the level of detail in statistics:
+                - False (default): Aggregated table-level stats
+                - True: Row group level statistics with compression details
 
         Returns:
-            Arrow table with statistics including total rows, file count, row groups,
-            average row group size, file sizes, VORDER status, and timestamp
+            DataFrame with statistics based on detailed parameter:
+            - If detailed=False: Aggregated table-level summary
+            - If detailed=True: Granular file and row group level stats
 
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")
 
-            # All tables in current schema (aemo)
+            # All tables in current schema (aemo) - aggregated
            stats = con.get_stats()
 
-            # Single table in current schema
+            # Single table in current schema - aggregated
            stats = con.get_stats('price')
 
+            # Single table with detailed row group statistics
+            stats_detailed = con.get_stats('price', detailed=True)
+
            # Specific table in different schema
            stats = con.get_stats('aemo.price')
 
            # All tables in a schema
            stats = con.get_stats('aemo')
        """
-        return _get_stats(self, source)
+        return _get_stats(self, source, detailed)
 
    def list_lakehouses(self) -> List[str]:
        """
@@ -160,6 +160,7 @@ def import_notebook_from_web(
     update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
     payload = {
         "definition": {
+            "format": "ipynb",
             "parts": [
                 {
                     "path": "notebook-content.py",
@@ -192,6 +193,7 @@ def import_notebook_from_web(
     payload = {
         "displayName": notebook_name,
         "definition": {
+            "format": "ipynb",
             "parts": [
                 {
                     "path": "notebook-content.py",
@@ -103,7 +103,7 @@ def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
     return {}
 
 
-def get_stats(duckrun_instance, source: str = None):
+def get_stats(duckrun_instance, source: str = None, detailed = False):
     """
     Get comprehensive statistics for Delta Lake tables.
 
@@ -115,19 +115,25 @@ def get_stats(duckrun_instance, source: str = None):
            - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
            - Schema only: 'schema' (all tables in schema, if multi-schema)
            - Wildcard pattern: '*.summary' (matches tables across all schemas)
+        detailed: Optional. Controls the level of detail in statistics:
+            - False (default): Aggregated table-level stats (total rows, file count,
+              row groups, average row group size, file sizes, VORDER status)
+            - True: Row group level statistics with compression details, row group sizes,
+              and parquet metadata
 
    Returns:
-        Arrow table with statistics including total rows, file count, row groups,
-        average row group size, file sizes, VORDER status, and timestamp
+        DataFrame with statistics based on detailed parameter:
+        - If detailed=False: Aggregated table-level summary
+        - If detailed=True: Granular file and row group level stats
 
    Examples:
        con = duckrun.connect("tmp/data.lakehouse/test")
 
-        # All tables in the connection's schema
+        # All tables in the connection's schema (aggregated)
        stats = con.get_stats()
 
-        # Single table in main schema (DuckDB uses 'main', not 'test')
-        stats = con.get_stats('price_today')
+        # Single table with detailed row group statistics
+        stats_detailed = con.get_stats('price_today', detailed=True)
 
        # Specific table in different schema (only if multi-schema enabled)
        stats = con.get_stats('aemo.price')
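
Note: taken together, the docstring describes four addressing modes plus the new flag. A compact sketch of the call variants, with illustrative schema and table names (con comes from duckrun.connect, as in the docstring above):

    stats = con.get_stats()                 # every table in the current schema
    stats = con.get_stats("price_today")    # a single table, aggregated
    stats = con.get_stats("aemo.price")     # schema-qualified (multi-schema only)
    stats = con.get_stats("*.summary")      # wildcard across all schemas
    stats = con.get_stats("aemo", detailed=True)  # whole schema, row-group detail
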
@@ -283,22 +289,36 @@ def get_stats(duckrun_instance, source: str = None):
                 ''')
             else:
                 # Get parquet metadata and create temp table with compression info
-                con.execute(f'''
-                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                    SELECT
-                        '{schema_name}' as schema,
-                        '{tbl}' as tbl,
-                        fm.file_name,
-                        fm.num_rows,
-                        fm.num_row_groups,
-                        CEIL({total_size}/(1024*1024)) as size,
-                        {vorder} as vorder,
-                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
-                        '{timestamp}' as timestamp
-                    FROM parquet_file_metadata({delta}) fm
-                    LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
-                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
-                ''')
+                if detailed == True:
+                    # Detailed mode: Include ALL parquet_metadata columns
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            {vorder} as vorder,
+                            pm.*,
+                            '{timestamp}' as timestamp
+                        FROM parquet_metadata({delta}) pm
+                    ''')
+                else:
+                    # Aggregated mode: Original summary statistics
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            fm.file_name,
+                            fm.num_rows,
+                            fm.num_row_groups,
+                            CEIL({total_size}/(1024*1024)) as size,
+                            {vorder} as vorder,
+                            COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                            '{timestamp}' as timestamp
+                        FROM parquet_file_metadata({delta}) fm
+                        LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+                        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                    ''')
 
         except Exception as e:
             error_msg = str(e)
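
Note: in detailed mode the temp table is essentially raw parquet_metadata() output plus a few literal columns. DuckDB's parquet_metadata table function emits one row per column chunk per row group, with columns including file_name, row_group_id, column_id, path_in_schema, num_values, and compression. A quick way to preview what pm.* pulls in (the file path is illustrative):

    import duckdb

    con = duckdb.connect()
    # One row per (row group, column chunk) in the file
    meta = con.execute(
        "SELECT file_name, row_group_id, column_id, path_in_schema, "
        "num_values, compression "
        "FROM parquet_metadata('some_table/part-0001.parquet') "
        "ORDER BY row_group_id, column_id"
    ).df()
    print(meta.head())
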
@@ -347,22 +367,36 @@ def get_stats(duckrun_instance, source: str = None):
                     filenames.append(table_path + "/" + filename)
 
                 # Use parquet_file_metadata to get actual parquet stats with compression
-                con.execute(f'''
-                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                    SELECT
-                        '{schema_name}' as schema,
-                        '{tbl}' as tbl,
-                        fm.file_name,
-                        fm.num_rows,
-                        fm.num_row_groups,
-                        0 as size,
-                        false as vorder,
-                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
-                        '{timestamp}' as timestamp
-                    FROM parquet_file_metadata({filenames}) fm
-                    LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
-                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
-                ''')
+                if detailed == True:
+                    # Detailed mode: Include ALL parquet_metadata columns
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            false as vorder,
+                            pm.*,
+                            '{timestamp}' as timestamp
+                        FROM parquet_metadata({filenames}) pm
+                    ''')
+                else:
+                    # Aggregated mode: Original summary statistics
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            fm.file_name,
+                            fm.num_rows,
+                            fm.num_row_groups,
+                            0 as size,
+                            false as vorder,
+                            COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                            '{timestamp}' as timestamp
+                        FROM parquet_file_metadata({filenames}) fm
+                        LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+                        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                    ''')
 
                 print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
             except Exception as fallback_error:
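
Note: the fallback interpolates a Python list of file paths into the SQL, which DuckDB parses as a list-literal argument to both metadata functions (parquet_file_metadata yields one row per file; parquet_metadata one per column chunk). A minimal sketch of the join the aggregated branch performs, with illustrative paths:

    import duckdb

    con = duckdb.connect()
    filenames = ["tbl/part-0001.parquet", "tbl/part-0002.parquet"]  # illustrative

    # The list's repr becomes a DuckDB list literal inside the f-string
    df = con.execute(f"""
        SELECT fm.file_name, fm.num_rows, fm.num_row_groups,
               STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression) AS compression
        FROM parquet_file_metadata({filenames}) fm
        LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
    """).df()
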
@@ -378,31 +412,44 @@ def get_stats(duckrun_instance, source: str = None):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
-                                     'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
+        if detailed == True:
+            return pd.DataFrame(columns=['schema', 'tbl', 'vorder', 'timestamp'])
+        else:
+            return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
+                                         'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
 
     # Union all successfully processed temp tables
     union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
     union_query = ' UNION ALL '.join(union_parts)
 
-    # Generate final summary
-    final_result = con.execute(f'''
-        SELECT
-            schema,
-            tbl,
-            SUM(num_rows) as total_rows,
-            COUNT(*) as num_files,
-            SUM(num_row_groups) as num_row_group,
-            CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
-            MIN(size) as file_size_MB,
-            ANY_VALUE(vorder) as vorder,
-            STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
-            ANY_VALUE(timestamp) as timestamp
-        FROM ({union_query})
-        WHERE tbl IS NOT NULL
-        GROUP BY schema, tbl
-        ORDER BY total_rows DESC
-    ''').df()
+    # Generate final summary based on detailed flag
+    if detailed == True:
+        # Detailed mode: Return ALL parquet_metadata columns
+        final_result = con.execute(f'''
+            SELECT *
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            ORDER BY schema, tbl, file_name, row_group_id, column_id
+        ''').df()
+    else:
+        # Aggregated mode: Original summary statistics
+        final_result = con.execute(f'''
+            SELECT
+                schema,
+                tbl,
+                SUM(num_rows) as total_rows,
+                COUNT(*) as num_files,
+                SUM(num_row_groups) as num_row_group,
+                CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+                MIN(size) as file_size_MB,
+                ANY_VALUE(vorder) as vorder,
+                STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
+                ANY_VALUE(timestamp) as timestamp
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            GROUP BY schema, tbl
+            ORDER BY total_rows DESC
+        ''').df()
 
     return final_result
 
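
Note: in the aggregated branch, average_row_group is a derived metric: total rows divided by total row groups, rounded up, with NULLIF guarding against division by zero. A worked example of that arithmetic, with illustrative counts:

    import math

    # Illustrative table: three files with 8 + 8 + 1 row groups
    total_rows = 1_000_000 + 950_000 + 50_000   # SUM(num_rows) = 2_000_000
    total_row_groups = 8 + 8 + 1                # SUM(num_row_groups) = 17

    # Mirrors CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER)
    average_row_group = (
        math.ceil(total_rows / total_row_groups) if total_row_groups else None
    )
    print(average_row_group)  # 117648
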
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev5
+Version: 0.2.19.dev1
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.18.dev5"
+version = "0.2.19.dev1"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}