duckrun 0.2.19.dev4__tar.gz → 0.2.19.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/core.py +2 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/rle.py +85 -24
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/pyproject.toml +1 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/tests/test_rle.py +2 -2
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/LICENSE +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/README.md +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/setup.cfg +0 -0
|
@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1425
1425
|
max_cardinality_pct=max_cardinality_pct,
|
|
1426
1426
|
max_ordering_depth=max_ordering_depth,
|
|
1427
1427
|
schema_name=schema_name,
|
|
1428
|
-
table_display_name=tbl
|
|
1428
|
+
table_display_name=tbl,
|
|
1429
|
+
duckrun_instance=self # Pass duckrun instance for detailed parquet stats
|
|
1429
1430
|
)
|
|
1430
1431
|
|
|
1431
1432
|
def close(self):
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
from typing import List, Dict, Tuple, Optional
|
|
3
3
|
import pandas as pd
|
|
4
|
+
from .stats import get_stats
|
|
4
5
|
|
|
5
6
|
def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
|
|
6
7
|
"""
|
|
@@ -238,22 +239,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
238
239
|
Calculate cardinality ratio for each column (distinct_values / total_rows).
|
|
239
240
|
Lower ratio = better for RLE compression (more repetition).
|
|
240
241
|
|
|
241
|
-
NEVER uses sampling - always scans full dataset with exact
|
|
242
|
+
NEVER uses sampling - always scans full dataset with exact distinct counts.
|
|
242
243
|
|
|
243
244
|
Args:
|
|
244
245
|
con: DuckDB connection
|
|
245
246
|
source: Either a table name (default) or parquet file path
|
|
246
247
|
limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
|
|
247
248
|
is_parquet: If True, source is a parquet file path; if False, source is a table name
|
|
248
|
-
use_approx:
|
|
249
|
-
|
|
250
|
-
approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
|
|
249
|
+
use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
|
|
250
|
+
approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
|
|
251
251
|
|
|
252
252
|
Returns:
|
|
253
253
|
Dictionary mapping column names to dict with keys:
|
|
254
254
|
- 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
|
|
255
255
|
- 'total_rows': total row count
|
|
256
|
-
- 'distinct_values': number of distinct values (exact
|
|
256
|
+
- 'distinct_values': number of distinct values (exact)
|
|
257
257
|
"""
|
|
258
258
|
# Build the FROM clause based on source type
|
|
259
259
|
if is_parquet:
|
|
@@ -274,26 +274,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
274
274
|
if not column_names:
|
|
275
275
|
return {}
|
|
276
276
|
|
|
277
|
-
#
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
281
|
-
use_approx = total_rows > approx_threshold
|
|
282
|
-
if use_approx:
|
|
283
|
-
print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
|
|
284
|
-
else:
|
|
285
|
-
print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
|
|
286
|
-
else:
|
|
287
|
-
total_rows = None # Will be calculated in main query
|
|
277
|
+
# Get row count
|
|
278
|
+
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
279
|
+
print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
|
|
288
280
|
|
|
289
|
-
# Build a single query that calculates all
|
|
281
|
+
# Build a single query that calculates all cardinality in one pass
|
|
290
282
|
# This scans the data only ONCE instead of once per column
|
|
291
283
|
select_clauses = []
|
|
292
284
|
for col in column_names:
|
|
293
|
-
|
|
294
|
-
select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
|
|
295
|
-
else:
|
|
296
|
-
select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
|
|
285
|
+
select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
|
|
297
286
|
|
|
298
287
|
query = f"""
|
|
299
288
|
SELECT
|
|
@@ -307,8 +296,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
307
296
|
if not result:
|
|
308
297
|
return {}
|
|
309
298
|
|
|
310
|
-
|
|
311
|
-
total_rows = result[0]
|
|
299
|
+
total_rows = result[0]
|
|
312
300
|
|
|
313
301
|
nfv_stats = {}
|
|
314
302
|
|
|
@@ -405,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
405
393
|
max_cardinality_pct: float = 0.01,
|
|
406
394
|
max_ordering_depth: int = 3,
|
|
407
395
|
schema_name: str = None,
|
|
408
|
-
table_display_name: str = None
|
|
396
|
+
table_display_name: str = None,
|
|
397
|
+
duckrun_instance = None) -> pd.DataFrame:
|
|
409
398
|
"""
|
|
410
399
|
Test column orderings for RLE optimization.
|
|
411
400
|
|
|
@@ -425,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
425
414
|
max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
|
|
426
415
|
schema_name: Optional schema name to include in results (default: None)
|
|
427
416
|
table_display_name: Optional table name to include in results (default: None)
|
|
417
|
+
duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
|
|
428
418
|
|
|
429
419
|
Returns:
|
|
430
420
|
DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
@@ -485,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
485
475
|
print(f"✓ Analysis complete!")
|
|
486
476
|
print(f"{'='*60}")
|
|
487
477
|
|
|
478
|
+
# Get detailed parquet stats if duckrun_instance is provided
|
|
479
|
+
parquet_stats = None
|
|
480
|
+
vorder_status = False
|
|
481
|
+
table_size_mb = None
|
|
482
|
+
if duckrun_instance and table_display_name:
|
|
483
|
+
print("\nFetching detailed parquet metadata...")
|
|
484
|
+
try:
|
|
485
|
+
# For single-schema connections, just use the table name
|
|
486
|
+
# For multi-schema connections, use schema.table format
|
|
487
|
+
if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
|
|
488
|
+
source_param = f"{schema_name}.{table_display_name}"
|
|
489
|
+
else:
|
|
490
|
+
source_param = table_display_name
|
|
491
|
+
|
|
492
|
+
parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
|
|
493
|
+
print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
|
|
494
|
+
|
|
495
|
+
# Get vorder status from the stats if available
|
|
496
|
+
if 'vorder' in parquet_stats.columns:
|
|
497
|
+
vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
|
|
498
|
+
|
|
499
|
+
# Calculate total table size from compressed sizes
|
|
500
|
+
if 'total_compressed_size' in parquet_stats.columns:
|
|
501
|
+
total_bytes = parquet_stats['total_compressed_size'].sum()
|
|
502
|
+
table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
|
|
503
|
+
except Exception as e:
|
|
504
|
+
print(f"⚠️ Could not fetch parquet stats: {e}")
|
|
505
|
+
parquet_stats = None
|
|
506
|
+
|
|
488
507
|
# Transform to long format
|
|
489
508
|
long_format_results = []
|
|
490
509
|
|
|
@@ -502,6 +521,38 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
502
521
|
# Get total rows and NDV from card_stats if available
|
|
503
522
|
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
504
523
|
|
|
524
|
+
# Aggregate parquet stats per column if available
|
|
525
|
+
parquet_by_column = {}
|
|
526
|
+
if parquet_stats is not None and not parquet_stats.empty:
|
|
527
|
+
# Determine column name field - can be 'name' or 'path_in_schema'
|
|
528
|
+
col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
|
|
529
|
+
|
|
530
|
+
# Group by column name and aggregate
|
|
531
|
+
for col_name in data_columns:
|
|
532
|
+
col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
|
|
533
|
+
|
|
534
|
+
if not col_stats.empty:
|
|
535
|
+
# Aggregate stats across all row groups for this column
|
|
536
|
+
total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
|
|
537
|
+
total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
|
|
538
|
+
# Handle both 'encodings' (multiple) and 'encoding' (single) column names
|
|
539
|
+
if 'encodings' in col_stats.columns:
|
|
540
|
+
encodings = col_stats['encodings'].unique().tolist()
|
|
541
|
+
elif 'encoding' in col_stats.columns:
|
|
542
|
+
encodings = col_stats['encoding'].unique().tolist()
|
|
543
|
+
else:
|
|
544
|
+
encodings = []
|
|
545
|
+
compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
|
|
546
|
+
num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
|
|
547
|
+
|
|
548
|
+
parquet_by_column[col_name] = {
|
|
549
|
+
'encoding': ', '.join([str(e) for e in encodings if e is not None]),
|
|
550
|
+
'compression': ', '.join([str(c) for c in compressions if c is not None]),
|
|
551
|
+
'total_compressed_size': total_compressed,
|
|
552
|
+
'total_uncompressed_size': total_uncompressed,
|
|
553
|
+
'num_row_groups': num_row_groups
|
|
554
|
+
}
|
|
555
|
+
|
|
505
556
|
# Create one row per data column
|
|
506
557
|
for col in data_columns:
|
|
507
558
|
rle_value = row[col]
|
|
@@ -509,6 +560,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
509
560
|
# Get NDV from card_stats
|
|
510
561
|
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
511
562
|
|
|
563
|
+
# Get parquet stats for this column
|
|
564
|
+
col_parquet = parquet_by_column.get(col, {})
|
|
565
|
+
|
|
512
566
|
long_format_results.append({
|
|
513
567
|
'schema': schema_val,
|
|
514
568
|
'table': table_val,
|
|
@@ -519,6 +573,13 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
519
573
|
'NDV': ndv_value,
|
|
520
574
|
'total_rows': total_rows,
|
|
521
575
|
'total_RLE': total_rle_all_val,
|
|
576
|
+
'encoding': col_parquet.get('encoding', ''),
|
|
577
|
+
'compression': col_parquet.get('compression', ''),
|
|
578
|
+
'total_compressed_size': col_parquet.get('total_compressed_size', None),
|
|
579
|
+
'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
|
|
580
|
+
'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
|
|
581
|
+
'table_size_mb': table_size_mb,
|
|
582
|
+
'vorder': vorder_status,
|
|
522
583
|
'comments': ''
|
|
523
584
|
})
|
|
524
585
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev4"
|
|
7
|
+
version = "0.2.19.dev6"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
7
7
|
import duckrun
|
|
8
8
|
|
|
9
9
|
# Analyze multiple schemas/tables
|
|
10
|
-
conn = duckrun.connect("tmp/data.lakehouse
|
|
10
|
+
conn = duckrun.connect("tmp/data.lakehouse")
|
|
11
11
|
|
|
12
12
|
# Analyze tables - now returns long format automatically
|
|
13
|
-
result = conn.rle("calendar"
|
|
13
|
+
result = conn.rle("aemo.calendar")
|
|
14
14
|
print(result)
|
|
15
15
|
conn.close()
|
|
16
16
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|