duckrun 0.2.19.dev4__tar.gz → 0.2.19.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/core.py +2 -1
  3. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/rle.py +85 -24
  4. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/pyproject.toml +1 -1
  6. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/tests/test_rle.py +2 -2
  7. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/LICENSE +0 -0
  8. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/README.md +0 -0
  9. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/__init__.py +0 -0
  10. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/auth.py +0 -0
  11. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/files.py +0 -0
  12. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/lakehouse.py +0 -0
  13. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/notebook.py +0 -0
  14. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/runner.py +0 -0
  15. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/semantic_model.py +0 -0
  16. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/stats.py +0 -0
  17. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/writer.py +0 -0
  18. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/SOURCES.txt +0 -0
  19. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev4
3
+ Version: 0.2.19.dev6
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
1425
1425
  max_cardinality_pct=max_cardinality_pct,
1426
1426
  max_ordering_depth=max_ordering_depth,
1427
1427
  schema_name=schema_name,
1428
- table_display_name=tbl
1428
+ table_display_name=tbl,
1429
+ duckrun_instance=self # Pass duckrun instance for detailed parquet stats
1429
1430
  )
1430
1431
 
1431
1432
  def close(self):
@@ -1,6 +1,7 @@
1
1
  import itertools
2
2
  from typing import List, Dict, Tuple, Optional
3
3
  import pandas as pd
4
+ from .stats import get_stats
4
5
 
5
6
  def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
6
7
  """
@@ -238,22 +239,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
238
239
  Calculate cardinality ratio for each column (distinct_values / total_rows).
239
240
  Lower ratio = better for RLE compression (more repetition).
240
241
 
241
- NEVER uses sampling - always scans full dataset with exact or approximate distinct counts.
242
+ NEVER uses sampling - always scans full dataset with exact distinct counts.
242
243
 
243
244
  Args:
244
245
  con: DuckDB connection
245
246
  source: Either a table name (default) or parquet file path
246
247
  limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
247
248
  is_parquet: If True, source is a parquet file path; if False, source is a table name
248
- use_approx: If True, use HyperLogLog (approx). If False, use exact COUNT(DISTINCT).
249
- If None (default), auto-decide based on table size threshold.
250
- approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
249
+ use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
250
+ approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
251
251
 
252
252
  Returns:
253
253
  Dictionary mapping column names to dict with keys:
254
254
  - 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
255
255
  - 'total_rows': total row count
256
- - 'distinct_values': number of distinct values (exact or approximate)
256
+ - 'distinct_values': number of distinct values (exact)
257
257
  """
258
258
  # Build the FROM clause based on source type
259
259
  if is_parquet:
@@ -274,26 +274,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
274
274
  if not column_names:
275
275
  return {}
276
276
 
277
- # Auto-decide whether to use approximate or exact based on table size
278
- if use_approx is None:
279
- # Quick row count check
280
- total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
281
- use_approx = total_rows > approx_threshold
282
- if use_approx:
283
- print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
284
- else:
285
- print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
286
- else:
287
- total_rows = None # Will be calculated in main query
277
+ # Get row count
278
+ total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
279
+ print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
288
280
 
289
- # Build a single query that calculates all NFV scores in one pass
281
+ # Build a single query that calculates all cardinality in one pass
290
282
  # This scans the data only ONCE instead of once per column
291
283
  select_clauses = []
292
284
  for col in column_names:
293
- if use_approx:
294
- select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
295
- else:
296
- select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
285
+ select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
297
286
 
298
287
  query = f"""
299
288
  SELECT
@@ -307,8 +296,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
307
296
  if not result:
308
297
  return {}
309
298
 
310
- if total_rows is None:
311
- total_rows = result[0]
299
+ total_rows = result[0]
312
300
 
313
301
  nfv_stats = {}
314
302
 
@@ -405,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
405
393
  max_cardinality_pct: float = 0.01,
406
394
  max_ordering_depth: int = 3,
407
395
  schema_name: str = None,
408
- table_display_name: str = None) -> pd.DataFrame:
396
+ table_display_name: str = None,
397
+ duckrun_instance = None) -> pd.DataFrame:
409
398
  """
410
399
  Test column orderings for RLE optimization.
411
400
 
@@ -425,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
425
414
  max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
426
415
  schema_name: Optional schema name to include in results (default: None)
427
416
  table_display_name: Optional table name to include in results (default: None)
417
+ duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
428
418
 
429
419
  Returns:
430
420
  DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
@@ -485,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
485
475
  print(f"✓ Analysis complete!")
486
476
  print(f"{'='*60}")
487
477
 
478
+ # Get detailed parquet stats if duckrun_instance is provided
479
+ parquet_stats = None
480
+ vorder_status = False
481
+ table_size_mb = None
482
+ if duckrun_instance and table_display_name:
483
+ print("\nFetching detailed parquet metadata...")
484
+ try:
485
+ # For single-schema connections, just use the table name
486
+ # For multi-schema connections, use schema.table format
487
+ if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
488
+ source_param = f"{schema_name}.{table_display_name}"
489
+ else:
490
+ source_param = table_display_name
491
+
492
+ parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
493
+ print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
494
+
495
+ # Get vorder status from the stats if available
496
+ if 'vorder' in parquet_stats.columns:
497
+ vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
498
+
499
+ # Calculate total table size from compressed sizes
500
+ if 'total_compressed_size' in parquet_stats.columns:
501
+ total_bytes = parquet_stats['total_compressed_size'].sum()
502
+ table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
503
+ except Exception as e:
504
+ print(f"⚠️ Could not fetch parquet stats: {e}")
505
+ parquet_stats = None
506
+
488
507
  # Transform to long format
489
508
  long_format_results = []
490
509
 
@@ -502,6 +521,38 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
502
521
  # Get total rows and NDV from card_stats if available
503
522
  total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
504
523
 
524
+ # Aggregate parquet stats per column if available
525
+ parquet_by_column = {}
526
+ if parquet_stats is not None and not parquet_stats.empty:
527
+ # Determine column name field - can be 'name' or 'path_in_schema'
528
+ col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
529
+
530
+ # Group by column name and aggregate
531
+ for col_name in data_columns:
532
+ col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
533
+
534
+ if not col_stats.empty:
535
+ # Aggregate stats across all row groups for this column
536
+ total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
537
+ total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
538
+ # Handle both 'encodings' (multiple) and 'encoding' (single) column names
539
+ if 'encodings' in col_stats.columns:
540
+ encodings = col_stats['encodings'].unique().tolist()
541
+ elif 'encoding' in col_stats.columns:
542
+ encodings = col_stats['encoding'].unique().tolist()
543
+ else:
544
+ encodings = []
545
+ compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
546
+ num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
547
+
548
+ parquet_by_column[col_name] = {
549
+ 'encoding': ', '.join([str(e) for e in encodings if e is not None]),
550
+ 'compression': ', '.join([str(c) for c in compressions if c is not None]),
551
+ 'total_compressed_size': total_compressed,
552
+ 'total_uncompressed_size': total_uncompressed,
553
+ 'num_row_groups': num_row_groups
554
+ }
555
+
505
556
  # Create one row per data column
506
557
  for col in data_columns:
507
558
  rle_value = row[col]
@@ -509,6 +560,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
509
560
  # Get NDV from card_stats
510
561
  ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
511
562
 
563
+ # Get parquet stats for this column
564
+ col_parquet = parquet_by_column.get(col, {})
565
+
512
566
  long_format_results.append({
513
567
  'schema': schema_val,
514
568
  'table': table_val,
@@ -519,6 +573,13 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
519
573
  'NDV': ndv_value,
520
574
  'total_rows': total_rows,
521
575
  'total_RLE': total_rle_all_val,
576
+ 'encoding': col_parquet.get('encoding', ''),
577
+ 'compression': col_parquet.get('compression', ''),
578
+ 'total_compressed_size': col_parquet.get('total_compressed_size', None),
579
+ 'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
580
+ 'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
581
+ 'table_size_mb': table_size_mb,
582
+ 'vorder': vorder_status,
522
583
  'comments': ''
523
584
  })
524
585
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev4
3
+ Version: 0.2.19.dev6
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev4"
7
+ version = "0.2.19.dev6"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
7
  import duckrun
8
8
 
9
9
  # Analyze multiple schemas/tables
10
- conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
10
+ conn = duckrun.connect("tmp/data.lakehouse")
11
11
 
12
12
  # Analyze tables - now returns long format automatically
13
- result = conn.rle("calendar",'auto')
13
+ result = conn.rle("aemo.calendar")
14
14
  print(result)
15
15
  conn.close()
16
16
 
File without changes
File without changes
File without changes