duckrun 0.2.22.dev0__tar.gz → 0.2.22.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/PKG-INFO +1 -1
  2. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/__init__.py +2 -3
  3. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/core.py +34 -1
  4. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/stats.py +192 -0
  5. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/PKG-INFO +1 -1
  6. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/SOURCES.txt +2 -1
  7. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/pyproject.toml +1 -1
  8. duckrun-0.2.22.dev2/tests/test_consecutive_values.py +115 -0
  9. duckrun-0.2.22.dev2/tests/test_rle_summary.py +22 -0
  10. duckrun-0.2.22.dev0/duckrun/rle.py +0 -362
  11. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/LICENSE +0 -0
  12. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/README.md +0 -0
  13. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/auth.py +0 -0
  14. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/ducklake_metadata.py +0 -0
  15. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/files.py +0 -0
  16. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/lakehouse.py +0 -0
  17. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/notebook.py +0 -0
  18. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/runner.py +0 -0
  19. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/semantic_model.py +0 -0
  20. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/writer.py +0 -0
  21. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
  22. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/requires.txt +0 -0
  23. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/top_level.txt +0 -0
  24. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/setup.cfg +0 -0
  25. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_checkpoint_format.py +0 -0
  26. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_deploy_fresh.py +0 -0
  27. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_ducklake_export.py +0 -0
  28. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_filename.py +0 -0
  29. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_register.py +0 -0
  30. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_rle.py +0 -0
  31. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_rle_analysis.py +0 -0
  32. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_writer_dictionary.py +0 -0
  33. {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_writer_integration.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.22.dev0
3
+ Version: 0.2.22.dev2
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -2,11 +2,10 @@
2
2
 
3
3
  from duckrun.core import Duckrun
4
4
  from duckrun.notebook import import_notebook_from_web, import_notebook
5
- from duckrun import rle
6
5
 
7
- __version__ = "0.2.18"
6
+ __version__ = "0.2.22.dev2"
8
7
 
9
8
  # Expose unified connect method at module level
10
9
  connect = Duckrun.connect
11
10
 
12
- __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
11
+ __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
@@ -7,7 +7,7 @@ import time
7
7
  from typing import List, Tuple, Union, Optional, Callable, Dict, Any
8
8
  from string import Template
9
9
  from datetime import datetime
10
- from .stats import get_stats as _get_stats
10
+ from .stats import get_stats as _get_stats, get_rle as _get_rle
11
11
  from .runner import run as _run
12
12
  from .files import copy as _copy, download as _download
13
13
  from .writer import QueryResult
@@ -1181,6 +1181,39 @@ class Duckrun(WorkspaceOperationsMixin):
1181
1181
  """
1182
1182
  return _get_stats(self, source, detailed)
1183
1183
 
1184
def get_rle(self, source: Optional[str] = None):
    """
    Get RLE (Run-Length Encoding) statistics for Delta Lake tables.

    Measures compression potential by counting runs of consecutive
    identical values, column by column. This method is a thin wrapper
    that delegates to ``get_rle`` in ``duckrun.stats``.

    Args:
        source: Optional. Can be one of:
            - None: Use all tables in the connection's schema (default)
            - Table name: 'table_name' (uses current schema)
            - Schema.table: 'schema.table_name' (specific table in schema)
            - Schema only: 'schema' (all tables in schema)
            - Wildcard patterns: '*.summary' or 'schema.*'

    Returns:
        DataFrame with one row per column of every processed table:
            - schema_name: Schema name
            - table_name: Table name
            - column_name: Column name
            - total_rows: Total number of rows in the table
            - rle_runs: RLE runs for this column in natural order
            - ndv: Number of distinct values in this column
            - total_rle_runs: Sum of rle_runs over the table's columns
              (repeated on every row of that table)

    Examples:
        con = duckrun.connect("tmp/data.lakehouse/aemo")

        # All tables in current schema
        rle = con.get_rle()

        # Single table in current schema
        rle = con.get_rle('price')

        # Specific table in different schema
        rle = con.get_rle('deltars.summary')

        # All tables matching pattern
        rle = con.get_rle('*.summary')
    """
    return _get_rle(self, source)
1216
+
1184
1217
  def list_lakehouses(self) -> List[str]:
1185
1218
  """
1186
1219
  List all lakehouses in the current workspace.
@@ -454,3 +454,195 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
454
454
  return final_result
455
455
 
456
456
 
457
# Column layout of the DataFrame returned by get_rle (also used when it is empty).
_RLE_RESULT_COLUMNS = ['schema_name', 'table_name', 'column_name', 'total_rows',
                       'rle_runs', 'ndv', 'total_rle_runs']


def _rle_quote_ident(name) -> str:
    """Quote *name* as a SQL identifier, doubling any embedded double quote."""
    return '"' + str(name).replace('"', '""') + '"'


def _rle_quote_literal(value) -> str:
    """Quote *value* as a SQL string literal, doubling any embedded single quote."""
    return "'" + str(value).replace("'", "''") + "'"


def _rle_tables_in_schema(con, schema_name, via_show_tables=False):
    """Return (schema, table) pairs of one schema, skipping tables prefixed 'tbl_'."""
    if via_show_tables:
        rows = con.execute("SHOW TABLES").fetchall()
    else:
        # Bound parameter instead of string interpolation: the schema name may
        # come straight from the caller.
        rows = con.execute(
            "SELECT table_name FROM information_schema.tables WHERE table_schema = ?",
            [schema_name],
        ).fetchall()
    return [(schema_name, row[0]) for row in (rows or []) if not row[0].startswith('tbl_')]


def _rle_all_user_tables(con):
    """Return every (schema, table) pair outside the pg_*/information_schema schemas."""
    rows = con.execute("""
        SELECT table_schema, table_name
        FROM information_schema.tables
        WHERE table_schema NOT LIKE 'pg_%'
          AND table_schema != 'information_schema'
          AND table_name NOT LIKE 'tbl_%'
    """).fetchall()
    return [(schema, table) for schema, table in (rows or [])]


def _rle_resolve_tables(duckrun_instance, source):
    """Translate the *source* selector of get_rle into a list of (schema, table) pairs."""
    import fnmatch

    con = duckrun_instance.con
    default_schema = getattr(duckrun_instance, 'schema', 'main')

    if source is None:
        # Best effort: a failed catalog lookup is reported and treated as "no tables".
        try:
            return _rle_tables_in_schema(
                con, default_schema, via_show_tables=(default_schema == 'main'))
        except Exception as e:
            print(f"Warning: could not list tables in schema '{default_schema}': {e}")
            return []

    if '.' in source:
        schema_pattern, table_pattern = source.split('.', 1)
        if '*' in schema_pattern or '*' in table_pattern:
            return [
                (schema, table)
                for schema, table in _rle_all_user_tables(con)
                if fnmatch.fnmatch(schema, schema_pattern)
                and fnmatch.fnmatch(table, table_pattern)
            ]
        # Exact schema.table: taken as given, existence is checked when scanned.
        return [(schema_pattern, table_pattern)]

    if '*' in source:
        # Bare wildcard: match table names across all schemas.
        return [
            (schema, table)
            for schema, table in _rle_all_user_tables(con)
            if fnmatch.fnmatch(table, source)
        ]

    # Bare name: a schema if one with that name exists, otherwise a table
    # in the connection's schema.
    try:
        schema_exists = con.execute(
            "SELECT 1 FROM information_schema.schemata WHERE schema_name = ? LIMIT 1",
            [source],
        ).fetchone()
        if schema_exists:
            return _rle_tables_in_schema(con, source)
    except Exception as e:
        print(f"Warning: could not check whether '{source}' is a schema "
              f"(treating it as a table name): {e}")
    return [(default_schema, source)]


def _rle_column_stats(con, table_literal, col_name):
    """Return (rle_runs, ndv) for one column of the Delta table at *table_literal*."""
    col = _rle_quote_ident(col_name)

    # A run starts on the first row and wherever the value differs from the
    # previous row in physical order. IS DISTINCT FROM treats NULL as a
    # comparable value, so a stretch of consecutive NULLs is ONE run while
    # NULL <-> value transitions still start a new run.
    runs = con.sql(f"""
        WITH ordered AS (
            SELECT
                {col} AS cur_value,
                LAG({col}) OVER (ORDER BY filename, file_row_number) AS prev_value,
                ROW_NUMBER() OVER (ORDER BY filename, file_row_number) AS row_pos
            FROM delta_scan({table_literal}, file_row_number=1, filename=1)
        )
        SELECT COUNT(*) AS runs
        FROM ordered
        WHERE row_pos = 1 OR cur_value IS DISTINCT FROM prev_value
    """).fetchone()[0]

    ndv = con.sql(
        f"SELECT COUNT(DISTINCT {col}) FROM delta_scan({table_literal})"
    ).fetchone()[0]
    return runs, ndv


def get_rle(duckrun_instance, source: str = None) -> 'pd.DataFrame':
    """
    Get RLE statistics for tables at the column level.

    For every selected table the Delta files under
    ``{table_base_url}{schema}/{table}`` are scanned with ``delta_scan`` and,
    per column, the number of runs of consecutive identical values is counted
    in physical order (file name, then row number within the file).
    Consecutive NULLs count as a single run. Progress and per-table or
    per-column failures are printed; a failing table or column is skipped
    rather than aborting the whole call.

    Args:
        duckrun_instance: Duckrun connection (from duckrun.connect())
        source: Optional. Can be one of:
            - None: Use all tables in the connection's schema (default)
            - Table name: 'table_name' (uses the connection's schema, or
              'main' when the connection has no ``schema`` attribute)
            - Schema.table: 'schema.table_name' (specific table in schema)
            - Schema only: 'schema' (all tables in schema)
            - Wildcard pattern: '*.summary' (matches tables across all schemas)

    Returns:
        DataFrame with columns (always present, even when empty):
            - schema_name: Schema name
            - table_name: Table name
            - column_name: Column name
            - total_rows: Total number of rows
            - rle_runs: RLE runs for this column in natural order
            - ndv: Number of distinct values
            - total_rle_runs: Sum of RLE runs across all columns (same for all rows of a table)
    """
    import pandas as pd

    con = duckrun_instance.con  # Underlying DuckDB connection

    tables_to_process = _rle_resolve_tables(duckrun_instance, source)
    if not tables_to_process:
        print("No tables found matching the criteria")
        return pd.DataFrame(columns=_RLE_RESULT_COLUMNS)

    print(f"Processing {len(tables_to_process)} table(s)...")

    results = []
    for schema, table in tables_to_process:
        # Escaped once here so the path is safe inside every delta_scan('...') call.
        table_literal = _rle_quote_literal(f"{duckrun_instance.table_base_url}{schema}/{table}")

        print(f"\nCalculating RLE runs for {schema}.{table}...")

        try:
            described = con.sql(
                f"SELECT column_name FROM (DESCRIBE SELECT * FROM delta_scan({table_literal}))"
            ).fetchall()
            column_names = [row[0] for row in described]
            if not column_names:
                continue
            total_rows = con.sql(
                f"SELECT COUNT(*) FROM delta_scan({table_literal})"
            ).fetchone()[0]
        except Exception as e:
            print(f" Error processing table: {e}")
            continue

        table_total_rle = 0
        table_results = []
        for col_name in column_names:
            try:
                runs, ndv = _rle_column_stats(con, table_literal, col_name)
            except Exception as e:
                print(f" Warning: Could not calculate RLE runs for {col_name}: {e}")
                continue

            table_total_rle += runs
            print(f" {col_name}: {runs:,} runs, ndv={ndv:,}")
            table_results.append({
                'schema_name': schema,
                'table_name': table,
                'column_name': col_name,
                'total_rows': total_rows,
                'rle_runs': runs,
                'ndv': ndv,
            })

        # The table-level total is only known after all columns are measured.
        for entry in table_results:
            entry['total_rle_runs'] = table_total_rle
        results.extend(table_results)

        print(f" Total RLE runs for table: {table_total_rle:,}")

    # Passing columns= keeps the schema stable even when nothing could be measured.
    return pd.DataFrame(results, columns=_RLE_RESULT_COLUMNS)
648
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.22.dev0
3
+ Version: 0.2.22.dev2
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -8,7 +8,6 @@ duckrun/ducklake_metadata.py
8
8
  duckrun/files.py
9
9
  duckrun/lakehouse.py
10
10
  duckrun/notebook.py
11
- duckrun/rle.py
12
11
  duckrun/runner.py
13
12
  duckrun/semantic_model.py
14
13
  duckrun/stats.py
@@ -19,11 +18,13 @@ duckrun.egg-info/dependency_links.txt
19
18
  duckrun.egg-info/requires.txt
20
19
  duckrun.egg-info/top_level.txt
21
20
  tests/test_checkpoint_format.py
21
+ tests/test_consecutive_values.py
22
22
  tests/test_deploy_fresh.py
23
23
  tests/test_ducklake_export.py
24
24
  tests/test_filename.py
25
25
  tests/test_register.py
26
26
  tests/test_rle.py
27
27
  tests/test_rle_analysis.py
28
+ tests/test_rle_summary.py
28
29
  tests/test_writer_dictionary.py
29
30
  tests/test_writer_integration.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.22.dev0"
7
+ version = "0.2.22.dev2"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,115 @@
1
+ """
2
+ Test: Analyze Consecutive Values in Delta Table
3
+
4
+ This test connects to tmp/data.lakehouse and analyzes the deltars.summary table
5
+ for consecutive runs in the 'time' column.
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Add parent directory to path to import duckrun
12
+ sys.path.insert(0, str(Path(__file__).parent.parent))
13
+
14
+ import duckrun
15
+ from duckrun.rle import analyze_consecutive_values
16
+
17
+
18
def test_consecutive_values():
    """Analyze consecutive values in deltars.summary and print a report.

    Connects to the lakehouse at ``tmp/data.lakehouse``, asks
    ``analyze_consecutive_values`` for runs of at least 3 consecutive values
    in the ``time`` column, then prints the sequences, summary statistics
    and a per-file breakdown.

    Returns:
        True when the analysis ran (whether or not sequences were found),
        False when any exception was raised; the error and traceback are
        printed in that case.
    """
    rule = "=" * 80
    lakehouse_path = "tmp/data.lakehouse"

    def _basename(path):
        # Keep only the last path component when the value contains a slash.
        if '/' in str(path):
            return path.split('/')[-1]
        return path

    print(rule)
    print("CONSECUTIVE VALUES ANALYSIS TEST")
    print(rule)
    print(f"Lakehouse: {lakehouse_path}")
    print("Table: deltars.summary")
    print("Column: time")

    try:
        # Connect to lakehouse
        print("\nConnecting to lakehouse...")
        con = duckrun.connect(lakehouse_path)

        # Analyze consecutive values in the 'time' column
        print("\n" + rule)
        print("ANALYZING CONSECUTIVE VALUES")
        print(rule)

        df = analyze_consecutive_values(
            duckrun_con=con,
            table_name='summary',
            column_name='time',
            min_consecutive=3,
            schema_name='deltars',
        )

        # Nothing to report: still counts as a successful run
        if df.empty:
            print("\n⚠ No consecutive sequences found")
            return True

        # Sequences
        print("\n" + rule)
        print("RESULTS")
        print(rule)
        print(f"\nFound {len(df)} consecutive sequences")
        print("\nTop 20 longest sequences:")
        print(df.head(20).to_string(index=False))

        # Statistics (all computed before anything is printed)
        print("\n" + rule)
        print("STATISTICS")
        print(rule)
        run_lengths = df['consecutive_count']
        total_in_sequences = run_lengths.sum()
        longest = run_lengths.max()
        shortest = run_lengths.min()
        avg = run_lengths.mean()

        print(f"Total values in sequences: {total_in_sequences:,}")
        print(f"Longest sequence: {longest:,}")
        print(f"Shortest sequence: {shortest:,}")
        print(f"Average sequence length: {avg:.2f}")

        # File distribution
        print("\n" + rule)
        print("FILE DISTRIBUTION")
        print(rule)
        per_file = df.groupby('filename').agg(
            {'consecutive_count': ['count', 'sum', 'max']}
        )
        file_counts = per_file.reset_index()
        file_counts.columns = ['filename', 'num_sequences', 'total_values', 'max_sequence']

        # Extract just filename from path
        file_counts['filename'] = file_counts['filename'].apply(_basename)

        print(f"\nSequences across {len(file_counts)} files:")
        print(file_counts.to_string(index=False))

        print("\n✅ Test completed successfully!")
        return True

    except Exception as e:
        print(f"\n❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False
99
+
100
+
101
if __name__ == "__main__":
    # Script entry point: run the analysis, print a verdict banner and turn
    # the boolean outcome into the process exit code (0 = pass, 1 = fail).
    divider = "=" * 80

    print("\n" + divider)
    print("STARTING CONSECUTIVE VALUES TEST")
    print(divider)

    success = test_consecutive_values()

    print("\n" + divider)
    verdict = "✅ TEST PASSED" if success else "❌ TEST FAILED"
    print(verdict)
    print(divider)

    sys.exit(0 if success else 1)
@@ -0,0 +1,22 @@
1
+ """
2
+ Test: Get RLE statistics for deltars.summary table
3
+ """
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ # Add parent directory to path to import duckrun
9
+ sys.path.insert(0, str(Path(__file__).parent.parent))
10
+
11
+ import duckrun
12
+
13
# Open the sample lakehouse
con = duckrun.connect("tmp/data.lakehouse")

# RLE stats for every table named 'summary', whatever its schema
rle_df = con.get_rle('*.summary')

# Print the full result table between two rules
separator = "=" * 80
print("\n" + separator)
print("RESULTS")
print(separator)
print(rle_df.to_string(index=False))
@@ -1,362 +0,0 @@
1
- from typing import List, Dict, Tuple, Optional
2
- import pandas as pd
3
-
4
-
5
- def get_table_stats(duckrun_con, table_name: str,
6
- top_n_values: int = 10) -> pd.DataFrame:
7
- """
8
- Get comprehensive table statistics including NDV and value frequency analysis.
9
-
10
- The theory: If a value appears frequently (high repetition), it may provide better RLE compression
11
- even if the column has higher NDV. This function helps identify such patterns.
12
-
13
- Args:
14
- duckrun_con: Duckrun connection (from duckrun.connect())
15
- table_name: Name of the table to analyze
16
- top_n_values: Number of top frequent values to show per column (default: 10)
17
-
18
- Returns:
19
- DataFrame with columns:
20
- - column_name: Name of the column
21
- - data_type: Data type of the column
22
- - total_rows: Total number of rows
23
- - null_count: Number of NULL values
24
- - null_pct: Percentage of NULL values
25
- - ndv: Number of distinct values (exact)
26
- - cardinality_ratio: NDV / total_rows (lower = better for RLE)
27
- - top_value: Most frequent value
28
- - top_value_count: Count of most frequent value
29
- - top_value_pct: Percentage of most frequent value
30
- - top_n_coverage: Percentage covered by top N values
31
- - repetition_score: Custom score indicating RLE potential (higher = better)
32
- """
33
- con = duckrun_con.con # Get underlying DuckDB connection
34
- from_clause = table_name
35
-
36
- # Get column names and types
37
- schema_info = con.sql(f"""
38
- SELECT column_name, column_type
39
- FROM (DESCRIBE SELECT * FROM {from_clause})
40
- """).df()
41
-
42
- if schema_info.empty:
43
- return pd.DataFrame()
44
-
45
- # Get total row count once
46
- total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
47
- print(f"Analyzing {len(schema_info)} columns across {total_rows:,} rows...")
48
-
49
- results = []
50
-
51
- for idx, row in schema_info.iterrows():
52
- col_name = row['column_name']
53
- col_type = row['column_type']
54
-
55
- print(f" [{idx+1}/{len(schema_info)}] Analyzing column: {col_name}")
56
-
57
- # Get basic stats in one query
58
- stats_query = f"""
59
- SELECT
60
- COUNT(*) as total,
61
- COUNT({col_name}) as non_null,
62
- COUNT(DISTINCT {col_name}) as ndv
63
- FROM {from_clause}
64
- """
65
-
66
- stats = con.sql(stats_query).fetchone()
67
- total = stats[0]
68
- non_null = stats[1]
69
- ndv = stats[2]
70
- null_count = total - non_null
71
- null_pct = (null_count / total * 100) if total > 0 else 0
72
- cardinality_ratio = (ndv / total) if total > 0 else 0
73
-
74
- # Get top N values with their frequencies
75
- top_values_query = f"""
76
- SELECT
77
- {col_name} as value,
78
- COUNT(*) as count,
79
- COUNT(*) * 100.0 / {total} as percentage
80
- FROM {from_clause}
81
- WHERE {col_name} IS NOT NULL
82
- GROUP BY {col_name}
83
- ORDER BY count DESC
84
- LIMIT {top_n_values}
85
- """
86
-
87
- top_values = con.sql(top_values_query).df()
88
-
89
- # Extract top value info
90
- if not top_values.empty:
91
- top_value = top_values.iloc[0]['value']
92
- top_value_count = top_values.iloc[0]['count']
93
- top_value_pct = top_values.iloc[0]['percentage']
94
- top_n_coverage = top_values['percentage'].sum()
95
- else:
96
- top_value = None
97
- top_value_count = 0
98
- top_value_pct = 0
99
- top_n_coverage = 0
100
-
101
- # Calculate repetition score: higher means better for RLE
102
- # Score considers:
103
- # 1. How much the top value covers (higher = better)
104
- # 2. How much top N values cover (higher = better)
105
- # 3. Inverse of cardinality ratio (lower cardinality = better)
106
- repetition_score = (top_value_pct * 2 + top_n_coverage) / 3 / (cardinality_ratio + 0.01)
107
-
108
- results.append({
109
- 'column_name': col_name,
110
- 'data_type': col_type,
111
- 'total_rows': total_rows,
112
- 'null_count': null_count,
113
- 'null_pct': round(null_pct, 2),
114
- 'ndv': ndv,
115
- 'cardinality_ratio': round(cardinality_ratio, 4),
116
- 'top_value': top_value,
117
- 'top_value_count': top_value_count,
118
- 'top_value_pct': round(top_value_pct, 2),
119
- 'top_n_coverage': round(top_n_coverage, 2),
120
- 'repetition_score': round(repetition_score, 2)
121
- })
122
-
123
- df = pd.DataFrame(results)
124
-
125
- # Sort by repetition score (best RLE candidates first)
126
- df = df.sort_values('repetition_score', ascending=False).reset_index(drop=True)
127
-
128
- print(f"\n✓ Analysis complete!")
129
- print(f"\nTop columns by repetition score (best RLE candidates):")
130
- for idx, row in df.head(5).iterrows():
131
- print(f" {idx+1}. {row['column_name']}: score={row['repetition_score']}, "
132
- f"top_value_pct={row['top_value_pct']}%, ndv={row['ndv']:,}")
133
-
134
- return df
135
-
136
-
137
- def get_value_frequency_details(duckrun_con, table_name: str, column_name: str,
138
- limit: int = 20) -> pd.DataFrame:
139
- """
140
- Get detailed value frequency distribution for a specific column.
141
-
142
- Shows the most frequent values and their counts/percentages.
143
- Useful for understanding repetition patterns that drive RLE compression.
144
-
145
- Args:
146
- duckrun_con: Duckrun connection (from duckrun.connect())
147
- table_name: Name of the table to analyze
148
- column_name: Name of the column to analyze
149
- limit: Maximum number of values to return (default: 20)
150
-
151
- Returns:
152
- DataFrame with columns:
153
- - value: The distinct value
154
- - count: Number of occurrences
155
- - percentage: Percentage of total rows
156
- - cumulative_pct: Cumulative percentage
157
- """
158
- con = duckrun_con.con # Get underlying DuckDB connection
159
- from_clause = table_name
160
-
161
- # Get total row count
162
- total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
163
-
164
- # Get value frequencies
165
- query = f"""
166
- WITH value_counts AS (
167
- SELECT
168
- {column_name} as value,
169
- COUNT(*) as count,
170
- COUNT(*) * 100.0 / {total_rows} as percentage
171
- FROM {from_clause}
172
- WHERE {column_name} IS NOT NULL
173
- GROUP BY {column_name}
174
- ORDER BY count DESC
175
- LIMIT {limit}
176
- )
177
- SELECT
178
- value,
179
- count,
180
- percentage,
181
- SUM(percentage) OVER (ORDER BY count DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as cumulative_pct
182
- FROM value_counts
183
- ORDER BY count DESC
184
- """
185
-
186
- df = con.sql(query).df()
187
-
188
- # Round percentages
189
- if not df.empty:
190
- df['percentage'] = df['percentage'].round(2)
191
- df['cumulative_pct'] = df['cumulative_pct'].round(2)
192
-
193
- return df
194
-
195
-
196
- def find_optimal_sort_order(duckrun_con, table_name: str,
197
- max_combinations: int = 10) -> pd.DataFrame:
198
- """
199
- Determine optimal sort order using V-Order-like logic: pure compression testing.
200
-
201
- This mimics how VertiPaq/V-Order actually works:
202
- 1. Calculate cardinality for each column
203
- 2. Test different sort orderings
204
- 3. Measure actual RLE run counts for each ordering
205
- 4. Pick the ordering with best overall compression (fewest total runs)
206
-
207
- NO semantic understanding, NO query pattern assumptions.
208
- Pure mechanical testing of compression effectiveness.
209
-
210
- Args:
211
- duckrun_con: Duckrun connection (from duckrun.connect())
212
- table_name: Name of the table to analyze
213
- max_combinations: Maximum sort orderings to test (default: 10)
214
-
215
- Returns:
216
- DataFrame with tested orderings ranked by compression effectiveness
217
- """
218
- from itertools import permutations
219
-
220
- con = duckrun_con.con # Get underlying DuckDB connection
221
- from_clause = table_name
222
-
223
- # Get column names and cardinalities
224
- print("Step 1: Analyzing column cardinalities...")
225
- schema_info = con.sql(f"""
226
- SELECT column_name, column_type
227
- FROM (DESCRIBE SELECT * FROM {from_clause})
228
- """).df()
229
-
230
- total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
231
-
232
- # Calculate NDV for each column
233
- cardinality_map = {}
234
- for _, row in schema_info.iterrows():
235
- col = row['column_name']
236
- ndv = con.sql(f"SELECT COUNT(DISTINCT {col}) FROM {from_clause}").fetchone()[0]
237
- cardinality_ratio = ndv / total_rows
238
- cardinality_map[col] = {'ndv': ndv, 'ratio': cardinality_ratio}
239
- print(f" {col}: {ndv:,} distinct ({cardinality_ratio*100:.4f}%)")
240
-
241
- # Filter to low-cardinality columns only (< 1% cardinality)
242
- # High cardinality columns won't benefit from reordering
243
- low_card_cols = [col for col, stats in cardinality_map.items()
244
- if stats['ratio'] < 0.01]
245
-
246
- print(f"\nStep 2: Testing sort orderings for {len(low_card_cols)} low-cardinality columns...")
247
- print(f"Columns to test: {', '.join(low_card_cols)}")
248
-
249
- if len(low_card_cols) < 2:
250
- print("Not enough columns to test different orderings!")
251
- return pd.DataFrame()
252
-
253
- # Generate candidate orderings
254
- # Start with cardinality-based orderings
255
- sorted_by_card = sorted(low_card_cols, key=lambda c: cardinality_map[c]['ndv'])
256
-
257
- test_orderings = [
258
- sorted_by_card, # Lowest cardinality first
259
- sorted_by_card[::-1], # Highest cardinality first
260
- ]
261
-
262
- # Add some permutations of top 3 columns
263
- if len(low_card_cols) >= 3:
264
- for perm in permutations(sorted_by_card[:3]):
265
- if list(perm) not in test_orderings:
266
- test_orderings.append(list(perm))
267
- if len(test_orderings) >= max_combinations:
268
- break
269
-
270
- # Test each ordering by calculating actual RLE runs
271
- print(f"\nStep 3: Testing {len(test_orderings)} different orderings...")
272
- results = []
273
-
274
- for idx, ordering in enumerate(test_orderings, 1):
275
- print(f"\n[{idx}/{len(test_orderings)}] Testing: {' → '.join(ordering)}")
276
-
277
- # Calculate RLE runs for each column with this ordering
278
- # We'll sort the data by the ordering and count runs
279
- order_clause = ', '.join(ordering)
280
-
281
- column_rle = {}
282
- for col in schema_info['column_name']:
283
- # Count runs: a new run starts when value changes
284
- rle_query = f"""
285
- WITH sorted_data AS (
286
- SELECT
287
- {col},
288
- ROW_NUMBER() OVER (ORDER BY {order_clause}) as rn
289
- FROM {from_clause}
290
- ),
291
- with_prev AS (
292
- SELECT
293
- {col},
294
- LAG({col}) OVER (ORDER BY rn) as prev_val
295
- FROM sorted_data
296
- )
297
- SELECT COUNT(*) as runs
298
- FROM with_prev
299
- WHERE prev_val IS NULL OR {col} != prev_val OR {col} IS NULL OR prev_val IS NULL
300
- """
301
-
302
- runs = con.sql(rle_query).fetchone()[0]
303
- column_rle[col] = runs
304
- print(f" {col}: {runs:,} runs")
305
-
306
- total_runs = sum(column_rle.values())
307
- print(f" TOTAL: {total_runs:,} runs")
308
-
309
- results.append({
310
- 'sort_order': ' → '.join(ordering),
311
- 'total_runs': total_runs,
312
- 'compression_score': total_rows / total_runs, # Higher = better compression
313
- **column_rle
314
- })
315
-
316
- # Create results DataFrame
317
- df = pd.DataFrame(results)
318
- df = df.sort_values('total_runs').reset_index(drop=True)
319
-
320
- print("\n" + "=" * 80)
321
- print("RESULTS: Best to Worst Compression")
322
- print("=" * 80)
323
-
324
- for idx, row in df.iterrows():
325
- print(f"\n{idx + 1}. {row['sort_order']}")
326
- print(f" Total runs: {row['total_runs']:,}")
327
- print(f" Compression score: {row['compression_score']:.2f}x")
328
- if idx == 0:
329
- print(" ⭐ BEST COMPRESSION")
330
-
331
- print("\n" + "=" * 80)
332
- print("CONCLUSION")
333
- print("=" * 80)
334
- best = df.iloc[0]
335
- print(f"\nOptimal sort order: {best['sort_order']}")
336
- print(f"This ordering achieves the fewest total RLE runs ({best['total_runs']:,})")
337
- print(f"\nThis is how V-Order actually works:")
338
- print("✓ No query pattern assumptions")
339
- print("✓ No semantic understanding")
340
- print("✓ Pure compression effectiveness testing")
341
- print("✓ Mechanical optimization based on data patterns")
342
-
343
- return df
344
-
345
-
346
- # Example usage:
347
- #
348
- # import duckrun
349
- #
350
- # con = duckrun.connect('workspace/lakehouse.lakehouse')
351
- #
352
- # # Get RLE statistics:
353
- # stats_df = con.get_rle_stats('my_table', top_n_values=10)
354
- # print(stats_df)
355
- #
356
- # # Detailed frequency distribution for a specific column:
357
- # freq_df = con.get_value_frequency('my_table', 'status_column', limit=20)
358
- # print(freq_df)
359
- #
360
- # # Find optimal sort order (V-Order simulation):
361
- # optimal_df = con.find_optimal_sort_order('my_table', max_combinations=10)
362
- # print(optimal_df)
File without changes
File without changes
File without changes