duckrun 0.2.21.dev2__tar.gz → 0.2.22.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/PKG-INFO +1 -1
  2. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/__init__.py +2 -3
  3. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/core.py +126 -1
  4. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/stats.py +192 -0
  5. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/PKG-INFO +1 -1
  6. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/SOURCES.txt +3 -1
  7. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/pyproject.toml +1 -1
  8. duckrun-0.2.22.dev2/tests/test_consecutive_values.py +115 -0
  9. duckrun-0.2.22.dev2/tests/test_rle_analysis.py +149 -0
  10. duckrun-0.2.22.dev2/tests/test_rle_summary.py +22 -0
  11. duckrun-0.2.21.dev2/duckrun/rle.py +0 -940
  12. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/LICENSE +0 -0
  13. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/README.md +0 -0
  14. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/auth.py +0 -0
  15. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/ducklake_metadata.py +0 -0
  16. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/files.py +0 -0
  17. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/lakehouse.py +0 -0
  18. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/notebook.py +0 -0
  19. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/runner.py +0 -0
  20. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/semantic_model.py +0 -0
  21. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/writer.py +0 -0
  22. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
  23. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/requires.txt +0 -0
  24. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/top_level.txt +0 -0
  25. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/setup.cfg +0 -0
  26. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_checkpoint_format.py +0 -0
  27. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_deploy_fresh.py +0 -0
  28. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_ducklake_export.py +0 -0
  29. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_filename.py +0 -0
  30. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_register.py +0 -0
  31. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_rle.py +0 -0
  32. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_writer_dictionary.py +0 -0
  33. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_writer_integration.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.21.dev2
3
+ Version: 0.2.22.dev2
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -2,11 +2,10 @@
2
2
 
3
3
  from duckrun.core import Duckrun
4
4
  from duckrun.notebook import import_notebook_from_web, import_notebook
5
- from duckrun import rle
6
5
 
7
- __version__ = "0.2.18"
6
+ __version__ = "0.2.22.dev2"
8
7
 
9
8
  # Expose unified connect method at module level
10
9
  connect = Duckrun.connect
11
10
 
12
- __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
11
+ __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
@@ -7,7 +7,7 @@ import time
7
7
  from typing import List, Tuple, Union, Optional, Callable, Dict, Any
8
8
  from string import Template
9
9
  from datetime import datetime
10
- from .stats import get_stats as _get_stats
10
+ from .stats import get_stats as _get_stats, get_rle as _get_rle
11
11
  from .runner import run as _run
12
12
  from .files import copy as _copy, download as _download
13
13
  from .writer import QueryResult
@@ -1050,6 +1050,98 @@ class Duckrun(WorkspaceOperationsMixin):
1050
1050
  """
1051
1051
  self.con.register(name, df)
1052
1052
 
1053
def get_rle_stats(self, table_name: str, top_n_values: int = 10):
    """
    Get comprehensive table statistics including NDV and value frequency analysis.

    Thin wrapper that forwards to ``get_table_stats`` in the ``duckrun.rle``
    module, passing this connection as the first argument.

    Args:
        table_name: Name of the table to analyze
        top_n_values: Number of top frequent values to show per column (default: 10)

    Returns:
        Whatever ``duckrun.rle.get_table_stats`` returns. According to the
        original documentation of this method, that is a DataFrame with one
        row per column containing:
            - column_name: Name of the column
            - data_type: Data type
            - total_rows: Total number of rows
            - null_count, null_pct: NULL statistics
            - ndv: Number of distinct values (exact)
            - cardinality_ratio: NDV / total_rows (lower = better for RLE)
            - top_value, top_value_count, top_value_pct: Most frequent value stats
            - top_n_coverage: Percentage covered by top N values
            - repetition_score: RLE potential score (higher = better)

    Raises:
        ModuleNotFoundError: If ``duckrun.rle`` cannot be imported (for
            example because the installed release does not ship it).

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Analyze a table
        stats = con.get_rle_stats('sales')
        print(stats)

        # Show top 20 values per column
        stats = con.get_rle_stats('sales', top_n_values=20)
    """
    try:
        from .rle import get_table_stats as _get_rle_stats
    except ImportError as err:
        # Fail with an actionable message instead of a bare import error
        # surfacing from deep inside a method call.
        raise ModuleNotFoundError(
            "get_rle_stats() needs the duckrun.rle module, which could not be "
            "imported from this duckrun installation; get_rle() provides "
            "run-length statistics without it.",
            name="duckrun.rle",
        ) from err
    return _get_rle_stats(self, table_name, top_n_values)
1087
+
1088
def get_value_frequency(self, table_name: str, column_name: str, limit: int = 20):
    """
    Get detailed value frequency distribution for a specific column.

    Thin wrapper that forwards to ``get_value_frequency_details`` in the
    ``duckrun.rle`` module, passing this connection as the first argument.

    Args:
        table_name: Name of the table
        column_name: Name of the column to analyze
        limit: Maximum number of values to return (default: 20)

    Returns:
        Whatever ``duckrun.rle.get_value_frequency_details`` returns.
        According to the original documentation of this method, that is a
        DataFrame with value frequencies:
            - value: The distinct value
            - count: Number of occurrences
            - percentage: Percentage of total rows
            - cumulative_pct: Cumulative percentage

    Raises:
        ModuleNotFoundError: If ``duckrun.rle`` cannot be imported (for
            example because the installed release does not ship it).

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Get top 20 values for a column
        freq = con.get_value_frequency('sales', 'status')
        print(freq)
    """
    try:
        from .rle import get_value_frequency_details as _get_value_frequency
    except ImportError as err:
        # Fail with an actionable message instead of a bare import error
        # surfacing from deep inside a method call.
        raise ModuleNotFoundError(
            "get_value_frequency() needs the duckrun.rle module, which could "
            "not be imported from this duckrun installation.",
            name="duckrun.rle",
        ) from err
    return _get_value_frequency(self, table_name, column_name, limit)
1113
+
1114
def find_optimal_sort_order(self, table_name: str, max_combinations: int = 10):
    """
    Find optimal column sort order for compression using V-Order-like testing.

    Thin wrapper that forwards to ``find_optimal_sort_order`` in the
    ``duckrun.rle`` module, passing this connection as the first argument.
    Per the original documentation, the helper tests different column
    orderings and measures RLE compression effectiveness.

    Args:
        table_name: Name of the table to analyze
        max_combinations: Maximum sort orderings to test (default: 10)

    Returns:
        Whatever ``duckrun.rle.find_optimal_sort_order`` returns. According
        to the original documentation of this method, that is a DataFrame
        with tested orderings ranked by compression:
            - sort_order: Column ordering (e.g., "date → DUID → time")
            - total_runs: Total RLE runs (fewer = better compression)
            - compression_score: Compression effectiveness (higher = better)
            - Individual RLE counts per column

    Raises:
        ModuleNotFoundError: If ``duckrun.rle`` cannot be imported (for
            example because the installed release does not ship it).

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Find optimal sort order
        optimal = con.find_optimal_sort_order('energy_data')
        print(optimal)

        # Test more combinations
        optimal = con.find_optimal_sort_order('energy_data', max_combinations=20)
    """
    try:
        from .rle import find_optimal_sort_order as _find_optimal_sort_order
    except ImportError as err:
        # Fail with an actionable message instead of a bare import error
        # surfacing from deep inside a method call.
        raise ModuleNotFoundError(
            "find_optimal_sort_order() needs the duckrun.rle module, which "
            "could not be imported from this duckrun installation.",
            name="duckrun.rle",
        ) from err
    return _find_optimal_sort_order(self, table_name, max_combinations)
1144
+
1053
1145
  def get_stats(self, source: str = None, detailed = False):
1054
1146
  """
1055
1147
  Get comprehensive statistics for Delta Lake tables.
@@ -1089,6 +1181,39 @@ class Duckrun(WorkspaceOperationsMixin):
1089
1181
  """
1090
1182
  return _get_stats(self, source, detailed)
1091
1183
 
1184
def get_rle(self, source: str = None):
    """
    Report RLE (Run-Length Encoding) statistics for Delta Lake tables.

    Delegates to the ``get_rle`` function imported from the stats module,
    handing it this connection together with ``source``.

    Args:
        source: Optional selector for the tables to analyze:
            - None: every table in the connection's schema (default)
            - 'table_name': a single table in the current schema
            - 'schema.table_name': a specific table in a given schema
            - 'schema': every table in that schema
            - Wildcards such as '*.summary' or 'schema.*'

    Returns:
        DataFrame with one row per analyzed column: schema_name, table_name,
        column_name, total_rows, rle_runs, ndv, total_rle_runs.

    Examples:
        con = duckrun.connect("tmp/data.lakehouse/aemo")

        # All tables in current schema
        rle = con.get_rle()

        # Single table in current schema
        rle = con.get_rle('price')

        # Specific table in different schema
        rle = con.get_rle('deltars.summary')

        # All tables matching pattern
        rle = con.get_rle('*.summary')
    """
    rle_frame = _get_rle(self, source)
    return rle_frame
1216
+
1092
1217
  def list_lakehouses(self) -> List[str]:
1093
1218
  """
1094
1219
  List all lakehouses in the current workspace.
@@ -454,3 +454,195 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
454
454
  return final_result
455
455
 
456
456
 
457
# Column layout shared by every DataFrame that get_rle() returns.
_RLE_COLUMNS = ['schema_name', 'table_name', 'column_name', 'total_rows',
                'rle_runs', 'ndv', 'total_rle_runs']


def _rle_quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _rle_tables_in_schema(con, schema):
    """List (schema, table) pairs for one schema, skipping tables prefixed with 'tbl_'."""
    rows = con.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = ?",
        [schema],
    ).fetchall()
    return [(schema, row[0]) for row in rows if not row[0].startswith('tbl_')]


def _rle_all_user_tables(con):
    """List (schema, table) pairs across schemas, skipping pg_*/information_schema and 'tbl_' tables."""
    rows = con.execute(
        "SELECT table_schema, table_name FROM information_schema.tables"
    ).fetchall()
    # Filter in Python: in SQL LIKE the underscore is a single-character
    # wildcard, so a pattern such as 'tbl_%' would also drop e.g. 'tblx'.
    return [
        (schema, table)
        for schema, table in rows
        if not schema.startswith('pg_')
        and schema != 'information_schema'
        and not table.startswith('tbl_')
    ]


def _rle_resolve_tables(duckrun_instance, con, source):
    """Translate the ``source`` selector into a list of (schema, table) pairs."""
    import fnmatch

    default_schema = getattr(duckrun_instance, 'schema', 'main')

    if source is None:
        # Every table in the connection's schema; listing is best effort.
        try:
            if default_schema == 'main':
                rows = con.execute("SHOW TABLES").fetchall()
                return [(default_schema, row[0]) for row in rows
                        if not row[0].startswith('tbl_')]
            return _rle_tables_in_schema(con, default_schema)
        except Exception as exc:
            print(f"Warning: could not list tables in schema '{default_schema}': {exc}")
            return []

    if '.' in source:
        schema_pattern, table_pattern = source.split('.', 1)
        if '*' in schema_pattern or '*' in table_pattern:
            return [
                (schema, table)
                for schema, table in _rle_all_user_tables(con)
                if fnmatch.fnmatch(schema, schema_pattern)
                and fnmatch.fnmatch(table, table_pattern)
            ]
        # Exact schema.table reference: taken as given, not validated here.
        return [(schema_pattern, table_pattern)]

    if '*' in source:
        # Bare wildcard: match table names across all schemas.
        return [
            (schema, table)
            for schema, table in _rle_all_user_tables(con)
            if fnmatch.fnmatch(table, source)
        ]

    # A bare name is tried as a schema first, then treated as a table name.
    try:
        schema_exists = con.execute(
            "SELECT 1 FROM information_schema.schemata WHERE schema_name = ? LIMIT 1",
            [source],
        ).fetchone()
        if schema_exists:
            return _rle_tables_in_schema(con, source)
    except Exception:
        # The catalog lookup is only a hint; fall back to the table-name reading.
        pass
    return [(default_schema, source)]


def _rle_table_rows(con, table_path, schema, table):
    """Compute per-column RLE runs and NDV for one Delta table; returns a list of row dicts."""
    # Keep the path a valid SQL string literal even if it contains a quote.
    scan_path = str(table_path).replace("'", "''")

    schema_info = con.sql(f"""
        SELECT column_name
        FROM (DESCRIBE SELECT * FROM delta_scan('{scan_path}'))
    """).df()
    if schema_info.empty:
        return []

    total_rows = con.sql(f"SELECT COUNT(*) FROM delta_scan('{scan_path}')").fetchone()[0]

    table_total_rle = 0
    table_results = []
    for col_name in schema_info['column_name']:
        column = _rle_quote_identifier(col_name)

        # A run starts on the first row and wherever the value differs from
        # the previous row in physical (file, row) order. IS DISTINCT FROM
        # treats NULL as an ordinary value, so consecutive NULLs form one run.
        rle_query = f"""
            WITH numbered AS (
                SELECT
                    {column} AS cur_value,
                    LAG({column}) OVER (ORDER BY filename, file_row_number) AS prev_value,
                    ROW_NUMBER() OVER (ORDER BY filename, file_row_number) AS row_pos
                FROM delta_scan('{scan_path}', file_row_number=1, filename=1)
            )
            SELECT COUNT(*) AS runs
            FROM numbered
            WHERE row_pos = 1 OR cur_value IS DISTINCT FROM prev_value
        """
        ndv_query = f"SELECT COUNT(DISTINCT {column}) FROM delta_scan('{scan_path}')"

        try:
            runs = con.sql(rle_query).fetchone()[0]
            ndv = con.sql(ndv_query).fetchone()[0]
        except Exception as e:
            # One unreadable column must not discard the rest of the table.
            print(f" Warning: Could not calculate RLE runs for {col_name}: {e}")
            continue

        table_total_rle += runs
        print(f" {col_name}: {runs:,} runs, ndv={ndv:,}")
        table_results.append({
            'schema_name': schema,
            'table_name': table,
            'column_name': col_name,
            'total_rows': total_rows,
            'rle_runs': runs,
            'ndv': ndv,
        })

    # The table-wide total is repeated on every row of that table.
    for row in table_results:
        row['total_rle_runs'] = table_total_rle

    print(f" Total RLE runs for table: {table_total_rle:,}")
    return table_results


def get_rle(duckrun_instance, source: str = None) -> 'pd.DataFrame':
    """
    Get RLE statistics for tables at the column level.

    For each selected table the Delta files are scanned in physical order
    (filename, then row number within the file) and, per column, the number
    of runs of consecutive equal values is counted together with the number
    of distinct values. Progress and per-column results are printed.

    Args:
        duckrun_instance: Duckrun connection (from duckrun.connect()). Its
            ``con`` and ``table_base_url`` attributes are used; ``schema`` is
            used when present, otherwise 'main'.
        source: Optional. Can be one of:
            - None: Use all tables in the connection's schema (default)
            - Table name: 'table_name' (uses the connection's schema)
            - Schema.table: 'schema.table_name' (specific table in schema)
            - Schema only: 'schema' (all tables in schema); a bare name is
              checked against the known schemas before being read as a table
            - Wildcard pattern: '*.summary' or '*' patterns without a dot
              (matched against table names across all schemas)

    Returns:
        DataFrame with columns:
            - schema_name: Schema name
            - table_name: Table name
            - column_name: Column name
            - total_rows: Total number of rows
            - rle_runs: RLE runs for this column in natural order
            - ndv: Number of distinct values
            - total_rle_runs: Sum of RLE runs across all columns (same for all rows of a table)
        The frame is empty (but keeps these columns) when no table matches
        or no column could be analyzed.
    """
    import pandas as pd

    con = duckrun_instance.con  # underlying DuckDB connection
    tables_to_process = _rle_resolve_tables(duckrun_instance, con, source)

    if not tables_to_process:
        print("No tables found matching the criteria")
        return pd.DataFrame(columns=_RLE_COLUMNS)

    print(f"Processing {len(tables_to_process)} table(s)...")

    results = []
    for schema, table in tables_to_process:
        table_path = f"{duckrun_instance.table_base_url}{schema}/{table}"
        print(f"\nCalculating RLE runs for {schema}.{table}...")
        try:
            results.extend(_rle_table_rows(con, table_path, schema, table))
        except Exception as e:
            # Keep going: one broken table should not abort the whole report.
            print(f" Error processing table: {e}")

    return pd.DataFrame(results, columns=_RLE_COLUMNS)
648
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.21.dev2
3
+ Version: 0.2.22.dev2
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -8,7 +8,6 @@ duckrun/ducklake_metadata.py
8
8
  duckrun/files.py
9
9
  duckrun/lakehouse.py
10
10
  duckrun/notebook.py
11
- duckrun/rle.py
12
11
  duckrun/runner.py
13
12
  duckrun/semantic_model.py
14
13
  duckrun/stats.py
@@ -19,10 +18,13 @@ duckrun.egg-info/dependency_links.txt
19
18
  duckrun.egg-info/requires.txt
20
19
  duckrun.egg-info/top_level.txt
21
20
  tests/test_checkpoint_format.py
21
+ tests/test_consecutive_values.py
22
22
  tests/test_deploy_fresh.py
23
23
  tests/test_ducklake_export.py
24
24
  tests/test_filename.py
25
25
  tests/test_register.py
26
26
  tests/test_rle.py
27
+ tests/test_rle_analysis.py
28
+ tests/test_rle_summary.py
27
29
  tests/test_writer_dictionary.py
28
30
  tests/test_writer_integration.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.21.dev2"
7
+ version = "0.2.22.dev2"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,115 @@
1
+ """
2
+ Test: Analyze Consecutive Values in Delta Table
3
+
4
+ This test connects to tmp/data.lakehouse and analyzes the deltars.summary table
5
+ for consecutive runs in the 'time' column.
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Add parent directory to path to import duckrun
12
+ sys.path.insert(0, str(Path(__file__).parent.parent))
13
+
14
+ import duckrun
15
+ from duckrun.rle import analyze_consecutive_values
16
+
17
+
18
def test_consecutive_values():
    """Analyze runs of consecutive 'time' values in deltars.summary and print a report.

    Returns True when the analysis ran (with or without findings) and False
    when any exception was raised along the way.
    """
    rule = "=" * 80
    lakehouse_path = "tmp/data.lakehouse"

    def section(title):
        # Blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("CONSECUTIVE VALUES ANALYSIS TEST")
    print(rule)
    print(f"Lakehouse: {lakehouse_path}")
    print("Table: deltars.summary")
    print("Column: time")

    try:
        print("\nConnecting to lakehouse...")
        con = duckrun.connect(lakehouse_path)

        section("ANALYZING CONSECUTIVE VALUES")
        df = analyze_consecutive_values(
            duckrun_con=con,
            table_name='summary',
            column_name='time',
            min_consecutive=3,
            schema_name='deltars',
        )

        # Nothing to report is still a successful run.
        if df.empty:
            print("\n⚠ No consecutive sequences found")
            return True

        section("RESULTS")
        print(f"\nFound {len(df)} consecutive sequences")
        print("\nTop 20 longest sequences:")
        print(df.head(20).to_string(index=False))

        section("STATISTICS")
        run_lengths = df['consecutive_count']
        total_in_sequences = run_lengths.sum()
        longest = run_lengths.max()
        shortest = run_lengths.min()
        avg = run_lengths.mean()

        print(f"Total values in sequences: {total_in_sequences:,}")
        print(f"Longest sequence: {longest:,}")
        print(f"Shortest sequence: {shortest:,}")
        print(f"Average sequence length: {avg:.2f}")

        section("FILE DISTRIBUTION")
        per_file = df.groupby('filename').agg({
            'consecutive_count': ['count', 'sum', 'max']
        })
        file_counts = per_file.reset_index()
        file_counts.columns = ['filename', 'num_sequences', 'total_values', 'max_sequence']

        def short_name(path):
            # Keep only the last path component when the value contains a slash.
            if '/' in str(path):
                return path.split('/')[-1]
            return path

        file_counts['filename'] = file_counts['filename'].apply(short_name)

        print(f"\nSequences across {len(file_counts)} files:")
        print(file_counts.to_string(index=False))

        print("\n✅ Test completed successfully!")
        return True

    except Exception as e:
        print(f"\n❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False
99
+
100
+
101
if __name__ == "__main__":
    # Script entry point: run the test and turn its outcome into an exit code.
    divider = "=" * 80

    print("\n" + divider)
    print("STARTING CONSECUTIVE VALUES TEST")
    print(divider)

    passed = test_consecutive_values()

    print("\n" + divider)
    print("✅ TEST PASSED" if passed else "❌ TEST FAILED")
    print(divider)

    # Exit status 0 on success, 1 on failure.
    sys.exit(int(not passed))
@@ -0,0 +1,149 @@
1
+ """
2
+ Test: RLE Analysis with Real Parquet Data
3
+
4
+ This test demonstrates the refactored RLE module using real parquet data.
5
+ It analyzes table statistics, NDV, and value frequency patterns.
6
+ """
7
+
8
+ import duckdb
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # Add parent directory to path to import duckrun
13
+ sys.path.insert(0, str(Path(__file__).parent.parent))
14
+
15
+ from duckrun.rle import get_table_stats, get_value_frequency_details
16
+
17
+
18
def test_rle_with_real_parquet():
    """Run the RLE analysis helpers against the sample parquet file next to this test.

    Returns False when the parquet file is missing or any step raises, and
    True once the full report has been printed.
    """
    rule = "=" * 80

    def section(title):
        # Blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    # Sample data file expected to sit in the same directory as this test.
    parquet_path = Path(__file__).parent / "part-00000-19052469-6a9d-4faa-86ac-60efce3e4443-c000.snappy.parquet"

    if not parquet_path.exists():
        print(f"❌ Error: Parquet file not found at {parquet_path}")
        return False

    print(rule)
    print("RLE ANALYSIS TEST: Real Parquet Data")
    print(rule)
    print(f"File: {parquet_path.name}")
    print(f"Size: {parquet_path.stat().st_size:,} bytes")

    con = duckdb.connect(':memory:')

    try:
        section("SCHEMA INSPECTION")
        schema_df = con.sql(f"""
            SELECT * FROM parquet_schema('{parquet_path}')
        """).df()

        print(f"\nColumns found: {len(schema_df)}")
        print(schema_df.to_string(index=False))

        row_count = con.sql(f"""
            SELECT COUNT(*) FROM read_parquet('{parquet_path}')
        """).fetchone()[0]

        print(f"\nTotal rows: {row_count:,}")

        section("COMPREHENSIVE RLE ANALYSIS")
        stats_df = get_table_stats(con, str(parquet_path), is_parquet=True, top_n_values=10)

        section("RESULTS: Columns Ranked by RLE Potential")
        ranking_columns = ['column_name', 'data_type', 'ndv', 'cardinality_ratio',
                           'top_value_pct', 'top_n_coverage', 'repetition_score']
        print("\n" + stats_df[ranking_columns].to_string(index=False))

        # Drill into the first three columns of the stats frame.
        section("DETAILED VALUE FREQUENCY ANALYSIS")
        for rank, (_, stats_row) in enumerate(stats_df.head(3).iterrows(), 1):
            col_name = stats_row['column_name']
            score = stats_row['repetition_score']

            print(f"\n[{rank}] Column: {col_name} (repetition_score: {score})")
            print("-" * 80)

            freq_df = get_value_frequency_details(con, str(parquet_path), col_name,
                                                  is_parquet=True, limit=15)
            print(freq_df.to_string(index=False))

            if not freq_df.empty:
                print(f"\n✓ Top value appears {freq_df.iloc[0]['percentage']:.2f}% of the time")
                print(f"✓ Top 15 values cover {freq_df['cumulative_pct'].iloc[-1]:.2f}% of all data")

        section("SUMMARY & RECOMMENDATIONS")

        # Bucket the columns by repetition score.
        scores = stats_df['repetition_score']
        buckets = [
            (" Excellent (score > 100)", stats_df[scores > 100]),
            (" Good (score 10-100)", stats_df[(scores >= 10) & (scores <= 100)]),
            (" Poor (score < 10)", stats_df[scores < 10]),
        ]

        print("\n📊 RLE Compression Potential:")
        for label, bucket in buckets:
            print(f"{label}: {len(bucket)} columns")
            if len(bucket) > 0:
                print(f" {', '.join(bucket['column_name'].tolist())}")

        print("\n💡 Sorting Recommendation:")
        top_3 = stats_df.head(3)['column_name'].tolist()
        print(" For optimal RLE compression, consider sorting by:")
        for position, col in enumerate(top_3, 1):
            print(f" {position}. {col}")

        print("\n✅ Test completed successfully!")

        return True

    except Exception as e:
        print(f"\n❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        # Always release the in-memory connection.
        con.close()
133
+
134
+
135
if __name__ == "__main__":
    # Script entry point: run the test and turn its outcome into an exit code.
    divider = "=" * 80

    print("\n" + divider)
    print("STARTING RLE ANALYSIS TEST")
    print(divider)

    passed = test_rle_with_real_parquet()

    print("\n" + divider)
    print("✅ TEST PASSED" if passed else "❌ TEST FAILED")
    print(divider)

    # Exit status 0 on success, 1 on failure.
    sys.exit(int(not passed))