duckrun 0.2.21.dev2__tar.gz → 0.2.22.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/PKG-INFO +1 -1
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/__init__.py +2 -3
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/core.py +126 -1
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/stats.py +192 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/SOURCES.txt +3 -1
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/pyproject.toml +1 -1
- duckrun-0.2.22.dev2/tests/test_consecutive_values.py +115 -0
- duckrun-0.2.22.dev2/tests/test_rle_analysis.py +149 -0
- duckrun-0.2.22.dev2/tests/test_rle_summary.py +22 -0
- duckrun-0.2.21.dev2/duckrun/rle.py +0 -940
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/LICENSE +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/README.md +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/auth.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/ducklake_metadata.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/files.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/notebook.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/runner.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun/writer.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/setup.cfg +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_checkpoint_format.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_deploy_fresh.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_ducklake_export.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_filename.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_register.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_rle.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_writer_dictionary.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev2}/tests/test_writer_integration.py +0 -0
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
from duckrun.core import Duckrun
|
|
4
4
|
from duckrun.notebook import import_notebook_from_web, import_notebook
|
|
5
|
-
from duckrun import rle
|
|
6
5
|
|
|
7
|
-
__version__ = "0.2.21.dev2"
|
|
6
|
+
__version__ = "0.2.22.dev2"
|
|
8
7
|
|
|
9
8
|
# Expose unified connect method at module level
|
|
10
9
|
connect = Duckrun.connect
|
|
11
10
|
|
|
12
|
-
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
|
|
11
|
+
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
|
|
@@ -7,7 +7,7 @@ import time
|
|
|
7
7
|
from typing import List, Tuple, Union, Optional, Callable, Dict, Any
|
|
8
8
|
from string import Template
|
|
9
9
|
from datetime import datetime
|
|
10
|
-
from .stats import get_stats as _get_stats
|
|
10
|
+
from .stats import get_stats as _get_stats, get_rle as _get_rle
|
|
11
11
|
from .runner import run as _run
|
|
12
12
|
from .files import copy as _copy, download as _download
|
|
13
13
|
from .writer import QueryResult
|
|
@@ -1050,6 +1050,98 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1050
1050
|
"""
|
|
1051
1051
|
self.con.register(name, df)
|
|
1052
1052
|
|
|
1053
|
+
def get_rle_stats(self, table_name: str, top_n_values: int = 10):
|
|
1054
|
+
"""
|
|
1055
|
+
Get comprehensive table statistics including NDV and value frequency analysis.
|
|
1056
|
+
|
|
1057
|
+
Analyzes column characteristics for RLE compression optimization.
|
|
1058
|
+
|
|
1059
|
+
Args:
|
|
1060
|
+
table_name: Name of the table to analyze
|
|
1061
|
+
top_n_values: Number of top frequent values to show per column (default: 10)
|
|
1062
|
+
|
|
1063
|
+
Returns:
|
|
1064
|
+
DataFrame with statistics for each column:
|
|
1065
|
+
- column_name: Name of the column
|
|
1066
|
+
- data_type: Data type
|
|
1067
|
+
- total_rows: Total number of rows
|
|
1068
|
+
- null_count, null_pct: NULL statistics
|
|
1069
|
+
- ndv: Number of distinct values (exact)
|
|
1070
|
+
- cardinality_ratio: NDV / total_rows (lower = better for RLE)
|
|
1071
|
+
- top_value, top_value_count, top_value_pct: Most frequent value stats
|
|
1072
|
+
- top_n_coverage: Percentage covered by top N values
|
|
1073
|
+
- repetition_score: RLE potential score (higher = better)
|
|
1074
|
+
|
|
1075
|
+
Examples:
|
|
1076
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
1077
|
+
|
|
1078
|
+
# Analyze a table
|
|
1079
|
+
stats = con.get_rle_stats('sales')
|
|
1080
|
+
print(stats)
|
|
1081
|
+
|
|
1082
|
+
# Show top 20 values per column
|
|
1083
|
+
stats = con.get_rle_stats('sales', top_n_values=20)
|
|
1084
|
+
"""
|
|
1085
|
+
from .rle import get_table_stats as _get_rle_stats
|
|
1086
|
+
return _get_rle_stats(self, table_name, top_n_values)
|
|
1087
|
+
|
|
1088
|
+
def get_value_frequency(self, table_name: str, column_name: str, limit: int = 20):
|
|
1089
|
+
"""
|
|
1090
|
+
Get detailed value frequency distribution for a specific column.
|
|
1091
|
+
|
|
1092
|
+
Args:
|
|
1093
|
+
table_name: Name of the table
|
|
1094
|
+
column_name: Name of the column to analyze
|
|
1095
|
+
limit: Maximum number of values to return (default: 20)
|
|
1096
|
+
|
|
1097
|
+
Returns:
|
|
1098
|
+
DataFrame with value frequencies:
|
|
1099
|
+
- value: The distinct value
|
|
1100
|
+
- count: Number of occurrences
|
|
1101
|
+
- percentage: Percentage of total rows
|
|
1102
|
+
- cumulative_pct: Cumulative percentage
|
|
1103
|
+
|
|
1104
|
+
Examples:
|
|
1105
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
1106
|
+
|
|
1107
|
+
# Get top 20 values for a column
|
|
1108
|
+
freq = con.get_value_frequency('sales', 'status')
|
|
1109
|
+
print(freq)
|
|
1110
|
+
"""
|
|
1111
|
+
from .rle import get_value_frequency_details as _get_value_frequency
|
|
1112
|
+
return _get_value_frequency(self, table_name, column_name, limit)
|
|
1113
|
+
|
|
1114
|
+
def find_optimal_sort_order(self, table_name: str, max_combinations: int = 10):
|
|
1115
|
+
"""
|
|
1116
|
+
Find optimal column sort order for compression using V-Order-like testing.
|
|
1117
|
+
|
|
1118
|
+
Tests different column orderings and measures RLE compression effectiveness.
|
|
1119
|
+
This simulates how V-Order/VertiPaq optimizes data layout.
|
|
1120
|
+
|
|
1121
|
+
Args:
|
|
1122
|
+
table_name: Name of the table to analyze
|
|
1123
|
+
max_combinations: Maximum sort orderings to test (default: 10)
|
|
1124
|
+
|
|
1125
|
+
Returns:
|
|
1126
|
+
DataFrame with tested orderings ranked by compression:
|
|
1127
|
+
- sort_order: Column ordering (e.g., "date → DUID → time")
|
|
1128
|
+
- total_runs: Total RLE runs (fewer = better compression)
|
|
1129
|
+
- compression_score: Compression effectiveness (higher = better)
|
|
1130
|
+
- Individual RLE counts per column
|
|
1131
|
+
|
|
1132
|
+
Examples:
|
|
1133
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
1134
|
+
|
|
1135
|
+
# Find optimal sort order
|
|
1136
|
+
optimal = con.find_optimal_sort_order('energy_data')
|
|
1137
|
+
print(optimal)
|
|
1138
|
+
|
|
1139
|
+
# Test more combinations
|
|
1140
|
+
optimal = con.find_optimal_sort_order('energy_data', max_combinations=20)
|
|
1141
|
+
"""
|
|
1142
|
+
from .rle import find_optimal_sort_order as _find_optimal_sort_order
|
|
1143
|
+
return _find_optimal_sort_order(self, table_name, max_combinations)
|
|
1144
|
+
|
|
1053
1145
|
def get_stats(self, source: str = None, detailed = False):
|
|
1054
1146
|
"""
|
|
1055
1147
|
Get comprehensive statistics for Delta Lake tables.
|
|
@@ -1089,6 +1181,39 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1089
1181
|
"""
|
|
1090
1182
|
return _get_stats(self, source, detailed)
|
|
1091
1183
|
|
|
1184
|
+
def get_rle(self, source: str = None):
|
|
1185
|
+
"""
|
|
1186
|
+
Get RLE (Run-Length Encoding) statistics for Delta Lake tables.
|
|
1187
|
+
Measures compression potential by counting consecutive identical values.
|
|
1188
|
+
|
|
1189
|
+
Args:
|
|
1190
|
+
source: Optional. Can be one of:
|
|
1191
|
+
- None: Use all tables in the connection's schema (default)
|
|
1192
|
+
- Table name: 'table_name' (uses current schema)
|
|
1193
|
+
- Schema.table: 'schema.table_name' (specific table in schema)
|
|
1194
|
+
- Schema only: 'schema' (all tables in schema)
|
|
1195
|
+
- Wildcard patterns: '*.summary' or 'schema.*'
|
|
1196
|
+
|
|
1197
|
+
Returns:
|
|
1198
|
+
DataFrame with columns: schema_name, table_name, total_rle_runs
|
|
1199
|
+
|
|
1200
|
+
Examples:
|
|
1201
|
+
con = duckrun.connect("tmp/data.lakehouse/aemo")
|
|
1202
|
+
|
|
1203
|
+
# All tables in current schema
|
|
1204
|
+
rle = con.get_rle()
|
|
1205
|
+
|
|
1206
|
+
# Single table in current schema
|
|
1207
|
+
rle = con.get_rle('price')
|
|
1208
|
+
|
|
1209
|
+
# Specific table in different schema
|
|
1210
|
+
rle = con.get_rle('deltars.summary')
|
|
1211
|
+
|
|
1212
|
+
# All tables matching pattern
|
|
1213
|
+
rle = con.get_rle('*.summary')
|
|
1214
|
+
"""
|
|
1215
|
+
return _get_rle(self, source)
|
|
1216
|
+
|
|
1092
1217
|
def list_lakehouses(self) -> List[str]:
|
|
1093
1218
|
"""
|
|
1094
1219
|
List all lakehouses in the current workspace.
|
|
@@ -454,3 +454,195 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
|
|
|
454
454
|
return final_result
|
|
455
455
|
|
|
456
456
|
|
|
457
|
+
def get_rle(duckrun_instance, source: str = None) -> 'pd.DataFrame':
|
|
458
|
+
"""
|
|
459
|
+
Get RLE statistics for tables at the column level.
|
|
460
|
+
|
|
461
|
+
Args:
|
|
462
|
+
duckrun_instance: Duckrun connection (from duckrun.connect())
|
|
463
|
+
source: Optional. Can be one of:
|
|
464
|
+
- None: Use all tables in the connection's schema (default)
|
|
465
|
+
- Table name: 'table_name' (uses main schema in DuckDB)
|
|
466
|
+
- Schema.table: 'schema.table_name' (specific table in schema)
|
|
467
|
+
- Schema only: 'schema' (all tables in schema)
|
|
468
|
+
- Wildcard pattern: '*.summary' (matches tables across all schemas)
|
|
469
|
+
|
|
470
|
+
Returns:
|
|
471
|
+
DataFrame with columns:
|
|
472
|
+
- schema_name: Schema name
|
|
473
|
+
- table_name: Table name
|
|
474
|
+
- column_name: Column name
|
|
475
|
+
- total_rows: Total number of rows
|
|
476
|
+
- rle_runs: RLE runs for this column in natural order
|
|
477
|
+
- ndv: Number of distinct values
|
|
478
|
+
- total_rle_runs: Sum of RLE runs across all columns (same for all rows of a table)
|
|
479
|
+
"""
|
|
480
|
+
import fnmatch
|
|
481
|
+
import pandas as pd
|
|
482
|
+
|
|
483
|
+
con = duckrun_instance.con # Get underlying DuckDB connection
|
|
484
|
+
|
|
485
|
+
# Determine which tables to process
|
|
486
|
+
tables_to_process = [] # List of (schema, table) tuples
|
|
487
|
+
|
|
488
|
+
if source is None:
|
|
489
|
+
# Get all tables in the connection's schema
|
|
490
|
+
schema_name = duckrun_instance.schema if hasattr(duckrun_instance, 'schema') else 'main'
|
|
491
|
+
try:
|
|
492
|
+
if schema_name == 'main':
|
|
493
|
+
query = "SHOW TABLES"
|
|
494
|
+
result = con.execute(query).fetchall()
|
|
495
|
+
if result:
|
|
496
|
+
tables = [row[0] for row in result if not row[0].startswith('tbl_')]
|
|
497
|
+
tables_to_process = [(schema_name, tbl) for tbl in tables]
|
|
498
|
+
else:
|
|
499
|
+
query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'"
|
|
500
|
+
result = con.execute(query).fetchall()
|
|
501
|
+
if result:
|
|
502
|
+
tables = [row[0] for row in result if not row[0].startswith('tbl_')]
|
|
503
|
+
tables_to_process = [(schema_name, tbl) for tbl in tables]
|
|
504
|
+
except:
|
|
505
|
+
pass
|
|
506
|
+
|
|
507
|
+
elif '.' in source:
|
|
508
|
+
parts = source.split('.', 1)
|
|
509
|
+
schema_pattern, table_pattern = parts[0], parts[1]
|
|
510
|
+
|
|
511
|
+
# Check if patterns contain wildcards
|
|
512
|
+
if '*' in schema_pattern or '*' in table_pattern:
|
|
513
|
+
# Wildcard matching
|
|
514
|
+
query = """
|
|
515
|
+
SELECT table_schema, table_name
|
|
516
|
+
FROM information_schema.tables
|
|
517
|
+
WHERE table_schema NOT LIKE 'pg_%'
|
|
518
|
+
AND table_schema != 'information_schema'
|
|
519
|
+
AND table_name NOT LIKE 'tbl_%'
|
|
520
|
+
"""
|
|
521
|
+
result = con.execute(query).fetchall()
|
|
522
|
+
for schema, table in result:
|
|
523
|
+
if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
|
|
524
|
+
tables_to_process.append((schema, table))
|
|
525
|
+
else:
|
|
526
|
+
# Exact schema.table
|
|
527
|
+
tables_to_process = [(schema_pattern, table_pattern)]
|
|
528
|
+
|
|
529
|
+
elif '*' in source:
|
|
530
|
+
# Wildcard pattern for table names across all schemas
|
|
531
|
+
query = """
|
|
532
|
+
SELECT table_schema, table_name
|
|
533
|
+
FROM information_schema.tables
|
|
534
|
+
WHERE table_schema NOT LIKE 'pg_%'
|
|
535
|
+
AND table_schema != 'information_schema'
|
|
536
|
+
AND table_name NOT LIKE 'tbl_%'
|
|
537
|
+
"""
|
|
538
|
+
result = con.execute(query).fetchall()
|
|
539
|
+
for schema, table in result:
|
|
540
|
+
if fnmatch.fnmatch(table, source):
|
|
541
|
+
tables_to_process.append((schema, table))
|
|
542
|
+
|
|
543
|
+
else:
|
|
544
|
+
# Check if it's a schema name or table name
|
|
545
|
+
try:
|
|
546
|
+
# Try as schema first
|
|
547
|
+
schema_query = f"SELECT 1 FROM information_schema.schemata WHERE schema_name = '{source}' LIMIT 1"
|
|
548
|
+
schema_exists = con.execute(schema_query).fetchone()
|
|
549
|
+
|
|
550
|
+
if schema_exists:
|
|
551
|
+
# It's a schema - get all tables
|
|
552
|
+
tables_query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{source}'"
|
|
553
|
+
result = con.execute(tables_query).fetchall()
|
|
554
|
+
if result:
|
|
555
|
+
tables = [row[0] for row in result if not row[0].startswith('tbl_')]
|
|
556
|
+
tables_to_process = [(source, tbl) for tbl in tables]
|
|
557
|
+
else:
|
|
558
|
+
# It's a table name in default schema
|
|
559
|
+
schema_name = duckrun_instance.schema if hasattr(duckrun_instance, 'schema') else 'main'
|
|
560
|
+
tables_to_process = [(schema_name, source)]
|
|
561
|
+
except:
|
|
562
|
+
# Assume it's a table name
|
|
563
|
+
schema_name = duckrun_instance.schema if hasattr(duckrun_instance, 'schema') else 'main'
|
|
564
|
+
tables_to_process = [(schema_name, source)]
|
|
565
|
+
|
|
566
|
+
if not tables_to_process:
|
|
567
|
+
print("No tables found matching the criteria")
|
|
568
|
+
return pd.DataFrame(columns=['schema_name', 'table_name', 'column_name', 'total_rows',
|
|
569
|
+
'rle_runs', 'ndv', 'total_rle_runs'])
|
|
570
|
+
|
|
571
|
+
print(f"Processing {len(tables_to_process)} table(s)...")
|
|
572
|
+
|
|
573
|
+
# Process each table
|
|
574
|
+
results = []
|
|
575
|
+
for schema, table in tables_to_process:
|
|
576
|
+
table_path = f"{duckrun_instance.table_base_url}{schema}/{table}"
|
|
577
|
+
|
|
578
|
+
print(f"\nCalculating RLE runs for {schema}.{table}...")
|
|
579
|
+
|
|
580
|
+
# Get column names and row count
|
|
581
|
+
try:
|
|
582
|
+
schema_info = con.sql(f"""
|
|
583
|
+
SELECT column_name
|
|
584
|
+
FROM (DESCRIBE SELECT * FROM delta_scan('{table_path}'))
|
|
585
|
+
""").df()
|
|
586
|
+
|
|
587
|
+
# Get total row count
|
|
588
|
+
total_rows = con.sql(f"SELECT COUNT(*) FROM delta_scan('{table_path}')").fetchone()[0]
|
|
589
|
+
|
|
590
|
+
if schema_info.empty:
|
|
591
|
+
continue
|
|
592
|
+
|
|
593
|
+
# Track total RLE runs for this table
|
|
594
|
+
table_total_rle = 0
|
|
595
|
+
table_results = []
|
|
596
|
+
|
|
597
|
+
for _, row in schema_info.iterrows():
|
|
598
|
+
col_name = row['column_name']
|
|
599
|
+
|
|
600
|
+
# Calculate RLE runs in natural (physical) order using delta_scan
|
|
601
|
+
rle_query = f"""
|
|
602
|
+
WITH numbered AS (
|
|
603
|
+
SELECT
|
|
604
|
+
filename,
|
|
605
|
+
file_row_number,
|
|
606
|
+
{col_name},
|
|
607
|
+
LAG({col_name}) OVER (ORDER BY filename, file_row_number) as prev_value
|
|
608
|
+
FROM delta_scan('{table_path}', file_row_number=1, filename=1)
|
|
609
|
+
)
|
|
610
|
+
SELECT COUNT(*) as runs
|
|
611
|
+
FROM numbered
|
|
612
|
+
WHERE prev_value IS NULL OR {col_name} != prev_value OR {col_name} IS NULL OR prev_value IS NULL
|
|
613
|
+
"""
|
|
614
|
+
|
|
615
|
+
try:
|
|
616
|
+
runs = con.sql(rle_query).fetchone()[0]
|
|
617
|
+
|
|
618
|
+
# Also calculate NDV for this column
|
|
619
|
+
ndv_query = f"SELECT COUNT(DISTINCT {col_name}) FROM delta_scan('{table_path}')"
|
|
620
|
+
ndv = con.sql(ndv_query).fetchone()[0]
|
|
621
|
+
|
|
622
|
+
table_total_rle += runs
|
|
623
|
+
|
|
624
|
+
print(f" {col_name}: {runs:,} runs, ndv={ndv:,}")
|
|
625
|
+
|
|
626
|
+
table_results.append({
|
|
627
|
+
'schema_name': schema,
|
|
628
|
+
'table_name': table,
|
|
629
|
+
'column_name': col_name,
|
|
630
|
+
'total_rows': total_rows,
|
|
631
|
+
'rle_runs': runs,
|
|
632
|
+
'ndv': ndv
|
|
633
|
+
})
|
|
634
|
+
except Exception as e:
|
|
635
|
+
print(f" Warning: Could not calculate RLE runs for {col_name}: {e}")
|
|
636
|
+
|
|
637
|
+
# Add total_rle_runs to all rows for this table
|
|
638
|
+
for result in table_results:
|
|
639
|
+
result['total_rle_runs'] = table_total_rle
|
|
640
|
+
results.append(result)
|
|
641
|
+
|
|
642
|
+
print(f" Total RLE runs for table: {table_total_rle:,}")
|
|
643
|
+
|
|
644
|
+
except Exception as e:
|
|
645
|
+
print(f" Error processing table: {e}")
|
|
646
|
+
|
|
647
|
+
return pd.DataFrame(results)
|
|
648
|
+
|
|
@@ -8,7 +8,6 @@ duckrun/ducklake_metadata.py
|
|
|
8
8
|
duckrun/files.py
|
|
9
9
|
duckrun/lakehouse.py
|
|
10
10
|
duckrun/notebook.py
|
|
11
|
-
duckrun/rle.py
|
|
12
11
|
duckrun/runner.py
|
|
13
12
|
duckrun/semantic_model.py
|
|
14
13
|
duckrun/stats.py
|
|
@@ -19,10 +18,13 @@ duckrun.egg-info/dependency_links.txt
|
|
|
19
18
|
duckrun.egg-info/requires.txt
|
|
20
19
|
duckrun.egg-info/top_level.txt
|
|
21
20
|
tests/test_checkpoint_format.py
|
|
21
|
+
tests/test_consecutive_values.py
|
|
22
22
|
tests/test_deploy_fresh.py
|
|
23
23
|
tests/test_ducklake_export.py
|
|
24
24
|
tests/test_filename.py
|
|
25
25
|
tests/test_register.py
|
|
26
26
|
tests/test_rle.py
|
|
27
|
+
tests/test_rle_analysis.py
|
|
28
|
+
tests/test_rle_summary.py
|
|
27
29
|
tests/test_writer_dictionary.py
|
|
28
30
|
tests/test_writer_integration.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.21.dev2"
|
|
7
|
+
version = "0.2.22.dev2"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test: Analyze Consecutive Values in Delta Table
|
|
3
|
+
|
|
4
|
+
This test connects to tmp/data.lakehouse and analyzes the deltars.summary table
|
|
5
|
+
for consecutive runs in the 'time' column.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
# Add parent directory to path to import duckrun
|
|
12
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
13
|
+
|
|
14
|
+
import duckrun
|
|
15
|
+
from duckrun.rle import analyze_consecutive_values
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_consecutive_values():
|
|
19
|
+
"""Test consecutive value analysis on deltars.summary table"""
|
|
20
|
+
|
|
21
|
+
lakehouse_path = "tmp/data.lakehouse"
|
|
22
|
+
|
|
23
|
+
print("=" * 80)
|
|
24
|
+
print("CONSECUTIVE VALUES ANALYSIS TEST")
|
|
25
|
+
print("=" * 80)
|
|
26
|
+
print(f"Lakehouse: {lakehouse_path}")
|
|
27
|
+
print(f"Table: deltars.summary")
|
|
28
|
+
print(f"Column: time")
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
# Connect to lakehouse
|
|
32
|
+
print("\nConnecting to lakehouse...")
|
|
33
|
+
con = duckrun.connect(lakehouse_path)
|
|
34
|
+
|
|
35
|
+
# Analyze consecutive values in the 'time' column
|
|
36
|
+
print("\n" + "=" * 80)
|
|
37
|
+
print("ANALYZING CONSECUTIVE VALUES")
|
|
38
|
+
print("=" * 80)
|
|
39
|
+
|
|
40
|
+
df = analyze_consecutive_values(
|
|
41
|
+
duckrun_con=con,
|
|
42
|
+
table_name='summary',
|
|
43
|
+
column_name='time',
|
|
44
|
+
min_consecutive=3,
|
|
45
|
+
schema_name='deltars'
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Display results
|
|
49
|
+
if not df.empty:
|
|
50
|
+
print("\n" + "=" * 80)
|
|
51
|
+
print("RESULTS")
|
|
52
|
+
print("=" * 80)
|
|
53
|
+
print(f"\nFound {len(df)} consecutive sequences")
|
|
54
|
+
print("\nTop 20 longest sequences:")
|
|
55
|
+
print(df.head(20).to_string(index=False))
|
|
56
|
+
|
|
57
|
+
# Statistics
|
|
58
|
+
print("\n" + "=" * 80)
|
|
59
|
+
print("STATISTICS")
|
|
60
|
+
print("=" * 80)
|
|
61
|
+
total_in_sequences = df['consecutive_count'].sum()
|
|
62
|
+
longest = df['consecutive_count'].max()
|
|
63
|
+
shortest = df['consecutive_count'].min()
|
|
64
|
+
avg = df['consecutive_count'].mean()
|
|
65
|
+
|
|
66
|
+
print(f"Total values in sequences: {total_in_sequences:,}")
|
|
67
|
+
print(f"Longest sequence: {longest:,}")
|
|
68
|
+
print(f"Shortest sequence: {shortest:,}")
|
|
69
|
+
print(f"Average sequence length: {avg:.2f}")
|
|
70
|
+
|
|
71
|
+
# File distribution
|
|
72
|
+
print("\n" + "=" * 80)
|
|
73
|
+
print("FILE DISTRIBUTION")
|
|
74
|
+
print("=" * 80)
|
|
75
|
+
file_counts = df.groupby('filename').agg({
|
|
76
|
+
'consecutive_count': ['count', 'sum', 'max']
|
|
77
|
+
}).reset_index()
|
|
78
|
+
file_counts.columns = ['filename', 'num_sequences', 'total_values', 'max_sequence']
|
|
79
|
+
|
|
80
|
+
# Extract just filename from path
|
|
81
|
+
file_counts['filename'] = file_counts['filename'].apply(
|
|
82
|
+
lambda x: x.split('/')[-1] if '/' in str(x) else x
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
print(f"\nSequences across {len(file_counts)} files:")
|
|
86
|
+
print(file_counts.to_string(index=False))
|
|
87
|
+
|
|
88
|
+
print("\n✅ Test completed successfully!")
|
|
89
|
+
return True
|
|
90
|
+
else:
|
|
91
|
+
print("\n⚠ No consecutive sequences found")
|
|
92
|
+
return True
|
|
93
|
+
|
|
94
|
+
except Exception as e:
|
|
95
|
+
print(f"\n❌ Error during analysis: {e}")
|
|
96
|
+
import traceback
|
|
97
|
+
traceback.print_exc()
|
|
98
|
+
return False
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
|
|
102
|
+
print("\n" + "=" * 80)
|
|
103
|
+
print("STARTING CONSECUTIVE VALUES TEST")
|
|
104
|
+
print("=" * 80)
|
|
105
|
+
|
|
106
|
+
success = test_consecutive_values()
|
|
107
|
+
|
|
108
|
+
print("\n" + "=" * 80)
|
|
109
|
+
if success:
|
|
110
|
+
print("✅ TEST PASSED")
|
|
111
|
+
else:
|
|
112
|
+
print("❌ TEST FAILED")
|
|
113
|
+
print("=" * 80)
|
|
114
|
+
|
|
115
|
+
sys.exit(0 if success else 1)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test: RLE Analysis with Real Parquet Data
|
|
3
|
+
|
|
4
|
+
This test demonstrates the refactored RLE module using real parquet data.
|
|
5
|
+
It analyzes table statistics, NDV, and value frequency patterns.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import duckdb
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# Add parent directory to path to import duckrun
|
|
13
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
14
|
+
|
|
15
|
+
from duckrun.rle import get_table_stats, get_value_frequency_details
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_rle_with_real_parquet():
|
|
19
|
+
"""Test RLE analysis with real parquet file"""
|
|
20
|
+
|
|
21
|
+
# Path to the test parquet file
|
|
22
|
+
parquet_path = Path(__file__).parent / "part-00000-19052469-6a9d-4faa-86ac-60efce3e4443-c000.snappy.parquet"
|
|
23
|
+
|
|
24
|
+
if not parquet_path.exists():
|
|
25
|
+
print(f"❌ Error: Parquet file not found at {parquet_path}")
|
|
26
|
+
return False
|
|
27
|
+
|
|
28
|
+
print("=" * 80)
|
|
29
|
+
print("RLE ANALYSIS TEST: Real Parquet Data")
|
|
30
|
+
print("=" * 80)
|
|
31
|
+
print(f"File: {parquet_path.name}")
|
|
32
|
+
print(f"Size: {parquet_path.stat().st_size:,} bytes")
|
|
33
|
+
|
|
34
|
+
# Connect to DuckDB
|
|
35
|
+
con = duckdb.connect(':memory:')
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
# First, let's see the schema
|
|
39
|
+
print("\n" + "=" * 80)
|
|
40
|
+
print("SCHEMA INSPECTION")
|
|
41
|
+
print("=" * 80)
|
|
42
|
+
|
|
43
|
+
schema_df = con.sql(f"""
|
|
44
|
+
SELECT * FROM parquet_schema('{parquet_path}')
|
|
45
|
+
""").df()
|
|
46
|
+
|
|
47
|
+
print(f"\nColumns found: {len(schema_df)}")
|
|
48
|
+
print(schema_df.to_string(index=False))
|
|
49
|
+
|
|
50
|
+
# Get row count
|
|
51
|
+
row_count = con.sql(f"""
|
|
52
|
+
SELECT COUNT(*) FROM read_parquet('{parquet_path}')
|
|
53
|
+
""").fetchone()[0]
|
|
54
|
+
|
|
55
|
+
print(f"\nTotal rows: {row_count:,}")
|
|
56
|
+
|
|
57
|
+
# Run comprehensive RLE analysis
|
|
58
|
+
print("\n" + "=" * 80)
|
|
59
|
+
print("COMPREHENSIVE RLE ANALYSIS")
|
|
60
|
+
print("=" * 80)
|
|
61
|
+
|
|
62
|
+
stats_df = get_table_stats(con, str(parquet_path), is_parquet=True, top_n_values=10)
|
|
63
|
+
|
|
64
|
+
# Display results
|
|
65
|
+
print("\n" + "=" * 80)
|
|
66
|
+
print("RESULTS: Columns Ranked by RLE Potential")
|
|
67
|
+
print("=" * 80)
|
|
68
|
+
|
|
69
|
+
print("\n" + stats_df[['column_name', 'data_type', 'ndv', 'cardinality_ratio',
|
|
70
|
+
'top_value_pct', 'top_n_coverage', 'repetition_score']].to_string(index=False))
|
|
71
|
+
|
|
72
|
+
# Detailed analysis of top 3 columns
|
|
73
|
+
print("\n" + "=" * 80)
|
|
74
|
+
print("DETAILED VALUE FREQUENCY ANALYSIS")
|
|
75
|
+
print("=" * 80)
|
|
76
|
+
|
|
77
|
+
for idx in range(min(3, len(stats_df))):
|
|
78
|
+
col_name = stats_df.iloc[idx]['column_name']
|
|
79
|
+
score = stats_df.iloc[idx]['repetition_score']
|
|
80
|
+
|
|
81
|
+
print(f"\n[{idx+1}] Column: {col_name} (repetition_score: {score})")
|
|
82
|
+
print("-" * 80)
|
|
83
|
+
|
|
84
|
+
freq_df = get_value_frequency_details(con, str(parquet_path), col_name,
|
|
85
|
+
is_parquet=True, limit=15)
|
|
86
|
+
print(freq_df.to_string(index=False))
|
|
87
|
+
|
|
88
|
+
if not freq_df.empty:
|
|
89
|
+
print(f"\n✓ Top value appears {freq_df.iloc[0]['percentage']:.2f}% of the time")
|
|
90
|
+
print(f"✓ Top 15 values cover {freq_df['cumulative_pct'].iloc[-1]:.2f}% of all data")
|
|
91
|
+
|
|
92
|
+
# Summary and recommendations
|
|
93
|
+
print("\n" + "=" * 80)
|
|
94
|
+
print("SUMMARY & RECOMMENDATIONS")
|
|
95
|
+
print("=" * 80)
|
|
96
|
+
|
|
97
|
+
# Categorize columns
|
|
98
|
+
excellent = stats_df[stats_df['repetition_score'] > 100]
|
|
99
|
+
good = stats_df[(stats_df['repetition_score'] >= 10) & (stats_df['repetition_score'] <= 100)]
|
|
100
|
+
poor = stats_df[stats_df['repetition_score'] < 10]
|
|
101
|
+
|
|
102
|
+
print(f"\n📊 RLE Compression Potential:")
|
|
103
|
+
print(f" Excellent (score > 100): {len(excellent)} columns")
|
|
104
|
+
if len(excellent) > 0:
|
|
105
|
+
print(f" {', '.join(excellent['column_name'].tolist())}")
|
|
106
|
+
|
|
107
|
+
print(f" Good (score 10-100): {len(good)} columns")
|
|
108
|
+
if len(good) > 0:
|
|
109
|
+
print(f" {', '.join(good['column_name'].tolist())}")
|
|
110
|
+
|
|
111
|
+
print(f" Poor (score < 10): {len(poor)} columns")
|
|
112
|
+
if len(poor) > 0:
|
|
113
|
+
print(f" {', '.join(poor['column_name'].tolist())}")
|
|
114
|
+
|
|
115
|
+
print(f"\n💡 Sorting Recommendation:")
|
|
116
|
+
top_3 = stats_df.head(3)['column_name'].tolist()
|
|
117
|
+
print(f" For optimal RLE compression, consider sorting by:")
|
|
118
|
+
for i, col in enumerate(top_3, 1):
|
|
119
|
+
print(f" {i}. {col}")
|
|
120
|
+
|
|
121
|
+
print(f"\n✅ Test completed successfully!")
|
|
122
|
+
|
|
123
|
+
return True
|
|
124
|
+
|
|
125
|
+
except Exception as e:
|
|
126
|
+
print(f"\n❌ Error during analysis: {e}")
|
|
127
|
+
import traceback
|
|
128
|
+
traceback.print_exc()
|
|
129
|
+
return False
|
|
130
|
+
|
|
131
|
+
finally:
|
|
132
|
+
con.close()
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
|
|
136
|
+
print("\n" + "=" * 80)
|
|
137
|
+
print("STARTING RLE ANALYSIS TEST")
|
|
138
|
+
print("=" * 80)
|
|
139
|
+
|
|
140
|
+
success = test_rle_with_real_parquet()
|
|
141
|
+
|
|
142
|
+
print("\n" + "=" * 80)
|
|
143
|
+
if success:
|
|
144
|
+
print("✅ TEST PASSED")
|
|
145
|
+
else:
|
|
146
|
+
print("❌ TEST FAILED")
|
|
147
|
+
print("=" * 80)
|
|
148
|
+
|
|
149
|
+
sys.exit(0 if success else 1)
|