duckrun 0.2.22.dev0__tar.gz → 0.2.22.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/PKG-INFO +1 -1
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/__init__.py +2 -3
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/core.py +34 -1
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/stats.py +192 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/SOURCES.txt +2 -1
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/pyproject.toml +1 -1
- duckrun-0.2.22.dev2/tests/test_consecutive_values.py +115 -0
- duckrun-0.2.22.dev2/tests/test_rle_summary.py +22 -0
- duckrun-0.2.22.dev0/duckrun/rle.py +0 -362
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/LICENSE +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/README.md +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/auth.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/ducklake_metadata.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/files.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/notebook.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/runner.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun/writer.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/setup.cfg +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_checkpoint_format.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_deploy_fresh.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_ducklake_export.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_filename.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_register.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_rle.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_rle_analysis.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_writer_dictionary.py +0 -0
- {duckrun-0.2.22.dev0 → duckrun-0.2.22.dev2}/tests/test_writer_integration.py +0 -0
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
from duckrun.core import Duckrun
|
|
4
4
|
from duckrun.notebook import import_notebook_from_web, import_notebook
|
|
5
|
-
from duckrun import rle
|
|
6
5
|
|
|
7
|
-
__version__ = "0.2.22.dev0"
|
|
6
|
+
__version__ = "0.2.22.dev2"
|
|
8
7
|
|
|
9
8
|
# Expose unified connect method at module level
|
|
10
9
|
connect = Duckrun.connect
|
|
11
10
|
|
|
12
|
-
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
|
|
11
|
+
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
|
|
@@ -7,7 +7,7 @@ import time
|
|
|
7
7
|
from typing import List, Tuple, Union, Optional, Callable, Dict, Any
|
|
8
8
|
from string import Template
|
|
9
9
|
from datetime import datetime
|
|
10
|
-
from .stats import get_stats as _get_stats
|
|
10
|
+
from .stats import get_stats as _get_stats, get_rle as _get_rle
|
|
11
11
|
from .runner import run as _run
|
|
12
12
|
from .files import copy as _copy, download as _download
|
|
13
13
|
from .writer import QueryResult
|
|
@@ -1181,6 +1181,39 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1181
1181
|
"""
|
|
1182
1182
|
return _get_stats(self, source, detailed)
|
|
1183
1183
|
|
|
1184
|
+
def get_rle(self, source: Optional[str] = None):
    """
    Get RLE (Run-Length Encoding) statistics for Delta Lake tables.

    Measures compression potential by counting consecutive identical values.
    This method only delegates to the ``get_rle`` function imported from
    ``.stats`` (bound in this module as ``_get_rle``), passing this
    connection and ``source`` through unchanged.

    Args:
        source: Optional. Can be one of:
            - None: Use all tables in the connection's schema (default)
            - Table name: 'table_name' (uses current schema)
            - Schema.table: 'schema.table_name' (specific table in schema)
            - Schema only: 'schema' (all tables in schema)
            - Wildcard patterns: '*.summary' or 'schema.*'

    Returns:
        DataFrame with one row per table column and the columns:
        schema_name, table_name, column_name, total_rows, rle_runs, ndv,
        total_rle_runs (the per-table sum, repeated on each of its rows).

    Examples:
        con = duckrun.connect("tmp/data.lakehouse/aemo")

        # All tables in current schema
        rle = con.get_rle()

        # Single table in current schema
        rle = con.get_rle('price')

        # Specific table in different schema
        rle = con.get_rle('deltars.summary')

        # All tables matching pattern
        rle = con.get_rle('*.summary')
    """
    return _get_rle(self, source)
|
|
1216
|
+
|
|
1184
1217
|
def list_lakehouses(self) -> List[str]:
|
|
1185
1218
|
"""
|
|
1186
1219
|
List all lakehouses in the current workspace.
|
|
@@ -454,3 +454,195 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
|
|
|
454
454
|
return final_result
|
|
455
455
|
|
|
456
456
|
|
|
457
|
+
def get_rle(duckrun_instance, source: str = None) -> 'pd.DataFrame':
    """
    Get RLE (run-length encoding) statistics for tables at the column level.

    For every selected table the Delta files are read with ``delta_scan`` in
    physical order (``filename``, ``file_row_number``) and, per column, the
    number of runs of consecutive identical values is counted. Consecutive
    NULLs form a single run.

    Args:
        duckrun_instance: Duckrun connection (from duckrun.connect()). Its
            ``con`` and ``table_base_url`` attributes are read; ``schema`` is
            used when present, otherwise 'main'.
        source: Optional. Can be one of:
            - None: Use all tables in the connection's schema (default)
            - Bare name: treated as a schema if one with that name exists
              (all its tables), otherwise as a table in the connection's schema
            - Schema.table: 'schema.table_name' (specific table in schema)
            - Wildcard pattern: '*.summary', 'schema.*' or 'sum*'
              (matched with fnmatch against schema and/or table names)

    Returns:
        DataFrame with columns:
            - schema_name: Schema name
            - table_name: Table name
            - column_name: Column name
            - total_rows: Total number of rows
            - rle_runs: RLE runs for this column in natural order
            - ndv: Number of distinct values
            - total_rle_runs: Sum of RLE runs across all columns (same for all rows of a table)
        The frame always carries these columns, even when no table matched
        or every table failed to process.
    """
    import pandas as pd

    con = duckrun_instance.con  # Get underlying DuckDB connection
    default_schema = getattr(duckrun_instance, 'schema', 'main')

    tables_to_process = _rle_resolve_tables(con, source, default_schema)

    if not tables_to_process:
        print("No tables found matching the criteria")
        return pd.DataFrame(columns=_RLE_RESULT_COLUMNS)

    print(f"Processing {len(tables_to_process)} table(s)...")

    results = []
    for schema, table in tables_to_process:
        table_path = f"{duckrun_instance.table_base_url}{schema}/{table}"

        print(f"\nCalculating RLE runs for {schema}.{table}...")

        # A broken table must not abort the remaining ones: report and move on.
        try:
            results.extend(_rle_scan_table(con, schema, table, table_path))
        except Exception as e:
            print(f" Error processing table: {e}")

    # Passing the column list keeps the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=_RLE_RESULT_COLUMNS)


# Column order of the frame returned by get_rle().
_RLE_RESULT_COLUMNS = ['schema_name', 'table_name', 'column_name', 'total_rows',
                       'rle_runs', 'ndv', 'total_rle_runs']


def _rle_quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _rle_list_user_tables(con):
    """List (schema, table) pairs, skipping system schemas and 'tbl_'-prefixed tables."""
    rows = con.execute(
        "SELECT table_schema, table_name FROM information_schema.tables"
    ).fetchall()
    # Filter in Python: in SQL LIKE the '_' of 'tbl_%' / 'pg_%' is itself a
    # wildcard, which would also drop names such as 'tblx...'.
    return [
        (schema, table)
        for schema, table in rows
        if not schema.startswith('pg_')
        and schema != 'information_schema'
        and not table.startswith('tbl_')
    ]


def _rle_tables_in_schema(con, schema_name):
    """List (schema, table) pairs of one schema, skipping 'tbl_'-prefixed tables."""
    rows = con.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = ?",
        [schema_name],
    ).fetchall()
    return [(schema_name, row[0]) for row in rows if not row[0].startswith('tbl_')]


def _rle_resolve_tables(con, source, default_schema):
    """Translate the `source` argument of get_rle() into (schema, table) pairs."""
    import fnmatch

    if source is None:
        try:
            if default_schema == 'main':
                rows = con.execute("SHOW TABLES").fetchall()
                return [(default_schema, row[0]) for row in rows
                        if not row[0].startswith('tbl_')]
            return _rle_tables_in_schema(con, default_schema)
        except Exception as e:
            # Best effort: an unreadable catalog yields "no tables", but say why.
            print(f"Warning: could not list tables in schema '{default_schema}': {e}")
            return []

    if '.' in source:
        schema_pattern, table_pattern = source.split('.', 1)
        if '*' in schema_pattern or '*' in table_pattern:
            return [
                (schema, table)
                for schema, table in _rle_list_user_tables(con)
                if fnmatch.fnmatch(schema, schema_pattern)
                and fnmatch.fnmatch(table, table_pattern)
            ]
        # Exact schema.table
        return [(schema_pattern, table_pattern)]

    if '*' in source:
        # Wildcard pattern for table names across all schemas
        return [
            (schema, table)
            for schema, table in _rle_list_user_tables(con)
            if fnmatch.fnmatch(table, source)
        ]

    # Bare name: prefer interpreting it as a schema, else as a table name.
    try:
        schema_exists = con.execute(
            "SELECT 1 FROM information_schema.schemata WHERE schema_name = ? LIMIT 1",
            [source],
        ).fetchone()
        if schema_exists:
            return _rle_tables_in_schema(con, source)
    except Exception:
        # Catalog lookup failed; fall through and assume it is a table name.
        pass
    return [(default_schema, source)]


def _rle_scan_table(con, schema, table, table_path):
    """Compute the per-column result rows of get_rle() for a single Delta table."""
    # The path is embedded in a SQL string literal; double any single quote.
    scan_path = table_path.replace("'", "''")

    column_names = [
        row[0]
        for row in con.sql(f"""
            SELECT column_name
            FROM (DESCRIBE SELECT * FROM delta_scan('{scan_path}'))
        """).fetchall()
    ]
    if not column_names:
        return []

    total_rows = con.sql(
        f"SELECT COUNT(*) FROM delta_scan('{scan_path}')"
    ).fetchone()[0]

    table_total_rle = 0
    table_results = []

    for col_name in column_names:
        quoted = _rle_quote_identifier(col_name)

        # One pass yields both figures. A row opens a new run when it is the
        # first row or its value differs from the previous one; IS DISTINCT
        # FROM treats NULL as a comparable value, so a stretch of NULLs is
        # one run instead of one run per row.
        rle_query = f"""
            WITH ordered AS (
                SELECT
                    {quoted} AS cur_value,
                    LAG({quoted}) OVER (ORDER BY filename, file_row_number) AS prev_value,
                    ROW_NUMBER() OVER (ORDER BY filename, file_row_number) AS rn
                FROM delta_scan('{scan_path}', file_row_number=1, filename=1)
            )
            SELECT
                COUNT(*) FILTER (WHERE rn = 1 OR cur_value IS DISTINCT FROM prev_value) AS runs,
                COUNT(DISTINCT cur_value) AS ndv
            FROM ordered
        """

        try:
            runs, ndv = con.sql(rle_query).fetchone()
        except Exception as e:
            print(f" Warning: Could not calculate RLE runs for {col_name}: {e}")
            continue

        table_total_rle += runs

        print(f" {col_name}: {runs:,} runs, ndv={ndv:,}")

        table_results.append({
            'schema_name': schema,
            'table_name': table,
            'column_name': col_name,
            'total_rows': total_rows,
            'rle_runs': runs,
            'ndv': ndv
        })

    # The table-wide total is only known after all columns were scanned.
    for entry in table_results:
        entry['total_rle_runs'] = table_total_rle

    print(f" Total RLE runs for table: {table_total_rle:,}")

    return table_results
|
|
648
|
+
|
|
@@ -8,7 +8,6 @@ duckrun/ducklake_metadata.py
|
|
|
8
8
|
duckrun/files.py
|
|
9
9
|
duckrun/lakehouse.py
|
|
10
10
|
duckrun/notebook.py
|
|
11
|
-
duckrun/rle.py
|
|
12
11
|
duckrun/runner.py
|
|
13
12
|
duckrun/semantic_model.py
|
|
14
13
|
duckrun/stats.py
|
|
@@ -19,11 +18,13 @@ duckrun.egg-info/dependency_links.txt
|
|
|
19
18
|
duckrun.egg-info/requires.txt
|
|
20
19
|
duckrun.egg-info/top_level.txt
|
|
21
20
|
tests/test_checkpoint_format.py
|
|
21
|
+
tests/test_consecutive_values.py
|
|
22
22
|
tests/test_deploy_fresh.py
|
|
23
23
|
tests/test_ducklake_export.py
|
|
24
24
|
tests/test_filename.py
|
|
25
25
|
tests/test_register.py
|
|
26
26
|
tests/test_rle.py
|
|
27
27
|
tests/test_rle_analysis.py
|
|
28
|
+
tests/test_rle_summary.py
|
|
28
29
|
tests/test_writer_dictionary.py
|
|
29
30
|
tests/test_writer_integration.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.22.dev0"
|
|
7
|
+
version = "0.2.22.dev2"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test: Analyze Consecutive Values in Delta Table
|
|
3
|
+
|
|
4
|
+
This test connects to tmp/data.lakehouse and analyzes the deltars.summary table
|
|
5
|
+
for consecutive runs in the 'time' column.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
import traceback
from pathlib import Path

# Add parent directory to path to import duckrun
sys.path.insert(0, str(Path(__file__).parent.parent))

import duckrun
# NOTE(review): duckrun/rle.py is deleted in 0.2.22.dev2, so this import
# presumably fails at collection time - confirm where
# analyze_consecutive_values is supposed to live now.
from duckrun.rle import analyze_consecutive_values


def test_consecutive_values():
    """Test consecutive value analysis on deltars.summary table.

    Connects to ``tmp/data.lakehouse``, runs ``analyze_consecutive_values``
    on the 'time' column of ``deltars.summary`` (sequences of at least 3) and
    prints the sequences, summary statistics and a per-file breakdown.

    Any error propagates to the caller, so a failing analysis is reported as
    a failed test by pytest instead of being swallowed. Nothing is returned.
    """
    lakehouse_path = "tmp/data.lakehouse"

    print("=" * 80)
    print("CONSECUTIVE VALUES ANALYSIS TEST")
    print("=" * 80)
    print(f"Lakehouse: {lakehouse_path}")
    print("Table: deltars.summary")
    print("Column: time")

    # Connect to lakehouse
    print("\nConnecting to lakehouse...")
    con = duckrun.connect(lakehouse_path)

    # Analyze consecutive values in the 'time' column
    print("\n" + "=" * 80)
    print("ANALYZING CONSECUTIVE VALUES")
    print("=" * 80)

    df = analyze_consecutive_values(
        duckrun_con=con,
        table_name='summary',
        column_name='time',
        min_consecutive=3,
        schema_name='deltars'
    )

    if df.empty:
        # An empty result is a valid outcome, not a failure.
        print("\n⚠ No consecutive sequences found")
        return

    # Display results
    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)
    print(f"\nFound {len(df)} consecutive sequences")
    print("\nTop 20 longest sequences:")
    print(df.head(20).to_string(index=False))

    # Statistics
    print("\n" + "=" * 80)
    print("STATISTICS")
    print("=" * 80)
    counts = df['consecutive_count']
    total_in_sequences = counts.sum()
    longest = counts.max()
    shortest = counts.min()
    avg = counts.mean()

    print(f"Total values in sequences: {total_in_sequences:,}")
    print(f"Longest sequence: {longest:,}")
    print(f"Shortest sequence: {shortest:,}")
    print(f"Average sequence length: {avg:.2f}")

    # File distribution
    print("\n" + "=" * 80)
    print("FILE DISTRIBUTION")
    print("=" * 80)
    file_counts = df.groupby('filename').agg({
        'consecutive_count': ['count', 'sum', 'max']
    }).reset_index()
    # Flatten the two-level column index produced by the multi-function agg.
    file_counts.columns = ['filename', 'num_sequences', 'total_values', 'max_sequence']

    # Extract just filename from path
    file_counts['filename'] = file_counts['filename'].apply(
        lambda x: x.split('/')[-1] if '/' in str(x) else x
    )

    print(f"\nSequences across {len(file_counts)} files:")
    print(file_counts.to_string(index=False))

    print("\n✅ Test completed successfully!")


if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("STARTING CONSECUTIVE VALUES TEST")
    print("=" * 80)

    # When run as a script, turn an exception into a report plus exit code 1.
    try:
        test_consecutive_values()
    except Exception as e:
        print(f"\n❌ Error during analysis: {e}")
        traceback.print_exc()
        success = False
    else:
        success = True

    print("\n" + "=" * 80)
    if success:
        print("✅ TEST PASSED")
    else:
        print("❌ TEST FAILED")
    print("=" * 80)

    sys.exit(0 if success else 1)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test: Get RLE statistics for deltars.summary table
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
from pathlib import Path

# Add parent directory to path to import duckrun
sys.path.insert(0, str(Path(__file__).parent.parent))

import duckrun


def test_rle_summary():
    """Fetch and print RLE statistics for every table named 'summary'.

    The work lives in a function so that importing this module (for example
    during pytest collection) has no side effects beyond the sys.path tweak.
    """
    # Connect to lakehouse
    con = duckrun.connect("tmp/data.lakehouse")

    # '*.summary' matches a table called 'summary' in any schema
    rle_df = con.get_rle('*.summary')

    print("\n" + "=" * 80)
    print("RESULTS")
    print("=" * 80)
    print(rle_df.to_string(index=False))

    # Only check the shape when rows came back; an empty frame is allowed.
    if not rle_df.empty:
        expected = {'schema_name', 'table_name', 'column_name', 'total_rows',
                    'rle_runs', 'ndv', 'total_rle_runs'}
        assert expected <= set(rle_df.columns)


if __name__ == "__main__":
    test_rle_summary()
|
|
@@ -1,362 +0,0 @@
|
|
|
1
|
-
from typing import List, Dict, Tuple, Optional
|
|
2
|
-
import pandas as pd
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def get_table_stats(duckrun_con, table_name: str,
|
|
6
|
-
top_n_values: int = 10) -> pd.DataFrame:
|
|
7
|
-
"""
|
|
8
|
-
Get comprehensive table statistics including NDV and value frequency analysis.
|
|
9
|
-
|
|
10
|
-
The theory: If a value appears frequently (high repetition), it may provide better RLE compression
|
|
11
|
-
even if the column has higher NDV. This function helps identify such patterns.
|
|
12
|
-
|
|
13
|
-
Args:
|
|
14
|
-
duckrun_con: Duckrun connection (from duckrun.connect())
|
|
15
|
-
table_name: Name of the table to analyze
|
|
16
|
-
top_n_values: Number of top frequent values to show per column (default: 10)
|
|
17
|
-
|
|
18
|
-
Returns:
|
|
19
|
-
DataFrame with columns:
|
|
20
|
-
- column_name: Name of the column
|
|
21
|
-
- data_type: Data type of the column
|
|
22
|
-
- total_rows: Total number of rows
|
|
23
|
-
- null_count: Number of NULL values
|
|
24
|
-
- null_pct: Percentage of NULL values
|
|
25
|
-
- ndv: Number of distinct values (exact)
|
|
26
|
-
- cardinality_ratio: NDV / total_rows (lower = better for RLE)
|
|
27
|
-
- top_value: Most frequent value
|
|
28
|
-
- top_value_count: Count of most frequent value
|
|
29
|
-
- top_value_pct: Percentage of most frequent value
|
|
30
|
-
- top_n_coverage: Percentage covered by top N values
|
|
31
|
-
- repetition_score: Custom score indicating RLE potential (higher = better)
|
|
32
|
-
"""
|
|
33
|
-
con = duckrun_con.con # Get underlying DuckDB connection
|
|
34
|
-
from_clause = table_name
|
|
35
|
-
|
|
36
|
-
# Get column names and types
|
|
37
|
-
schema_info = con.sql(f"""
|
|
38
|
-
SELECT column_name, column_type
|
|
39
|
-
FROM (DESCRIBE SELECT * FROM {from_clause})
|
|
40
|
-
""").df()
|
|
41
|
-
|
|
42
|
-
if schema_info.empty:
|
|
43
|
-
return pd.DataFrame()
|
|
44
|
-
|
|
45
|
-
# Get total row count once
|
|
46
|
-
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
47
|
-
print(f"Analyzing {len(schema_info)} columns across {total_rows:,} rows...")
|
|
48
|
-
|
|
49
|
-
results = []
|
|
50
|
-
|
|
51
|
-
for idx, row in schema_info.iterrows():
|
|
52
|
-
col_name = row['column_name']
|
|
53
|
-
col_type = row['column_type']
|
|
54
|
-
|
|
55
|
-
print(f" [{idx+1}/{len(schema_info)}] Analyzing column: {col_name}")
|
|
56
|
-
|
|
57
|
-
# Get basic stats in one query
|
|
58
|
-
stats_query = f"""
|
|
59
|
-
SELECT
|
|
60
|
-
COUNT(*) as total,
|
|
61
|
-
COUNT({col_name}) as non_null,
|
|
62
|
-
COUNT(DISTINCT {col_name}) as ndv
|
|
63
|
-
FROM {from_clause}
|
|
64
|
-
"""
|
|
65
|
-
|
|
66
|
-
stats = con.sql(stats_query).fetchone()
|
|
67
|
-
total = stats[0]
|
|
68
|
-
non_null = stats[1]
|
|
69
|
-
ndv = stats[2]
|
|
70
|
-
null_count = total - non_null
|
|
71
|
-
null_pct = (null_count / total * 100) if total > 0 else 0
|
|
72
|
-
cardinality_ratio = (ndv / total) if total > 0 else 0
|
|
73
|
-
|
|
74
|
-
# Get top N values with their frequencies
|
|
75
|
-
top_values_query = f"""
|
|
76
|
-
SELECT
|
|
77
|
-
{col_name} as value,
|
|
78
|
-
COUNT(*) as count,
|
|
79
|
-
COUNT(*) * 100.0 / {total} as percentage
|
|
80
|
-
FROM {from_clause}
|
|
81
|
-
WHERE {col_name} IS NOT NULL
|
|
82
|
-
GROUP BY {col_name}
|
|
83
|
-
ORDER BY count DESC
|
|
84
|
-
LIMIT {top_n_values}
|
|
85
|
-
"""
|
|
86
|
-
|
|
87
|
-
top_values = con.sql(top_values_query).df()
|
|
88
|
-
|
|
89
|
-
# Extract top value info
|
|
90
|
-
if not top_values.empty:
|
|
91
|
-
top_value = top_values.iloc[0]['value']
|
|
92
|
-
top_value_count = top_values.iloc[0]['count']
|
|
93
|
-
top_value_pct = top_values.iloc[0]['percentage']
|
|
94
|
-
top_n_coverage = top_values['percentage'].sum()
|
|
95
|
-
else:
|
|
96
|
-
top_value = None
|
|
97
|
-
top_value_count = 0
|
|
98
|
-
top_value_pct = 0
|
|
99
|
-
top_n_coverage = 0
|
|
100
|
-
|
|
101
|
-
# Calculate repetition score: higher means better for RLE
|
|
102
|
-
# Score considers:
|
|
103
|
-
# 1. How much the top value covers (higher = better)
|
|
104
|
-
# 2. How much top N values cover (higher = better)
|
|
105
|
-
# 3. Inverse of cardinality ratio (lower cardinality = better)
|
|
106
|
-
repetition_score = (top_value_pct * 2 + top_n_coverage) / 3 / (cardinality_ratio + 0.01)
|
|
107
|
-
|
|
108
|
-
results.append({
|
|
109
|
-
'column_name': col_name,
|
|
110
|
-
'data_type': col_type,
|
|
111
|
-
'total_rows': total_rows,
|
|
112
|
-
'null_count': null_count,
|
|
113
|
-
'null_pct': round(null_pct, 2),
|
|
114
|
-
'ndv': ndv,
|
|
115
|
-
'cardinality_ratio': round(cardinality_ratio, 4),
|
|
116
|
-
'top_value': top_value,
|
|
117
|
-
'top_value_count': top_value_count,
|
|
118
|
-
'top_value_pct': round(top_value_pct, 2),
|
|
119
|
-
'top_n_coverage': round(top_n_coverage, 2),
|
|
120
|
-
'repetition_score': round(repetition_score, 2)
|
|
121
|
-
})
|
|
122
|
-
|
|
123
|
-
df = pd.DataFrame(results)
|
|
124
|
-
|
|
125
|
-
# Sort by repetition score (best RLE candidates first)
|
|
126
|
-
df = df.sort_values('repetition_score', ascending=False).reset_index(drop=True)
|
|
127
|
-
|
|
128
|
-
print(f"\n✓ Analysis complete!")
|
|
129
|
-
print(f"\nTop columns by repetition score (best RLE candidates):")
|
|
130
|
-
for idx, row in df.head(5).iterrows():
|
|
131
|
-
print(f" {idx+1}. {row['column_name']}: score={row['repetition_score']}, "
|
|
132
|
-
f"top_value_pct={row['top_value_pct']}%, ndv={row['ndv']:,}")
|
|
133
|
-
|
|
134
|
-
return df
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
def get_value_frequency_details(duckrun_con, table_name: str, column_name: str,
|
|
138
|
-
limit: int = 20) -> pd.DataFrame:
|
|
139
|
-
"""
|
|
140
|
-
Get detailed value frequency distribution for a specific column.
|
|
141
|
-
|
|
142
|
-
Shows the most frequent values and their counts/percentages.
|
|
143
|
-
Useful for understanding repetition patterns that drive RLE compression.
|
|
144
|
-
|
|
145
|
-
Args:
|
|
146
|
-
duckrun_con: Duckrun connection (from duckrun.connect())
|
|
147
|
-
table_name: Name of the table to analyze
|
|
148
|
-
column_name: Name of the column to analyze
|
|
149
|
-
limit: Maximum number of values to return (default: 20)
|
|
150
|
-
|
|
151
|
-
Returns:
|
|
152
|
-
DataFrame with columns:
|
|
153
|
-
- value: The distinct value
|
|
154
|
-
- count: Number of occurrences
|
|
155
|
-
- percentage: Percentage of total rows
|
|
156
|
-
- cumulative_pct: Cumulative percentage
|
|
157
|
-
"""
|
|
158
|
-
con = duckrun_con.con # Get underlying DuckDB connection
|
|
159
|
-
from_clause = table_name
|
|
160
|
-
|
|
161
|
-
# Get total row count
|
|
162
|
-
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
163
|
-
|
|
164
|
-
# Get value frequencies
|
|
165
|
-
query = f"""
|
|
166
|
-
WITH value_counts AS (
|
|
167
|
-
SELECT
|
|
168
|
-
{column_name} as value,
|
|
169
|
-
COUNT(*) as count,
|
|
170
|
-
COUNT(*) * 100.0 / {total_rows} as percentage
|
|
171
|
-
FROM {from_clause}
|
|
172
|
-
WHERE {column_name} IS NOT NULL
|
|
173
|
-
GROUP BY {column_name}
|
|
174
|
-
ORDER BY count DESC
|
|
175
|
-
LIMIT {limit}
|
|
176
|
-
)
|
|
177
|
-
SELECT
|
|
178
|
-
value,
|
|
179
|
-
count,
|
|
180
|
-
percentage,
|
|
181
|
-
SUM(percentage) OVER (ORDER BY count DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as cumulative_pct
|
|
182
|
-
FROM value_counts
|
|
183
|
-
ORDER BY count DESC
|
|
184
|
-
"""
|
|
185
|
-
|
|
186
|
-
df = con.sql(query).df()
|
|
187
|
-
|
|
188
|
-
# Round percentages
|
|
189
|
-
if not df.empty:
|
|
190
|
-
df['percentage'] = df['percentage'].round(2)
|
|
191
|
-
df['cumulative_pct'] = df['cumulative_pct'].round(2)
|
|
192
|
-
|
|
193
|
-
return df
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
def find_optimal_sort_order(duckrun_con, table_name: str,
|
|
197
|
-
max_combinations: int = 10) -> pd.DataFrame:
|
|
198
|
-
"""
|
|
199
|
-
Determine optimal sort order using V-Order-like logic: pure compression testing.
|
|
200
|
-
|
|
201
|
-
This mimics how VertiPaq/V-Order actually works:
|
|
202
|
-
1. Calculate cardinality for each column
|
|
203
|
-
2. Test different sort orderings
|
|
204
|
-
3. Measure actual RLE run counts for each ordering
|
|
205
|
-
4. Pick the ordering with best overall compression (fewest total runs)
|
|
206
|
-
|
|
207
|
-
NO semantic understanding, NO query pattern assumptions.
|
|
208
|
-
Pure mechanical testing of compression effectiveness.
|
|
209
|
-
|
|
210
|
-
Args:
|
|
211
|
-
duckrun_con: Duckrun connection (from duckrun.connect())
|
|
212
|
-
table_name: Name of the table to analyze
|
|
213
|
-
max_combinations: Maximum sort orderings to test (default: 10)
|
|
214
|
-
|
|
215
|
-
Returns:
|
|
216
|
-
DataFrame with tested orderings ranked by compression effectiveness
|
|
217
|
-
"""
|
|
218
|
-
from itertools import permutations
|
|
219
|
-
|
|
220
|
-
con = duckrun_con.con # Get underlying DuckDB connection
|
|
221
|
-
from_clause = table_name
|
|
222
|
-
|
|
223
|
-
# Get column names and cardinalities
|
|
224
|
-
print("Step 1: Analyzing column cardinalities...")
|
|
225
|
-
schema_info = con.sql(f"""
|
|
226
|
-
SELECT column_name, column_type
|
|
227
|
-
FROM (DESCRIBE SELECT * FROM {from_clause})
|
|
228
|
-
""").df()
|
|
229
|
-
|
|
230
|
-
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
231
|
-
|
|
232
|
-
# Calculate NDV for each column
|
|
233
|
-
cardinality_map = {}
|
|
234
|
-
for _, row in schema_info.iterrows():
|
|
235
|
-
col = row['column_name']
|
|
236
|
-
ndv = con.sql(f"SELECT COUNT(DISTINCT {col}) FROM {from_clause}").fetchone()[0]
|
|
237
|
-
cardinality_ratio = ndv / total_rows
|
|
238
|
-
cardinality_map[col] = {'ndv': ndv, 'ratio': cardinality_ratio}
|
|
239
|
-
print(f" {col}: {ndv:,} distinct ({cardinality_ratio*100:.4f}%)")
|
|
240
|
-
|
|
241
|
-
# Filter to low-cardinality columns only (< 1% cardinality)
|
|
242
|
-
# High cardinality columns won't benefit from reordering
|
|
243
|
-
low_card_cols = [col for col, stats in cardinality_map.items()
|
|
244
|
-
if stats['ratio'] < 0.01]
|
|
245
|
-
|
|
246
|
-
print(f"\nStep 2: Testing sort orderings for {len(low_card_cols)} low-cardinality columns...")
|
|
247
|
-
print(f"Columns to test: {', '.join(low_card_cols)}")
|
|
248
|
-
|
|
249
|
-
if len(low_card_cols) < 2:
|
|
250
|
-
print("Not enough columns to test different orderings!")
|
|
251
|
-
return pd.DataFrame()
|
|
252
|
-
|
|
253
|
-
# Generate candidate orderings
|
|
254
|
-
# Start with cardinality-based orderings
|
|
255
|
-
sorted_by_card = sorted(low_card_cols, key=lambda c: cardinality_map[c]['ndv'])
|
|
256
|
-
|
|
257
|
-
test_orderings = [
|
|
258
|
-
sorted_by_card, # Lowest cardinality first
|
|
259
|
-
sorted_by_card[::-1], # Highest cardinality first
|
|
260
|
-
]
|
|
261
|
-
|
|
262
|
-
# Add some permutations of top 3 columns
|
|
263
|
-
if len(low_card_cols) >= 3:
|
|
264
|
-
for perm in permutations(sorted_by_card[:3]):
|
|
265
|
-
if list(perm) not in test_orderings:
|
|
266
|
-
test_orderings.append(list(perm))
|
|
267
|
-
if len(test_orderings) >= max_combinations:
|
|
268
|
-
break
|
|
269
|
-
|
|
270
|
-
# Test each ordering by calculating actual RLE runs
|
|
271
|
-
print(f"\nStep 3: Testing {len(test_orderings)} different orderings...")
|
|
272
|
-
results = []
|
|
273
|
-
|
|
274
|
-
for idx, ordering in enumerate(test_orderings, 1):
|
|
275
|
-
print(f"\n[{idx}/{len(test_orderings)}] Testing: {' → '.join(ordering)}")
|
|
276
|
-
|
|
277
|
-
# Calculate RLE runs for each column with this ordering
|
|
278
|
-
# We'll sort the data by the ordering and count runs
|
|
279
|
-
order_clause = ', '.join(ordering)
|
|
280
|
-
|
|
281
|
-
column_rle = {}
|
|
282
|
-
for col in schema_info['column_name']:
|
|
283
|
-
# Count runs: a new run starts when value changes
|
|
284
|
-
rle_query = f"""
|
|
285
|
-
WITH sorted_data AS (
|
|
286
|
-
SELECT
|
|
287
|
-
{col},
|
|
288
|
-
ROW_NUMBER() OVER (ORDER BY {order_clause}) as rn
|
|
289
|
-
FROM {from_clause}
|
|
290
|
-
),
|
|
291
|
-
with_prev AS (
|
|
292
|
-
SELECT
|
|
293
|
-
{col},
|
|
294
|
-
LAG({col}) OVER (ORDER BY rn) as prev_val
|
|
295
|
-
FROM sorted_data
|
|
296
|
-
)
|
|
297
|
-
SELECT COUNT(*) as runs
|
|
298
|
-
FROM with_prev
|
|
299
|
-
WHERE prev_val IS NULL OR {col} != prev_val OR {col} IS NULL OR prev_val IS NULL
|
|
300
|
-
"""
|
|
301
|
-
|
|
302
|
-
runs = con.sql(rle_query).fetchone()[0]
|
|
303
|
-
column_rle[col] = runs
|
|
304
|
-
print(f" {col}: {runs:,} runs")
|
|
305
|
-
|
|
306
|
-
total_runs = sum(column_rle.values())
|
|
307
|
-
print(f" TOTAL: {total_runs:,} runs")
|
|
308
|
-
|
|
309
|
-
results.append({
|
|
310
|
-
'sort_order': ' → '.join(ordering),
|
|
311
|
-
'total_runs': total_runs,
|
|
312
|
-
'compression_score': total_rows / total_runs, # Higher = better compression
|
|
313
|
-
**column_rle
|
|
314
|
-
})
|
|
315
|
-
|
|
316
|
-
# Create results DataFrame
|
|
317
|
-
df = pd.DataFrame(results)
|
|
318
|
-
df = df.sort_values('total_runs').reset_index(drop=True)
|
|
319
|
-
|
|
320
|
-
print("\n" + "=" * 80)
|
|
321
|
-
print("RESULTS: Best to Worst Compression")
|
|
322
|
-
print("=" * 80)
|
|
323
|
-
|
|
324
|
-
for idx, row in df.iterrows():
|
|
325
|
-
print(f"\n{idx + 1}. {row['sort_order']}")
|
|
326
|
-
print(f" Total runs: {row['total_runs']:,}")
|
|
327
|
-
print(f" Compression score: {row['compression_score']:.2f}x")
|
|
328
|
-
if idx == 0:
|
|
329
|
-
print(" ⭐ BEST COMPRESSION")
|
|
330
|
-
|
|
331
|
-
print("\n" + "=" * 80)
|
|
332
|
-
print("CONCLUSION")
|
|
333
|
-
print("=" * 80)
|
|
334
|
-
best = df.iloc[0]
|
|
335
|
-
print(f"\nOptimal sort order: {best['sort_order']}")
|
|
336
|
-
print(f"This ordering achieves the fewest total RLE runs ({best['total_runs']:,})")
|
|
337
|
-
print(f"\nThis is how V-Order actually works:")
|
|
338
|
-
print("✓ No query pattern assumptions")
|
|
339
|
-
print("✓ No semantic understanding")
|
|
340
|
-
print("✓ Pure compression effectiveness testing")
|
|
341
|
-
print("✓ Mechanical optimization based on data patterns")
|
|
342
|
-
|
|
343
|
-
return df
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
# Example usage:
|
|
347
|
-
#
|
|
348
|
-
# import duckrun
|
|
349
|
-
#
|
|
350
|
-
# con = duckrun.connect('workspace/lakehouse.lakehouse')
|
|
351
|
-
#
|
|
352
|
-
# # Get RLE statistics:
|
|
353
|
-
# stats_df = con.get_rle_stats('my_table', top_n_values=10)
|
|
354
|
-
# print(stats_df)
|
|
355
|
-
#
|
|
356
|
-
# # Detailed frequency distribution for a specific column:
|
|
357
|
-
# freq_df = con.get_value_frequency('my_table', 'status_column', limit=20)
|
|
358
|
-
# print(freq_df)
|
|
359
|
-
#
|
|
360
|
-
# # Find optimal sort order (V-Order simulation):
|
|
361
|
-
# optimal_df = con.find_optimal_sort_order('my_table', max_combinations=10)
|
|
362
|
-
# print(optimal_df)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|