duckrun 0.2.19.dev4__tar.gz → 0.2.19.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/rle.py +10 -23
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/pyproject.toml +1 -1
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/LICENSE +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/README.md +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/core.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/setup.cfg +0 -0
- {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/tests/test_rle.py +0 -0
|
@@ -238,22 +238,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
238
238
|
Calculate cardinality ratio for each column (distinct_values / total_rows).
|
|
239
239
|
Lower ratio = better for RLE compression (more repetition).
|
|
240
240
|
|
|
241
|
-
NEVER uses sampling - always scans full dataset with exact
|
|
241
|
+
NEVER uses sampling - always scans full dataset with exact distinct counts.
|
|
242
242
|
|
|
243
243
|
Args:
|
|
244
244
|
con: DuckDB connection
|
|
245
245
|
source: Either a table name (default) or parquet file path
|
|
246
246
|
limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
|
|
247
247
|
is_parquet: If True, source is a parquet file path; if False, source is a table name
|
|
248
|
-
use_approx:
|
|
249
|
-
|
|
250
|
-
approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
|
|
248
|
+
use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
|
|
249
|
+
approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
|
|
251
250
|
|
|
252
251
|
Returns:
|
|
253
252
|
Dictionary mapping column names to dict with keys:
|
|
254
253
|
- 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
|
|
255
254
|
- 'total_rows': total row count
|
|
256
|
-
- 'distinct_values': number of distinct values (exact
|
|
255
|
+
- 'distinct_values': number of distinct values (exact)
|
|
257
256
|
"""
|
|
258
257
|
# Build the FROM clause based on source type
|
|
259
258
|
if is_parquet:
|
|
@@ -274,26 +273,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
274
273
|
if not column_names:
|
|
275
274
|
return {}
|
|
276
275
|
|
|
277
|
-
#
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
281
|
-
use_approx = total_rows > approx_threshold
|
|
282
|
-
if use_approx:
|
|
283
|
-
print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
|
|
284
|
-
else:
|
|
285
|
-
print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
|
|
286
|
-
else:
|
|
287
|
-
total_rows = None # Will be calculated in main query
|
|
276
|
+
# Get row count
|
|
277
|
+
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
278
|
+
print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
|
|
288
279
|
|
|
289
|
-
# Build a single query that calculates all
|
|
280
|
+
# Build a single query that calculates all cardinality in one pass
|
|
290
281
|
# This scans the data only ONCE instead of once per column
|
|
291
282
|
select_clauses = []
|
|
292
283
|
for col in column_names:
|
|
293
|
-
|
|
294
|
-
select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
|
|
295
|
-
else:
|
|
296
|
-
select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
|
|
284
|
+
select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
|
|
297
285
|
|
|
298
286
|
query = f"""
|
|
299
287
|
SELECT
|
|
@@ -307,8 +295,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
307
295
|
if not result:
|
|
308
296
|
return {}
|
|
309
297
|
|
|
310
|
-
|
|
311
|
-
total_rows = result[0]
|
|
298
|
+
total_rows = result[0]
|
|
312
299
|
|
|
313
300
|
nfv_stats = {}
|
|
314
301
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev4"
|
|
7
|
+
version = "0.2.19.dev5"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|