duckrun 0.2.19.dev4__tar.gz → 0.2.19.dev5__tar.gz

This diff shows the changes between two publicly available package versions, each released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (22)
  1. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/rle.py +10 -23
  3. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/PKG-INFO +1 -1
  4. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/pyproject.toml +1 -1
  5. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/LICENSE +0 -0
  6. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/README.md +0 -0
  7. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/__init__.py +0 -0
  8. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/auth.py +0 -0
  9. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/core.py +0 -0
  10. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/files.py +0 -0
  11. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/lakehouse.py +0 -0
  12. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/notebook.py +0 -0
  13. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/runner.py +0 -0
  14. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/semantic_model.py +0 -0
  15. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/stats.py +0 -0
  16. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun/writer.py +0 -0
  17. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/SOURCES.txt +0 -0
  18. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/dependency_links.txt +0 -0
  19. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/requires.txt +0 -0
  20. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/duckrun.egg-info/top_level.txt +0 -0
  21. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/setup.cfg +0 -0
  22. {duckrun-0.2.19.dev4 → duckrun-0.2.19.dev5}/tests/test_rle.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev4
3
+ Version: 0.2.19.dev5
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -238,22 +238,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
238
238
  Calculate cardinality ratio for each column (distinct_values / total_rows).
239
239
  Lower ratio = better for RLE compression (more repetition).
240
240
 
241
- NEVER uses sampling - always scans full dataset with exact or approximate distinct counts.
241
+ NEVER uses sampling - always scans full dataset with exact distinct counts.
242
242
 
243
243
  Args:
244
244
  con: DuckDB connection
245
245
  source: Either a table name (default) or parquet file path
246
246
  limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
247
247
  is_parquet: If True, source is a parquet file path; if False, source is a table name
248
- use_approx: If True, use HyperLogLog (approx). If False, use exact COUNT(DISTINCT).
249
- If None (default), auto-decide based on table size threshold.
250
- approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
248
+ use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
249
+ approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
251
250
 
252
251
  Returns:
253
252
  Dictionary mapping column names to dict with keys:
254
253
  - 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
255
254
  - 'total_rows': total row count
256
- - 'distinct_values': number of distinct values (exact or approximate)
255
+ - 'distinct_values': number of distinct values (exact)
257
256
  """
258
257
  # Build the FROM clause based on source type
259
258
  if is_parquet:
@@ -274,26 +273,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
274
273
  if not column_names:
275
274
  return {}
276
275
 
277
- # Auto-decide whether to use approximate or exact based on table size
278
- if use_approx is None:
279
- # Quick row count check
280
- total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
281
- use_approx = total_rows > approx_threshold
282
- if use_approx:
283
- print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
284
- else:
285
- print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
286
- else:
287
- total_rows = None # Will be calculated in main query
276
+ # Get row count
277
+ total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
278
+ print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
288
279
 
289
- # Build a single query that calculates all NFV scores in one pass
280
+ # Build a single query that calculates all cardinality in one pass
290
281
  # This scans the data only ONCE instead of once per column
291
282
  select_clauses = []
292
283
  for col in column_names:
293
- if use_approx:
294
- select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
295
- else:
296
- select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
284
+ select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
297
285
 
298
286
  query = f"""
299
287
  SELECT
@@ -307,8 +295,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
307
295
  if not result:
308
296
  return {}
309
297
 
310
- if total_rows is None:
311
- total_rows = result[0]
298
+ total_rows = result[0]
312
299
 
313
300
  nfv_stats = {}
314
301
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev4
3
+ Version: 0.2.19.dev5
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev4"
7
+ version = "0.2.19.dev5"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes
File without changes