supertable 2.3.2__tar.gz → 2.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supertable-2.3.2/supertable.egg-info → supertable-2.3.3}/PKG-INFO +1 -1
- {supertable-2.3.2 → supertable-2.3.3}/pyproject.toml +1 -1
- {supertable-2.3.2 → supertable-2.3.3}/setup.py +1 -1
- {supertable-2.3.2 → supertable-2.3.3}/supertable/__init__.py +1 -1
- {supertable-2.3.2 → supertable-2.3.3}/supertable/data_writer.py +97 -22
- {supertable-2.3.2 → supertable-2.3.3}/supertable/processing.py +27 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer.py +284 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing_compact_resources.py +93 -0
- {supertable-2.3.2 → supertable-2.3.3/supertable.egg-info}/PKG-INFO +1 -1
- {supertable-2.3.2 → supertable-2.3.3}/LICENSE +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/README.md +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/requirements.txt +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/setup.cfg +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/admin.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/chain.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/consumers.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/crypto.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/events.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/export.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/logger.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/middleware.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/reader.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/retention.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_chain.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_crypto.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_emit.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_events.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_retention.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/writer_parquet.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/writer_redis.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/defaults.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/homedir.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/settings.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/test_defaults.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/test_homedir.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/test_settings.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/data_classes.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/data_reader.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/__main__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/check_filter_builder.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/controller.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/defaults.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/dummy_data.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/read_parquet_header.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/core.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/defaults.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/generate.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/load.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/topup.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/data_estimator.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/duckdb_lite.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/duckdb_pro.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/engine_common.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/engine_config.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/engine_enum.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/executor.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/plan_stats.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/spark_thrift.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/conftest.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine_config.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine_routing.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine_spill.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/errors.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/file_lock.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/redis_lock.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/tests/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/tests/test_file_lock.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/tests/test_redis_lock.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/logging.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/meta_reader.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_delta.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_formats.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_iceberg.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_parquet.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/monitoring/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/monitoring/partitions.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/monitoring_writer.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/plan_extender.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/query_plan_manager.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/access_control.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/filter_builder.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/permissions.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/role_manager.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/row_column_security.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/tests/test_filter_builder.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/tests/test_rbac.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/user_manager.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_catalog.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_connector.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_infra.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_keys.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/simple_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/staging_area.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/azure_storage.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/gcp_storage.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/local_storage.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/minio_storage.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/s3_storage.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/storage_factory.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/storage_interface.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/tests/test_storage.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/super_pipe.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/super_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/system_query.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_align_to_schema_fix.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_create_if_missing.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_reader.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_reader_preflight.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer_compact.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer_comprehensive.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_errors.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_meta_reader.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_monitoring_partitions.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_monitoring_sink_guard.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_newer_than.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_parquet_statistics.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing_stats.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_query_sql.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_read_pruning_differential.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_read_pruning_integration.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_redis_key_prefix.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_resolve_overwrite_writes.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_simple_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_stats_cache.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_stats_pruning.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_stats_schema_snapshot.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_super_table.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_supertable_all.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_system_query.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/__init__.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/helper.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/profiler.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/sql_parser.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/timer.py +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/SOURCES.txt +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/dependency_links.txt +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/entry_points.txt +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/requires.txt +0 -0
- {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/top_level.txt +0 -0
|
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
|
|
|
19
19
|
|
|
20
20
|
setup(
|
|
21
21
|
name="supertable",
|
|
22
|
-
version="2.3.2",
|
|
22
|
+
version="2.3.3",
|
|
23
23
|
description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
|
|
24
24
|
long_description=long_description,
|
|
25
25
|
long_description_content_type="text/markdown",
|
|
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
|
|
|
25
25
|
project documentation for the full API surface.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
__version__ = "2.3.2"
|
|
28
|
+
__version__ = "2.3.3"
|
|
29
29
|
|
|
30
30
|
# Re-export the core public surface so users can do ``from supertable import …``
|
|
31
31
|
# instead of remembering submodule paths.
|
|
@@ -35,6 +35,7 @@ from supertable.processing import (
|
|
|
35
35
|
write_parquet_and_collect_resources,
|
|
36
36
|
compact_resources,
|
|
37
37
|
compact_tombstones,
|
|
38
|
+
should_compact_small_files,
|
|
38
39
|
_max_tombstone_rows,
|
|
39
40
|
_read_parquet_safe,
|
|
40
41
|
)
|
|
@@ -527,28 +528,52 @@ class DataWriter:
|
|
|
527
528
|
)
|
|
528
529
|
mark("build_tombstone")
|
|
529
530
|
|
|
530
|
-
# 4. Threshold compaction
|
|
531
|
-
#
|
|
532
|
-
|
|
531
|
+
# 4. Threshold compaction (two triggers, same physical step):
|
|
532
|
+
# (a) the deletion-vector grew past max_tombstone_rows, or
|
|
533
|
+
# (b) the small files tripped the auto-compaction gate.
|
|
534
|
+
# Both must FIRST physically drop tombstoned rows (Phase A)
|
|
535
|
+
# and only THEN merge small files (Phase B): compact_resources
|
|
536
|
+
# rewrites data files WITHOUT consulting the deletion-vector,
|
|
537
|
+
# so sunsetting a vector-referenced file would orphan its dead
|
|
538
|
+
# rows (hidden on read, never reclaimable). Draining first
|
|
539
|
+
# guarantees Phase B only ever sees vector-free survivors.
|
|
540
|
+
post_write_resources = (
|
|
541
|
+
(last_simple_table.get("resources") or []) + new_resources
|
|
542
|
+
)
|
|
543
|
+
compaction_gate = should_compact_small_files(
|
|
544
|
+
post_write_resources, table_config
|
|
545
|
+
)
|
|
546
|
+
tombstone_threshold_hit = (
|
|
533
547
|
combined_tombstone_df is not None
|
|
534
548
|
and combined_tombstone_df.height >= _max_tombstone_rows(table_config)
|
|
535
|
-
)
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
# Phase A — drain the deletion-vector when either trigger fires
|
|
552
|
+
# and a vector is actually live (freshly built this write OR
|
|
553
|
+
# carried forward from a prior one).
|
|
554
|
+
if tombstone_threshold_hit or compaction_gate:
|
|
555
|
+
dv_to_drain = combined_tombstone_df
|
|
556
|
+
if dv_to_drain is None and tombstone_path:
|
|
557
|
+
# Pure carry-forward: load the live vector so the merge
|
|
558
|
+
# below never sunsets a file it still references.
|
|
559
|
+
dv_to_drain = _read_parquet_safe(tombstone_path, profiler=profiler)
|
|
560
|
+
if dv_to_drain is not None and dv_to_drain.height > 0:
|
|
561
|
+
removed, tomb_new, tomb_sunset = compact_tombstones(
|
|
562
|
+
snapshot=last_simple_table,
|
|
563
|
+
tombstone_df=dv_to_drain,
|
|
564
|
+
data_dir=simple_table.data_dir,
|
|
565
|
+
compression_level=compression_level,
|
|
566
|
+
table_config=table_config,
|
|
567
|
+
profiler=profiler,
|
|
568
|
+
)
|
|
569
|
+
new_resources.extend(tomb_new)
|
|
570
|
+
sunset_files |= tomb_sunset
|
|
571
|
+
tombstone_path = None # deletion-vector fully consumed
|
|
572
|
+
tombstone_rows = 0
|
|
573
|
+
logger.info(lp(
|
|
574
|
+
f"tombstone compaction removed {removed} rows "
|
|
575
|
+
f"from {len(tomb_sunset)} files"
|
|
576
|
+
))
|
|
552
577
|
|
|
553
578
|
# 5. Pin the (carried-forward / new / cleared) tombstone pointer
|
|
554
579
|
# and its row count.
|
|
@@ -556,6 +581,45 @@ class DataWriter:
|
|
|
556
581
|
last_simple_table["tombstone_rows"] = tombstone_rows
|
|
557
582
|
mark("compact_tombstones")
|
|
558
583
|
|
|
584
|
+
# Phase B — auto small-file compaction. Merge the accumulated
|
|
585
|
+
# small files (existing survivors + the file just written) once
|
|
586
|
+
# the gate is open so the file count stays bounded. The vector
|
|
587
|
+
# was drained above, so every surviving file is safe to sunset.
|
|
588
|
+
# Result folds into the SAME snapshot commit below (new_resources
|
|
589
|
+
# / sunset_files feed build_stats and simple_table.update).
|
|
590
|
+
compaction_ran = False
|
|
591
|
+
if compaction_gate:
|
|
592
|
+
live_resources = [
|
|
593
|
+
r for r in (last_simple_table.get("resources") or [])
|
|
594
|
+
if r.get("file") not in sunset_files
|
|
595
|
+
]
|
|
596
|
+
live_resources += [
|
|
597
|
+
r for r in new_resources if r.get("file") not in sunset_files
|
|
598
|
+
]
|
|
599
|
+
considered, comp_rows, comp_new, comp_sunset = compact_resources(
|
|
600
|
+
snapshot={"resources": live_resources},
|
|
601
|
+
data_dir=simple_table.data_dir,
|
|
602
|
+
compression_level=compression_level,
|
|
603
|
+
table_config=table_config,
|
|
604
|
+
small_only=True,
|
|
605
|
+
)
|
|
606
|
+
if comp_new or comp_sunset:
|
|
607
|
+
sunset_files |= comp_sunset
|
|
608
|
+
# A file written above (incoming or tombstone survivor)
|
|
609
|
+
# may have been re-merged here; drop any new_resources
|
|
610
|
+
# entry that is now sunset so the snapshot never lists a
|
|
611
|
+
# file as both live and gone.
|
|
612
|
+
new_resources = [
|
|
613
|
+
r for r in (new_resources + comp_new)
|
|
614
|
+
if r.get("file") not in sunset_files
|
|
615
|
+
]
|
|
616
|
+
compaction_ran = True
|
|
617
|
+
logger.info(lp(
|
|
618
|
+
f"auto-compaction merged {considered} small files "
|
|
619
|
+
f"into {len(comp_new)} file(s) ({comp_rows} rows)"
|
|
620
|
+
))
|
|
621
|
+
mark("compact_small")
|
|
622
|
+
|
|
559
623
|
# 6. Carry forward + extend the external column-statistics parquet.
|
|
560
624
|
# Read the footers of the newly written data files, drop the
|
|
561
625
|
# rows of any sunset file, and append the new ones. No new
|
|
@@ -614,7 +678,18 @@ class DataWriter:
|
|
|
614
678
|
# model_df would shrink schema / schemaString to that partial
|
|
615
679
|
# shape even though all parquet files still have full schema.
|
|
616
680
|
# See docs/03_data_model.md "Schema Field Semantics".
|
|
617
|
-
|
|
681
|
+
#
|
|
682
|
+
# When auto-compaction merged files this write, derive the
|
|
683
|
+
# schema from the compacted output instead: a merged file may
|
|
684
|
+
# union in columns from older files that the incoming frame
|
|
685
|
+
# lacks (schema-evolving tables), so `dataframe` would narrow
|
|
686
|
+
# the metadata even though the Parquet is wider.
|
|
687
|
+
if compaction_ran:
|
|
688
|
+
schema_model_df = self._build_compact_model_df(
|
|
689
|
+
new_resources, last_simple_table
|
|
690
|
+
)
|
|
691
|
+
else:
|
|
692
|
+
schema_model_df = None if delete_only else dataframe
|
|
618
693
|
new_snapshot_dict, new_snapshot_path = simple_table.update(
|
|
619
694
|
new_resources, sunset_files, schema_model_df,
|
|
620
695
|
last_snapshot=last_simple_table,
|
|
@@ -720,7 +795,7 @@ class DataWriter:
|
|
|
720
795
|
f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
|
|
721
796
|
f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
|
|
722
797
|
f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
|
|
723
|
-
f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
|
|
798
|
+
f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
|
|
724
799
|
f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
|
|
725
800
|
f"mirror={timings.get('mirror', 0):.3f} | prepare_monitor={timings.get('prepare_monitor', 0):.3f}"
|
|
726
801
|
)
|
|
@@ -292,6 +292,33 @@ def prune_not_overlapping_files_by_threshold(
|
|
|
292
292
|
return result
|
|
293
293
|
|
|
294
294
|
|
|
295
|
+
def should_compact_small_files(
|
|
296
|
+
resources: List[Dict],
|
|
297
|
+
table_config: Optional[dict] = None,
|
|
298
|
+
) -> bool:
|
|
299
|
+
"""Return True when accumulated small files trip the auto-compaction gate.
|
|
300
|
+
|
|
301
|
+
Mirrors the threshold in ``prune_not_overlapping_files_by_threshold``: a
|
|
302
|
+
file is "small" when its ``file_size`` is strictly smaller than
|
|
303
|
+
``max_memory_chunk_size``. The gate opens when EITHER the small-file count
|
|
304
|
+
reaches ``max_overlapping_files`` OR the combined small-file size exceeds
|
|
305
|
+
``max_memory_chunk_size``. Files already at/above the chunk size are big
|
|
306
|
+
enough on their own and are never counted.
|
|
307
|
+
|
|
308
|
+
``resources`` is a snapshot's resource list (dicts with ``file`` /
|
|
309
|
+
``file_size``). Limits resolve per-table via ``_resolve_limits``.
|
|
310
|
+
"""
|
|
311
|
+
max_mem, max_files = _resolve_limits(table_config)
|
|
312
|
+
small_sizes = [
|
|
313
|
+
int(r.get("file_size") or 0)
|
|
314
|
+
for r in (resources or [])
|
|
315
|
+
if r.get("file") and int(r.get("file_size") or 0) < max_mem
|
|
316
|
+
]
|
|
317
|
+
if not small_sizes:
|
|
318
|
+
return False
|
|
319
|
+
return len(small_sizes) >= max_files or sum(small_sizes) > max_mem
|
|
320
|
+
|
|
321
|
+
|
|
295
322
|
# =========================
|
|
296
323
|
# Public API: Overlap selection (with compaction triggers)
|
|
297
324
|
# =========================
|
|
@@ -48,6 +48,16 @@ _PATCH_BUILD_TOMBSTONE = f"{_MOD}.build_tombstone_file"
|
|
|
48
48
|
_PATCH_MIRROR = f"{_MOD}.MirrorFormats"
|
|
49
49
|
_PATCH_GET_MON_LOGGER = f"{_MOD}.MonitoringWriter"
|
|
50
50
|
_PATCH_UUID4 = f"{_MOD}.uuid.uuid4"
|
|
51
|
+
# Auto-compaction step (Phase A drain + Phase B small-file merge) wired into
|
|
52
|
+
# write(). The gate (should_compact_small_files) is left UNMOCKED so tests
|
|
53
|
+
# drive the REAL threshold off the snapshot's resource list; the heavy merge
|
|
54
|
+
# helpers and the stats writers (which would otherwise touch storage once files
|
|
55
|
+
# are sunset) are mocked so the tests pin orchestration, not Parquet I/O.
|
|
56
|
+
_PATCH_COMPACT_RES = f"{_MOD}.compact_resources"
|
|
57
|
+
_PATCH_COMPACT_TOMB = f"{_MOD}.compact_tombstones"
|
|
58
|
+
_PATCH_READ_PARQUET = f"{_MOD}._read_parquet_safe"
|
|
59
|
+
_PATCH_EXTRACT_STATS = f"{_MOD}.extract_stats_rows"
|
|
60
|
+
_PATCH_BUILD_STATS = f"{_MOD}.build_stats_file"
|
|
51
61
|
|
|
52
62
|
|
|
53
63
|
# ---------------------------------------------------------------------------
|
|
@@ -1814,3 +1824,277 @@ class TestWriteOverwriteResolution:
|
|
|
1814
1824
|
assert kwargs["newer_than_col"] == "ts"
|
|
1815
1825
|
# The single returned delete pair drives the deleted count.
|
|
1816
1826
|
assert result[3] == 1
|
|
1827
|
+
|
|
1828
|
+
|
|
1829
|
+
# ====================================================================
|
|
1830
|
+
# 12. DataWriter.write — Inline Auto-Compaction (small-file gate)
|
|
1831
|
+
# ====================================================================
|
|
1832
|
+
|
|
1833
|
+
def _small_resources(n: int, *, size: int = 80 * 1024) -> List[Dict]:
|
|
1834
|
+
"""N small-file resource dicts that trip should_compact_small_files'
|
|
1835
|
+
REAL count gate (default MAX_OVERLAPPING_FILES=100) once n >= 100.
|
|
1836
|
+
|
|
1837
|
+
Only ``file`` / ``file_size`` matter — the gate ignores everything else,
|
|
1838
|
+
and the merge helper is mocked so the files are never opened."""
|
|
1839
|
+
return [
|
|
1840
|
+
{"file": f"small_{i}.parquet", "file_size": size, "rows": 100}
|
|
1841
|
+
for i in range(n)
|
|
1842
|
+
]
|
|
1843
|
+
|
|
1844
|
+
|
|
1845
|
+
def _mk_compaction_catalog():
|
|
1846
|
+
cat = MagicMock()
|
|
1847
|
+
cat.reserve_rowids.return_value = 0
|
|
1848
|
+
cat.get_table_config.return_value = None # → default limits (100 / 16MB)
|
|
1849
|
+
cat.acquire_simple_lock.return_value = "t"
|
|
1850
|
+
cat.release_simple_lock.return_value = True
|
|
1851
|
+
cat.set_leaf_payload_cas.return_value = 1
|
|
1852
|
+
cat.bump_root.return_value = 1
|
|
1853
|
+
return cat
|
|
1854
|
+
|
|
1855
|
+
|
|
1856
|
+
class TestWriteAutoCompaction:
|
|
1857
|
+
"""The user's bug: small files accumulated forever because automatic
|
|
1858
|
+
compaction was never wired into write() — only the manual compact()
|
|
1859
|
+
entry point merged them. These tests pin the inline step: the gate is
|
|
1860
|
+
checked on every write, draining the deletion-vector FIRST (Phase A)
|
|
1861
|
+
so the small-file merge (Phase B) can never sunset a vector-referenced
|
|
1862
|
+
file, and the merged output folds into the SAME snapshot commit."""
|
|
1863
|
+
|
|
1864
|
+
@patch(_PATCH_COMPACT_RES)
|
|
1865
|
+
@patch(_PATCH_COMPACT_TOMB)
|
|
1866
|
+
@patch(_PATCH_READ_PARQUET)
|
|
1867
|
+
@patch(_PATCH_BUILD_STATS)
|
|
1868
|
+
@patch(_PATCH_EXTRACT_STATS)
|
|
1869
|
+
@patch(_PATCH_BUILD_TOMBSTONE)
|
|
1870
|
+
@patch(_PATCH_GET_MON_LOGGER)
|
|
1871
|
+
@patch(_PATCH_MIRROR)
|
|
1872
|
+
@patch(_PATCH_PROCESS_OVERLAP)
|
|
1873
|
+
@patch(_PATCH_RESOLVE)
|
|
1874
|
+
@patch(_PATCH_FIND_OVERLAP)
|
|
1875
|
+
@patch(_PATCH_SIMPLE_TABLE)
|
|
1876
|
+
@patch(_PATCH_CHECK_WRITE)
|
|
1877
|
+
@patch(_PATCH_POLARS_FROM_ARROW)
|
|
1878
|
+
@patch(_PATCH_REDIS_CATALOG)
|
|
1879
|
+
@patch(_PATCH_SUPER_TABLE)
|
|
1880
|
+
def test_gate_trips_append_merges_and_folds_into_snapshot(
|
|
1881
|
+
self,
|
|
1882
|
+
MockST, MockCat, mock_from_arrow, mock_check_write,
|
|
1883
|
+
MockSimple, mock_find_overlap, mock_resolve, mock_process,
|
|
1884
|
+
MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
|
|
1885
|
+
mock_build_stats, mock_read_parquet, mock_compact_tomb,
|
|
1886
|
+
mock_compact_res,
|
|
1887
|
+
):
|
|
1888
|
+
"""100 accumulated small files → REAL gate trips → compact_resources
|
|
1889
|
+
runs once and its merged output / sunset set fold into the single
|
|
1890
|
+
simple_table.update() commit. No deletes ⇒ tombstone drain is a
|
|
1891
|
+
no-op (nothing to orphan)."""
|
|
1892
|
+
mock_st = MagicMock(super_name="s", organization="o")
|
|
1893
|
+
MockST.return_value = mock_st
|
|
1894
|
+
MockCat.return_value = _mk_compaction_catalog()
|
|
1895
|
+
|
|
1896
|
+
df = _polars_df({"id": [1], "ts": [100]})
|
|
1897
|
+
mock_from_arrow.return_value = df
|
|
1898
|
+
|
|
1899
|
+
snap = {"resources": _small_resources(100)}
|
|
1900
|
+
mock_simple = MagicMock(data_dir="/d")
|
|
1901
|
+
mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
|
|
1902
|
+
mock_simple.update.return_value = ({}, "/np")
|
|
1903
|
+
MockSimple.return_value = mock_simple
|
|
1904
|
+
mock_find_overlap.return_value = set()
|
|
1905
|
+
|
|
1906
|
+
# Pure append: rows survive, no delete pairs, no carried-forward vector.
|
|
1907
|
+
mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
|
|
1908
|
+
mock_build_tomb.return_value = (None, None)
|
|
1909
|
+
# The just-written file lands in new_resources (the established pattern).
|
|
1910
|
+
mock_process.side_effect = lambda **kw: kw["new_resources"].append(
|
|
1911
|
+
{"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
|
|
1912
|
+
)
|
|
1913
|
+
mock_extract_stats.return_value = MagicMock()
|
|
1914
|
+
mock_build_stats.return_value = (None, None)
|
|
1915
|
+
mock_get_mon.return_value = MagicMock()
|
|
1916
|
+
|
|
1917
|
+
# compact_resources merges EVERY live file into one and reports them
|
|
1918
|
+
# all as sunset (computed from the snapshot it actually received).
|
|
1919
|
+
def _merge(**kw):
|
|
1920
|
+
live = kw["snapshot"]["resources"]
|
|
1921
|
+
sunset = {r["file"] for r in live}
|
|
1922
|
+
return (len(live), 10_100, [{"file": "merged.parquet",
|
|
1923
|
+
"file_size": 8_000_000,
|
|
1924
|
+
"rows": 10_100}], sunset)
|
|
1925
|
+
mock_compact_res.side_effect = _merge
|
|
1926
|
+
|
|
1927
|
+
from supertable.data_writer import DataWriter
|
|
1928
|
+
dw = DataWriter("s", "o")
|
|
1929
|
+
result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
|
|
1930
|
+
|
|
1931
|
+
assert result is not None
|
|
1932
|
+
# Gate tripped → merge ran exactly once; append had no vector to drain.
|
|
1933
|
+
mock_compact_res.assert_called_once()
|
|
1934
|
+
mock_compact_tomb.assert_not_called()
|
|
1935
|
+
# The merge saw the 100 existing files plus the one just written.
|
|
1936
|
+
merged_snapshot = mock_compact_res.call_args.kwargs["snapshot"]
|
|
1937
|
+
assert len(merged_snapshot["resources"]) == 101
|
|
1938
|
+
|
|
1939
|
+
# Folded into the SAME commit: update() lists the merged file as the
|
|
1940
|
+
# sole survivor and every consumed file as sunset.
|
|
1941
|
+
new_resources_arg = mock_simple.update.call_args[0][0]
|
|
1942
|
+
sunset_arg = mock_simple.update.call_args[0][1]
|
|
1943
|
+
assert [r["file"] for r in new_resources_arg] == ["merged.parquet"]
|
|
1944
|
+
assert "new.parquet" in sunset_arg
|
|
1945
|
+
assert "small_0.parquet" in sunset_arg
|
|
1946
|
+
assert "merged.parquet" not in sunset_arg
|
|
1947
|
+
|
|
1948
|
+
@patch(_PATCH_COMPACT_RES)
@patch(_PATCH_COMPACT_TOMB)
@patch(_PATCH_READ_PARQUET)
@patch(_PATCH_BUILD_STATS)
@patch(_PATCH_EXTRACT_STATS)
@patch(_PATCH_BUILD_TOMBSTONE)
@patch(_PATCH_GET_MON_LOGGER)
@patch(_PATCH_MIRROR)
@patch(_PATCH_PROCESS_OVERLAP)
@patch(_PATCH_RESOLVE)
@patch(_PATCH_FIND_OVERLAP)
@patch(_PATCH_SIMPLE_TABLE)
@patch(_PATCH_CHECK_WRITE)
@patch(_PATCH_POLARS_FROM_ARROW)
@patch(_PATCH_REDIS_CATALOG)
@patch(_PATCH_SUPER_TABLE)
def test_below_threshold_does_not_compact(
    self,
    MockST, MockCat, mock_from_arrow, mock_check_write,
    MockSimple, mock_find_overlap, mock_resolve, mock_process,
    MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
    mock_build_stats, mock_read_parquet, mock_compact_tomb,
    mock_compact_res,
):
    """Gate closed: a write over a handful of small files must not compact.

    The snapshot holds only five small resources, which is meant to stay
    under both the count and the size trigger.  The write must therefore
    commit without calling either compaction helper; auto-compaction is
    supposed to run only when the gate is open, not on every write.
    """
    # NOTE: the @patch stack is applied bottom-up, so the bottom-most
    # decorator feeds the first mock parameter after ``self``.
    MockST.return_value = MagicMock(super_name="s", organization="o")
    MockCat.return_value = _mk_compaction_catalog()

    mock_from_arrow.return_value = _polars_df({"id": [1], "ts": [100]})

    # Table snapshot with five pre-existing small files.
    snapshot = {"resources": _small_resources(5)}
    simple_table = MagicMock(data_dir="/d")
    simple_table.get_simple_table_snapshot.return_value = (snapshot, "/p")
    simple_table.update.return_value = ({}, "/np")
    MockSimple.return_value = simple_table
    mock_find_overlap.return_value = set()

    def _pass_through(**kwargs):
        # Pure append: incoming rows survive, no delete pairs.
        return (kwargs["incoming_df"], [])

    def _record_written_file(**kwargs):
        # The just-written file lands in the caller's new_resources list.
        return kwargs["new_resources"].append(
            {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
        )

    mock_resolve.side_effect = _pass_through
    mock_build_tomb.return_value = (None, None)
    mock_process.side_effect = _record_written_file
    mock_extract_stats.return_value = MagicMock()
    mock_build_stats.return_value = (None, None)
    mock_get_mon.return_value = MagicMock()

    from supertable.data_writer import DataWriter
    writer = DataWriter("s", "o")
    payload = _arrow_table({"id": [1], "ts": [100]})
    outcome = writer.write("admin", "tbl", payload, ["id"])

    assert outcome is not None
    mock_compact_res.assert_not_called()
    mock_compact_tomb.assert_not_called()
    # The commit still carries the freshly written file, untouched, and
    # nothing is marked as sunset.
    committed_resources = simple_table.update.call_args.args[0]
    committed_names = [entry["file"] for entry in committed_resources]
    assert committed_names == ["new.parquet"]
    assert simple_table.update.call_args.args[1] == set()
|
|
2010
|
+
|
|
2011
|
+
@patch(_PATCH_COMPACT_RES)
@patch(_PATCH_COMPACT_TOMB)
@patch(_PATCH_READ_PARQUET)
@patch(_PATCH_BUILD_STATS)
@patch(_PATCH_EXTRACT_STATS)
@patch(_PATCH_BUILD_TOMBSTONE)
@patch(_PATCH_GET_MON_LOGGER)
@patch(_PATCH_MIRROR)
@patch(_PATCH_PROCESS_OVERLAP)
@patch(_PATCH_RESOLVE)
@patch(_PATCH_FIND_OVERLAP)
@patch(_PATCH_SIMPLE_TABLE)
@patch(_PATCH_CHECK_WRITE)
@patch(_PATCH_POLARS_FROM_ARROW)
@patch(_PATCH_REDIS_CATALOG)
@patch(_PATCH_SUPER_TABLE)
def test_carried_forward_vector_drains_before_merge(
    self,
    MockST, MockCat, mock_from_arrow, mock_check_write,
    MockSimple, mock_find_overlap, mock_resolve, mock_process,
    MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
    mock_build_stats, mock_read_parquet, mock_compact_tomb,
    mock_compact_res,
):
    """Ordering invariant: drain the deletion vector before merging files.

    A live deletion vector is carried forward (build_tombstone_file hands
    back a path but no fresh frame) while the snapshot holds 100 small
    files, so the gate trips.  Phase A must LOAD the vector and drain it
    via compact_tombstones BEFORE Phase B merges small files via
    compact_resources.  Merging first could sunset a file the vector
    still references and permanently orphan its dead rows.
    """
    # NOTE: the @patch stack is applied bottom-up, so the bottom-most
    # decorator feeds the first mock parameter after ``self``.
    MockST.return_value = MagicMock(super_name="s", organization="o")
    MockCat.return_value = _mk_compaction_catalog()

    mock_from_arrow.return_value = _polars_df({"id": [1], "ts": [100]})

    vector_path = "/d/tombstone/dv.parquet"
    snapshot = {
        "resources": _small_resources(100),
        "tombstone": vector_path,
        "tombstone_rows": 50,
    }
    simple_table = MagicMock(data_dir="/d")
    simple_table.get_simple_table_snapshot.return_value = (snapshot, "/p")
    simple_table.update.return_value = ({}, "/np")
    MockSimple.return_value = simple_table
    mock_find_overlap.return_value = set()

    def _pass_through(**kwargs):
        # Incoming rows survive unchanged, no delete pairs.
        return (kwargs["incoming_df"], [])

    def _record_written_file(**kwargs):
        # The just-written file lands in the caller's new_resources list.
        return kwargs["new_resources"].append(
            {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
        )

    mock_resolve.side_effect = _pass_through
    # Carry-forward: the pointer is reused, no fresh combined frame.
    mock_build_tomb.return_value = (vector_path, None)
    mock_process.side_effect = _record_written_file
    # Phase A reads the live vector off its pointer in order to drain it.
    mock_read_parquet.return_value = _polars_df(
        {"__rowid__": list(range(50))}
    )
    mock_extract_stats.return_value = MagicMock()
    mock_build_stats.return_value = (None, None)
    mock_get_mon.return_value = MagicMock()

    # Records the order in which the two compaction helpers are invoked.
    call_sequence: List[str] = []

    def _drain(**kwargs):
        call_sequence.append("tomb")
        survivors = [
            {"file": "survivor.parquet", "file_size": 70 * 1024, "rows": 50}
        ]
        return (50, survivors, {"small_0.parquet"})

    def _merge(**kwargs):
        call_sequence.append("res")
        merged = [
            {"file": "merged.parquet", "file_size": 4_000_000, "rows": 5_000}
        ]
        return (10, 5_000, merged, set())

    mock_compact_tomb.side_effect = _drain
    mock_compact_res.side_effect = _merge

    from supertable.data_writer import DataWriter
    writer = DataWriter("s", "o")
    payload = _arrow_table({"id": [1], "ts": [100]})
    outcome = writer.write("admin", "tbl", payload, ["id"])

    assert outcome is not None
    # The carried-forward vector was read from its pointer exactly once.
    mock_read_parquet.assert_called_once()
    assert mock_read_parquet.call_args.args[0] == "/d/tombstone/dv.parquet"
    # Both phases ran once each, drain strictly before merge.
    mock_compact_tomb.assert_called_once()
    mock_compact_res.assert_called_once()
    assert call_sequence == ["tomb", "res"]
|
|
@@ -573,3 +573,96 @@ class TestRaceTolerance:
|
|
|
573
573
|
# Output has only the present file's rows
|
|
574
574
|
after = _read_all(patched_storage, [r["file"] for r in new_res])
|
|
575
575
|
assert _multiset(after) == _multiset(df)
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
# ---------------------------------------------------------------------------
|
|
579
|
+
# should_compact_small_files — the auto-compaction gate predicate
|
|
580
|
+
# ---------------------------------------------------------------------------
|
|
581
|
+
#
|
|
582
|
+
# Pure function (no I/O): decides whether write() should trigger an inline
|
|
583
|
+
# small-file merge. This is the gate the user's bug report was about —
|
|
584
|
+
# 170×80KB files never compacted because nothing was *checking* it on write.
|
|
585
|
+
# Two independent triggers, both measured over files SMALLER than the chunk
|
|
586
|
+
# size (large files are already "done" and must never force a merge):
|
|
587
|
+
# (a) the small-file COUNT reaches max_overlapping_files, or
|
|
588
|
+
# (b) the small-file total BYTES exceed max_memory_chunk_size.
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def _res(file: str | None, size: int) -> dict:
|
|
592
|
+
return {"file": file, "file_size": size, "rows": 1}
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
class TestShouldCompactSmallFiles:
    """Boundary tests for the ``should_compact_small_files`` predicate."""

    def _limits(self):
        """Return ``_resolve_limits(None)``, i.e. the global defaults.

        Callers unpack the result as ``(max_mem, max_files)``.
        """
        from supertable.processing import _resolve_limits
        return _resolve_limits(None)

    def test_empty_resources_never_compacts(self):
        """An empty list and ``None`` both yield ``False``."""
        from supertable.processing import should_compact_small_files
        for nothing in ([], None):
            assert should_compact_small_files(nothing) is False

    def test_count_trigger_at_threshold(self):
        """The count trigger fires at ``max_files``, not one below it."""
        from supertable.processing import should_compact_small_files
        max_mem, max_files = self._limits()
        # Size each file so the BYTE trigger stays dormant and only the
        # COUNT trigger is exercised: max_files * size is at most
        # max_mem / 2, which is <= max_mem.
        per_file = max_mem // (max_files * 2)
        at_threshold = [
            _res(f"f{i}.parquet", per_file) for i in range(max_files)
        ]
        just_under = at_threshold[:-1]
        assert should_compact_small_files(just_under) is False  # max_files-1
        assert should_compact_small_files(at_threshold) is True  # == max_files

    def test_size_trigger_below_count(self):
        """The byte trigger fires while the count is below ``max_files``."""
        from supertable.processing import should_compact_small_files
        max_mem, max_files = self._limits()
        # Five quarter-chunk files (each < chunk, hence "small") add up to
        # about 1.25 chunks: the BYTE trigger fires although the count is
        # far below max_files.
        quarter = max_mem // 4
        resources = [_res(f"f{i}.parquet", quarter) for i in range(5)]
        assert len(resources) < max_files
        assert should_compact_small_files(resources) is True

    def test_size_trigger_is_strict_greater_than(self):
        """A total of exactly ``max_mem`` bytes must not trip the gate."""
        from supertable.processing import should_compact_small_files
        max_mem, _ = self._limits()
        # Boundary case: two half-chunk files sum to exactly max_mem.
        half = max_mem // 2
        resources = [_res(name, half) for name in ("a.parquet", "b.parquet")]
        total_bytes = sum(entry["file_size"] for entry in resources)
        assert total_bytes == max_mem
        assert should_compact_small_files(resources) is False

    def test_large_files_are_ignored(self):
        """Files of at least chunk size never count toward either trigger."""
        from supertable.processing import should_compact_small_files
        max_mem, max_files = self._limits()
        # Files >= chunk size are "already compacted": even max_files+50
        # of them must NOT trigger a merge, because they are not small.
        large = [
            _res(f"b{i}.parquet", max_mem) for i in range(max_files + 50)
        ]
        assert should_compact_small_files(large) is False
        # Adding a handful of small files stays below both triggers.
        few_small = [_res(f"s{i}.parquet", 80 * 1024) for i in range(5)]
        assert should_compact_small_files(large + few_small) is False

    def test_per_table_config_overrides_global_count(self):
        """A per-table ``max_overlapping_files`` sets the count threshold."""
        from supertable.processing import should_compact_small_files
        table_cfg = {"max_overlapping_files": 10}
        ten_small = [_res(f"f{i}.parquet", 80 * 1024) for i in range(10)]
        assert should_compact_small_files(ten_small, table_cfg) is True
        assert should_compact_small_files(ten_small[:-1], table_cfg) is False

    def test_resource_without_file_key_is_skipped(self):
        """Entries whose ``file`` is ``None`` are not counted."""
        from supertable.processing import should_compact_small_files
        _, max_files = self._limits()
        # Entries without a ``file`` path are not real files and are
        # ignored, even at max_files of them (guards against
        # directory/placeholder rows).
        placeholders = [_res(None, 80 * 1024) for _ in range(max_files)]
        assert should_compact_small_files(placeholders) is False

    def test_missing_file_size_does_not_crash(self):
        """Entries with no ``file_size`` key still count toward the total."""
        from supertable.processing import should_compact_small_files
        # An absent ``file_size`` must never raise: such entries still
        # count toward the COUNT trigger.
        _, max_files = self._limits()
        sizeless = [{"file": f"f{i}.parquet"} for i in range(max_files)]
        assert should_compact_small_files(sizeless) is True  # count trigger
        assert should_compact_small_files(sizeless[:1]) is False
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|