supertable 2.3.5__tar.gz → 2.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supertable-2.3.5/supertable.egg-info → supertable-2.3.7}/PKG-INFO +1 -1
- {supertable-2.3.5 → supertable-2.3.7}/pyproject.toml +1 -1
- {supertable-2.3.5 → supertable-2.3.7}/setup.py +1 -1
- {supertable-2.3.5 → supertable-2.3.7}/supertable/__init__.py +1 -1
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/settings.py +8 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/data_writer.py +121 -24
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_common.py +56 -2
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/conftest.py +15 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine.py +32 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/processing.py +107 -29
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/local_storage.py +12 -1
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_writer_compact.py +36 -5
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_processing_stats.py +60 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_resolve_overwrite_writes.py +22 -5
- supertable-2.3.7/supertable/tests/test_write_probe_gate.py +130 -0
- {supertable-2.3.5 → supertable-2.3.7/supertable.egg-info}/PKG-INFO +1 -1
- {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/SOURCES.txt +1 -0
- {supertable-2.3.5 → supertable-2.3.7}/LICENSE +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/README.md +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/requirements.txt +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/setup.cfg +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/admin.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/chain.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/consumers.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/crypto.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/events.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/export.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/logger.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/middleware.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/reader.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/retention.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_chain.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_crypto.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_emit.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_events.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_retention.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/writer_parquet.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/writer_redis.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/defaults.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/homedir.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/test_defaults.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/test_homedir.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/test_settings.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/data_classes.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/data_reader.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/__main__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/check_filter_builder.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/controller.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/defaults.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/dummy_data.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/read_parquet_header.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/core.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/defaults.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/generate.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/load.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/topup.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/data_estimator.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/duckdb_lite.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/duckdb_pro.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_config.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_enum.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/executor.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/plan_stats.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/spark_thrift.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine_config.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine_routing.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine_spill.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/errors.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/file_lock.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/redis_lock.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/tests/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/tests/test_file_lock.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/tests/test_redis_lock.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/logging.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/meta_reader.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_delta.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_formats.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_iceberg.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_parquet.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/monitoring/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/monitoring/partitions.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/monitoring_writer.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/plan_extender.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/query_plan_manager.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/access_control.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/filter_builder.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/permissions.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/role_manager.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/row_column_security.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/tests/test_filter_builder.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/tests/test_rbac.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/user_manager.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_catalog.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_connector.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_infra.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_keys.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/simple_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/staging_area.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/azure_storage.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/gcp_storage.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/minio_storage.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/s3_storage.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/storage_factory.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/storage_interface.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/tests/test_storage.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/super_pipe.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/super_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/system_query.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_align_to_schema_fix.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_create_if_missing.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_reader.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_reader_preflight.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_writer.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_writer_comprehensive.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_errors.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_meta_reader.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_monitoring_partitions.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_monitoring_sink_guard.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_newer_than.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_parquet_statistics.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_processing.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_processing_compact_resources.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_query_sql.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_read_pruning_differential.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_read_pruning_integration.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_redis_key_prefix.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_simple_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_stats_cache.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_stats_pruning.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_stats_schema_snapshot.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_super_table.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_supertable_all.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_system_query.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/__init__.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/helper.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/profiler.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/sql_parser.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/timer.py +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/dependency_links.txt +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/entry_points.txt +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/requires.txt +0 -0
- {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/top_level.txt +0 -0
|
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
|
|
|
19
19
|
|
|
20
20
|
setup(
|
|
21
21
|
name="supertable",
|
|
22
|
-
version="2.3.5",
|
|
22
|
+
version="2.3.7",
|
|
23
23
|
description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
|
|
24
24
|
long_description=long_description,
|
|
25
25
|
long_description_content_type="text/markdown",
|
|
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
|
|
|
25
25
|
project documentation for the full API surface.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
__version__ = "2.3.5"
|
|
28
|
+
__version__ = "2.3.7"
|
|
29
29
|
|
|
30
30
|
# Re-export the core public surface so users can do ``from supertable import …``
|
|
31
31
|
# instead of remembering submodule paths.
|
|
@@ -157,6 +157,13 @@ class Settings:
|
|
|
157
157
|
SUPERTABLE_DUCKDB_MATERIALIZE: str = "view" # SUPERTABLE_DUCKDB_MATERIALIZE
|
|
158
158
|
SUPERTABLE_DUCKDB_PRESIGNED: bool = False # SUPERTABLE_DUCKDB_PRESIGNED
|
|
159
159
|
SUPERTABLE_DUCKDB_USE_HTTPFS: bool = False # SUPERTABLE_DUCKDB_USE_HTTPFS
|
|
160
|
+
# Write-path overwrite/delete resolution via the DuckDB pushdown probe.
|
|
161
|
+
# Disabled by default: the polars fallback reads only the projected key
|
|
162
|
+
# columns through the storage SDK and needs no httpfs extension, so it works
|
|
163
|
+
# in environments without one (or without internet to install it). Enable
|
|
164
|
+
# only where httpfs is available and the probe's row-group skipping is worth
|
|
165
|
+
# it (e.g. very wide tables / many overlapping files).
|
|
166
|
+
SUPERTABLE_DUCKDB_WRITE_PROBE: bool = False # SUPERTABLE_DUCKDB_WRITE_PROBE
|
|
160
167
|
# Deletion-vector (tombstone) table cache. Each entry is a small
|
|
161
168
|
# `DISTINCT __rowid__` table keyed by the stable tombstone path; the
|
|
162
169
|
# tombstone view ANTI JOINs it instead of re-reading the parquet every
|
|
@@ -437,6 +444,7 @@ def _build_settings() -> Settings:
|
|
|
437
444
|
SUPERTABLE_DUCKDB_MATERIALIZE=_env_str("SUPERTABLE_DUCKDB_MATERIALIZE", "view"),
|
|
438
445
|
SUPERTABLE_DUCKDB_PRESIGNED=_env_bool("SUPERTABLE_DUCKDB_PRESIGNED", False),
|
|
439
446
|
SUPERTABLE_DUCKDB_USE_HTTPFS=_env_bool("SUPERTABLE_DUCKDB_USE_HTTPFS", False),
|
|
447
|
+
SUPERTABLE_DUCKDB_WRITE_PROBE=_env_bool("SUPERTABLE_DUCKDB_WRITE_PROBE", False),
|
|
440
448
|
SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_MAX_PER_TABLE=_env_int("SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_MAX_PER_TABLE", 8),
|
|
441
449
|
SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_TTL_SEC=_env_int("SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_TTL_SEC", 300),
|
|
442
450
|
SUPERTABLE_DEBUG_TIMINGS=_env_bool("SUPERTABLE_DEBUG_TIMINGS", False),
|
|
@@ -5,6 +5,7 @@ import json
|
|
|
5
5
|
import os
|
|
6
6
|
import time
|
|
7
7
|
import uuid
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
8
9
|
from datetime import datetime, timezone
|
|
9
10
|
import re
|
|
10
11
|
|
|
@@ -343,8 +344,18 @@ class DataWriter:
|
|
|
343
344
|
# layout and tight row-group zonemaps). Together with __rowid__ it
|
|
344
345
|
# is hidden from query output by the read view's
|
|
345
346
|
# ``EXCLUDE (__rowid__, __timestamp__)`` projection.
|
|
347
|
+
#
|
|
348
|
+
# System-owned, exactly like __rowid__ above: ALWAYS overwrite any
|
|
349
|
+
# caller-supplied __timestamp__ instead of preserving it. It is a
|
|
350
|
+
# reserved internal column that is both the dedup ORDER BY key (newest
|
|
351
|
+
# per key wins) and the source of the __p_year__/month/day partition
|
|
352
|
+
# derivation (processing.py); letting a caller inject an arbitrary value
|
|
353
|
+
# (wrong dtype, non-UTC, or chosen to game which row wins) would
|
|
354
|
+
# silently corrupt partitioning and dedup. ``newer_than`` is the
|
|
355
|
+
# supported, explicit mechanism for caller-controlled conflict
|
|
356
|
+
# resolution.
|
|
346
357
|
table_config = self._get_table_config(simple_name)
|
|
347
|
-
if not delete_only
|
|
358
|
+
if not delete_only:
|
|
348
359
|
dataframe = dataframe.with_columns(
|
|
349
360
|
polars.lit(datetime.now(timezone.utc)).alias("__timestamp__")
|
|
350
361
|
)
|
|
@@ -511,12 +522,21 @@ class DataWriter:
|
|
|
511
522
|
# Load the current deletion-vector once: used both to exclude
|
|
512
523
|
# already-tombstoned rows from this write's deletes (below) and,
|
|
513
524
|
# via prev_df, to extend the vector without a second read.
|
|
525
|
+
# required=True: a DV that exists but cannot be read must abort
|
|
526
|
+
# the write, never be treated as empty — silently dropping the
|
|
527
|
+
# carried-forward vector would resurrect previously deleted rows.
|
|
514
528
|
prev_dv_df = (
|
|
515
|
-
_read_parquet_safe(prev_tombstone_path, profiler=profiler)
|
|
529
|
+
_read_parquet_safe(prev_tombstone_path, profiler=profiler, required=True)
|
|
516
530
|
if prev_tombstone_path else None
|
|
517
531
|
)
|
|
532
|
+
# The rowid set is consumed only by the idempotency filter below,
|
|
533
|
+
# which runs only when this write actually tombstones rows
|
|
534
|
+
# (overwrite or delete_only). Pure appends tombstone nothing, so
|
|
535
|
+
# skip materialising the whole deletion-vector as a Python set —
|
|
536
|
+
# prev_dv_df is still carried forward into build_tombstone_file.
|
|
518
537
|
prev_dv_rowids = set()
|
|
519
|
-
if prev_dv_df is not None
|
|
538
|
+
if (overwrite_columns or delete_only) and prev_dv_df is not None \
|
|
539
|
+
and "__rowid__" in prev_dv_df.columns:
|
|
520
540
|
prev_dv_rowids = set(prev_dv_df.get_column("__rowid__").to_list())
|
|
521
541
|
|
|
522
542
|
# 1. Identify which existing rows this write deletes/replaces.
|
|
@@ -555,38 +575,87 @@ class DataWriter:
|
|
|
555
575
|
f"(excluded {len(prev_dv_rowids)} row(s) already in the deletion-vector)"
|
|
556
576
|
))
|
|
557
577
|
|
|
558
|
-
# 2. Write the incoming rows as a new file (insert/
|
|
559
|
-
#
|
|
560
|
-
|
|
578
|
+
# 2. + 3. Write the incoming rows as a new data file (insert/
|
|
579
|
+
# upsert side) AND carry-forward/extend the deletion-vector
|
|
580
|
+
# tombstone file. These two object-store PUTs are independent:
|
|
581
|
+
# neither reads the other's output and they write to disjoint
|
|
582
|
+
# dirs (data/ vs tombstone/), so they run concurrently to
|
|
583
|
+
# overlap the two round-trips. delete_only carries only
|
|
584
|
+
# predicate columns → nothing to insert. No new deletes →
|
|
585
|
+
# build_tombstone reuses the previous file (combined_df=None).
|
|
586
|
+
#
|
|
587
|
+
# Profiler is NOT thread-safe, so each branch records into its
|
|
588
|
+
# own sub-profiler which the parent merges after the join;
|
|
589
|
+
# each branch also measures its own wall time so the per-phase
|
|
590
|
+
# monitoring timings stay meaningful despite the overlap.
|
|
591
|
+
# Footers of files written via the write_bytes path are captured
|
|
592
|
+
# in footer_md_cache so stats extraction (step 6) reuses them
|
|
593
|
+
# instead of re-downloading each freshly-written file.
|
|
594
|
+
footer_md_cache = {}
|
|
595
|
+
tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
|
|
596
|
+
do_insert = (not delete_only and dataframe.height > 0)
|
|
597
|
+
|
|
598
|
+
def _write_data_branch():
|
|
599
|
+
sub = Profiler()
|
|
600
|
+
t = time.perf_counter()
|
|
561
601
|
write_parquet_and_collect_resources(
|
|
562
602
|
write_df=dataframe,
|
|
563
603
|
overwrite_columns=[],
|
|
564
604
|
data_dir=simple_table.data_dir,
|
|
565
605
|
new_resources=new_resources,
|
|
566
606
|
compression_level=compression_level,
|
|
567
|
-
profiler=profiler,
|
|
607
|
+
profiler=sub,
|
|
608
|
+
footer_md_out=footer_md_cache,
|
|
568
609
|
)
|
|
610
|
+
return sub, time.perf_counter() - t
|
|
611
|
+
|
|
612
|
+
def _write_tombstone_branch():
|
|
613
|
+
sub = Profiler()
|
|
614
|
+
t = time.perf_counter()
|
|
615
|
+
tp, cdf = build_tombstone_file(
|
|
616
|
+
tombstone_dir=tombstone_dir,
|
|
617
|
+
prev_tombstone_path=prev_tombstone_path,
|
|
618
|
+
new_pairs=new_delete_pairs,
|
|
619
|
+
compression_level=compression_level,
|
|
620
|
+
profiler=sub,
|
|
621
|
+
prev_df=prev_dv_df,
|
|
622
|
+
)
|
|
623
|
+
return tp, cdf, sub, time.perf_counter() - t
|
|
624
|
+
|
|
625
|
+
if do_insert:
|
|
626
|
+
with ThreadPoolExecutor(max_workers=2) as _ex:
|
|
627
|
+
_f_data = _ex.submit(_write_data_branch)
|
|
628
|
+
_f_tomb = _ex.submit(_write_tombstone_branch)
|
|
629
|
+
# .result() re-raises in the parent: a failure in either
|
|
630
|
+
# PUT aborts the write before any snapshot commit, exactly
|
|
631
|
+
# as the former sequential path did (an orphaned immutable
|
|
632
|
+
# file no snapshot references is harmless garbage).
|
|
633
|
+
data_sub, data_secs = _f_data.result()
|
|
634
|
+
tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
|
|
635
|
+
_f_tomb.result()
|
|
636
|
+
)
|
|
637
|
+
profiler.merge(data_sub)
|
|
638
|
+
profiler.merge(tomb_sub)
|
|
569
639
|
inserted = dataframe.height
|
|
570
640
|
else:
|
|
641
|
+
tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
|
|
642
|
+
_write_tombstone_branch()
|
|
643
|
+
)
|
|
644
|
+
profiler.merge(tomb_sub)
|
|
645
|
+
data_secs = 0.0
|
|
571
646
|
inserted = 0
|
|
572
|
-
|
|
647
|
+
|
|
648
|
+
# Assign the two per-phase timings from each branch's own measured
|
|
649
|
+
# wall time (they overlapped, so the serial mark() deltas would
|
|
650
|
+
# misattribute the time), then advance the mark() baseline.
|
|
651
|
+
timings["write_parquet"] = data_secs
|
|
652
|
+
timings["build_tombstone"] = tomb_secs
|
|
653
|
+
t_last = time.time()
|
|
573
654
|
logger.debug(lp(
|
|
574
655
|
f"step[write]: appended {inserted} incoming row(s) as {len(new_resources)} "
|
|
575
656
|
f"new immutable file(s) (no existing data file rewritten)"
|
|
576
657
|
))
|
|
577
658
|
|
|
578
|
-
# 3. Carry forward + extend the deletion-vector tombstone file.
|
|
579
|
-
# No new deletes → reuse the previous file (combined_df=None).
|
|
580
|
-
tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
|
|
581
|
-
tombstone_path, combined_tombstone_df = build_tombstone_file(
|
|
582
|
-
tombstone_dir=tombstone_dir,
|
|
583
|
-
prev_tombstone_path=prev_tombstone_path,
|
|
584
|
-
new_pairs=new_delete_pairs,
|
|
585
|
-
compression_level=compression_level,
|
|
586
|
-
profiler=profiler,
|
|
587
|
-
prev_df=prev_dv_df,
|
|
588
|
-
)
|
|
589
|
-
|
|
590
659
|
# Track the live deletion-vector row count so meta reads can
|
|
591
660
|
# deduct dead rows from the physical resource row totals.
|
|
592
661
|
# New deletes → combined_tombstone_df is the full deduped DV
|
|
@@ -596,7 +665,6 @@ class DataWriter:
|
|
|
596
665
|
if combined_tombstone_df is not None
|
|
597
666
|
else int(last_simple_table.get("tombstone_rows", 0) or 0)
|
|
598
667
|
)
|
|
599
|
-
mark("build_tombstone")
|
|
600
668
|
logger.debug(lp(
|
|
601
669
|
f"step[tombstone]: deletion-vector now {tombstone_rows} row(s) "
|
|
602
670
|
f"({'rewritten' if combined_tombstone_df is not None else 'carried forward unchanged'})"
|
|
@@ -745,7 +813,9 @@ class DataWriter:
|
|
|
745
813
|
r.get("file") for r in new_resources
|
|
746
814
|
if isinstance(r, dict) and r.get("file")
|
|
747
815
|
]
|
|
748
|
-
new_stats_rows = extract_stats_rows(
|
|
816
|
+
new_stats_rows = extract_stats_rows(
|
|
817
|
+
new_data_files, profiler=profiler, footer_md_cache=footer_md_cache
|
|
818
|
+
)
|
|
749
819
|
stats_path, combined_stats_df = build_stats_file(
|
|
750
820
|
stats_dir=stats_dir,
|
|
751
821
|
prev_stats_path=last_simple_table.get("stats_file"),
|
|
@@ -1181,8 +1251,17 @@ class DataWriter:
|
|
|
1181
1251
|
# the *write* path; compact() is explicit maintenance and always
|
|
1182
1252
|
# consumes the vector.
|
|
1183
1253
|
tombstone_path = last_simple_table.get("tombstone")
|
|
1254
|
+
# required=True: a DV that exists but cannot be read must abort the
|
|
1255
|
+
# compaction, never be treated as empty. A swallowed read here would
|
|
1256
|
+
# set should_run_tombstones=False, skipping both Phase A and the
|
|
1257
|
+
# pointer-clear below, so Phase B would carry the dead rows into the
|
|
1258
|
+
# new file while the vector kept pointing at the sunset __file__ —
|
|
1259
|
+
# leaving them permanently unreclaimable. Failing loud leaves the
|
|
1260
|
+
# prior snapshot + vector intact for a retry, and matches the
|
|
1261
|
+
# write-path carry-forward read (required=True) above.
|
|
1184
1262
|
tombstone_df = (
|
|
1185
|
-
_read_parquet_safe(tombstone_path)
|
|
1263
|
+
_read_parquet_safe(tombstone_path, required=True)
|
|
1264
|
+
if tombstone_path else None
|
|
1186
1265
|
)
|
|
1187
1266
|
tombstone_rows = (
|
|
1188
1267
|
tombstone_df.height if tombstone_df is not None else 0
|
|
@@ -1246,6 +1325,24 @@ class DataWriter:
|
|
|
1246
1325
|
r for r in (list(tomb_new_resources) + list(small_new_resources))
|
|
1247
1326
|
if r.get("file") not in all_sunset
|
|
1248
1327
|
]
|
|
1328
|
+
# ``all_new_resources`` is the full set of files written by THIS
|
|
1329
|
+
# compaction; it feeds stats extraction, the schema model_df and the
|
|
1330
|
+
# result metrics below, all of which need every new file.
|
|
1331
|
+
#
|
|
1332
|
+
# For ``simple_table.update`` it must NOT be reused verbatim, though:
|
|
1333
|
+
# Phase A's outputs were already spliced into
|
|
1334
|
+
# ``last_simple_table["resources"]`` (the in-memory baseline that
|
|
1335
|
+
# ``update`` starts from) right after Phase A ran. ``update`` does
|
|
1336
|
+
# ``(baseline - sunset) + new_resources`` with no dedup, so any
|
|
1337
|
+
# Phase-A output that Phase B did NOT consume (left un-sunset because
|
|
1338
|
+
# it exceeded the ``small_only`` threshold, or its read failed) would
|
|
1339
|
+
# be counted once from the baseline AND once from new_resources —
|
|
1340
|
+
# i.e. the same file listed twice in the new snapshot. Hand ``update``
|
|
1341
|
+
# only Phase B's brand-new files, which are the only resources genuinely
|
|
1342
|
+
# absent from that baseline.
|
|
1343
|
+
update_new_resources = [
|
|
1344
|
+
r for r in small_new_resources if r.get("file") not in all_sunset
|
|
1345
|
+
]
|
|
1249
1346
|
result["files_compacted"] = considered
|
|
1250
1347
|
result["new_resources"] = len(all_new_resources)
|
|
1251
1348
|
result["sunset_files"] = len(all_sunset)
|
|
@@ -1338,7 +1435,7 @@ class DataWriter:
|
|
|
1338
1435
|
)
|
|
1339
1436
|
|
|
1340
1437
|
new_snapshot_dict, new_snapshot_path = simple_table.update(
|
|
1341
|
-
all_new_resources,
|
1438
|
+
update_new_resources,
|
|
1342
1439
|
all_sunset,
|
|
1343
1440
|
model_df,
|
|
1344
1441
|
last_snapshot=last_simple_table,
|
|
@@ -731,12 +731,66 @@ def new_duckdb_connection(
|
|
|
731
731
|
purely local scans.
|
|
732
732
|
"""
|
|
733
733
|
con = duckdb.connect()
|
|
734
|
-
|
|
735
|
-
|
|
734
|
+
try:
|
|
735
|
+
init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
|
|
736
|
+
if for_paths and any("://" in str(p) for p in for_paths):
|
|
737
|
+
configure_httpfs_and_s3(con, for_paths)
|
|
738
|
+
except Exception:
|
|
739
|
+
# Don't leak the half-initialised connection if a pragma / httpfs load
|
|
740
|
+
# raises; re-raise so callers still fall back exactly as before.
|
|
741
|
+
con.close()
|
|
742
|
+
raise
|
|
743
|
+
return con
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
# Per-thread slot for the write-side probe connection. A DuckDB connection is
# NOT thread-safe, so every thread owns its own; keeping it alive spreads the
# ~150 ms init/warmup over all writes issued from that thread — the same
# motivation behind the persistent connection the read executors hold.
_probe_pool = threading.local()


def get_pooled_duckdb_connection(
    temp_dir: str,
    for_paths: Optional[List[str]] = None,
    memory_limit: str = "1GB",
) -> duckdb.DuckDBPyConnection:
    """Hand back the calling thread's pooled probe connection.

    Cold path (no connection cached on this thread yet): the connection is
    created by ``new_duckdb_connection`` with the given ``temp_dir``,
    ``for_paths`` and ``memory_limit``, so it is set up exactly like a
    transient connection, and is then stored in the thread-local slot.

    Warm path (a connection is already cached): the cached connection is
    returned as-is. ``temp_dir`` and ``memory_limit`` are NOT consulted again
    on this path — the values from the cold build stay in effect. When any
    entry of ``for_paths`` contains ``"://"``, ``configure_httpfs_and_s3`` is
    re-run on the cached connection so one first built for local paths can
    still serve a later remote probe and picks up the current environment
    (that helper is documented as re-reading env on each call and being
    idempotent).
    """
    pooled = getattr(_probe_pool, "con", None)

    if pooled is not None:
        # Warm connection: only the remote-filesystem setup may need a refresh.
        wants_remote = for_paths and any("://" in str(p) for p in for_paths)
        if wants_remote:
            configure_httpfs_and_s3(pooled, for_paths)
        return pooled

    # Cold build; publish to the slot only after construction succeeded.
    fresh = new_duckdb_connection(
        temp_dir=temp_dir, for_paths=for_paths, memory_limit=memory_limit
    )
    _probe_pool.con = fresh
    return fresh
|
|
738
776
|
|
|
739
777
|
|
|
778
|
+
def reset_pooled_duckdb_connections() -> None:
    """Evict and close the pooled probe connection of the calling thread.

    Only the current thread's slot is touched; if it holds no connection the
    call does nothing. Serves as an eviction hook and keeps tests
    deterministic. The slot is emptied *before* ``close()`` is attempted, so
    even a failing close leaves the thread able to build a new connection.
    """
    pooled = getattr(_probe_pool, "con", None)
    if pooled is None:
        return

    # Drop the reference first: a close() that raises must not leave a dead
    # connection behind in the pool.
    _probe_pool.con = None
    try:
        pooled.close()
    except Exception:
        # Deliberate best-effort close; the slot is already cleared.
        pass
|
|
792
|
+
|
|
793
|
+
|
|
740
794
|
def apply_runtime_pragmas(con: duckdb.DuckDBPyConnection, cfg) -> None:
|
|
741
795
|
"""Re-apply the session-settable DuckDB pragmas from a live engine config.
|
|
742
796
|
|
|
@@ -86,6 +86,21 @@ def _mock_redis_catalog():
|
|
|
86
86
|
yield
|
|
87
87
|
|
|
88
88
|
|
|
89
|
+
@pytest.fixture(autouse=True)
def _reset_probe_pool():
    """Empty the thread-local write-probe connection pool before and after each test.

    Writes now share a pooled probe connection, so any test counting how often
    ``new_duckdb_connection`` gets built needs a cold pool to start from. The
    second reset, after the test body, stops the connection from leaking into
    whichever test runs next.
    """
    from supertable.engine.engine_common import (
        reset_pooled_duckdb_connections as _drop_pooled_connection,
    )

    _drop_pooled_connection()  # start cold
    yield
    _drop_pooled_connection()  # leave nothing behind
|
|
102
|
+
|
|
103
|
+
|
|
89
104
|
@pytest.fixture()
|
|
90
105
|
def duckdb_con():
|
|
91
106
|
"""Provide a real in-memory DuckDB connection, closed after each test."""
|
|
@@ -587,6 +587,38 @@ class TestReadWriteDuckDBParity:
|
|
|
587
587
|
# for_paths forwarded so httpfs is loaded for remote scans.
|
|
588
588
|
assert "for_paths" in calls[0][1]
|
|
589
589
|
|
|
590
|
+
def test_probe_reuses_pooled_connection(self, tmp_path, monkeypatch):
    # Two probes issued from the same thread have to share the pooled
    # connection: new_duckdb_connection is expected to run exactly once, i.e.
    # the ~150ms warmup is spent on the cold probe only and amortised over
    # every later write.
    import polars
    from supertable import processing as _processing

    monkeypatch.setattr(_processing, "_get_storage", lambda: object())

    parquet_path = str(tmp_path / "f1.parquet")
    polars.DataFrame({"__rowid__": [10, 20], "id": [1, 2]}).write_parquet(
        parquet_path
    )

    build_calls = []
    original_builder = _engine_common.new_duckdb_connection

    def _recording_builder(*args, **kwargs):
        # Note the call, then delegate to the real factory.
        build_calls.append((args, kwargs))
        return original_builder(*args, **kwargs)

    monkeypatch.setattr(
        _engine_common, "new_duckdb_connection", _recording_builder
    )

    def _run_probe():
        return _processing._duckdb_probe_overlap_matches(
            overlap_true_files=[(parquet_path, 0)],
            overwrite_columns=["id"],
            newer_than_col=None,
            incoming_keys=polars.DataFrame({"id": [2]}),
        )

    # First iteration is the cold probe, second one the warm probe.
    for _ in range(2):
        assert _run_probe() is not None

    # Built for the cold probe, reused for the warm one.
    assert len(build_calls) == 1
|
|
621
|
+
|
|
590
622
|
def test_probe_matches_rows_on_local_parquet(self, tmp_path, monkeypatch):
|
|
591
623
|
import polars
|
|
592
624
|
from supertable import processing as _processing
|