supertable 2.3.4__tar.gz → 2.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supertable-2.3.4/supertable.egg-info → supertable-2.3.5}/PKG-INFO +1 -1
- {supertable-2.3.4 → supertable-2.3.5}/pyproject.toml +1 -1
- {supertable-2.3.4 → supertable-2.3.5}/setup.py +1 -1
- {supertable-2.3.4 → supertable-2.3.5}/supertable/__init__.py +1 -1
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/defaults.py +6 -0
- supertable-2.3.5/supertable/config/homedir.py +96 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/data_writer.py +49 -2
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/defaults.py +3 -1
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/engine_common.py +60 -59
- {supertable-2.3.4 → supertable-2.3.5}/supertable/processing.py +51 -13
- {supertable-2.3.4 → supertable-2.3.5}/supertable/simple_table.py +0 -9
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/azure_storage.py +7 -2
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/gcp_storage.py +7 -2
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/local_storage.py +4 -4
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/minio_storage.py +7 -2
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/s3_storage.py +7 -2
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/storage_interface.py +21 -2
- {supertable-2.3.4 → supertable-2.3.5}/supertable/super_table.py +0 -6
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_writer_comprehensive.py +2 -1
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_processing_stats.py +3 -1
- {supertable-2.3.4 → supertable-2.3.5/supertable.egg-info}/PKG-INFO +1 -1
- supertable-2.3.4/supertable/config/homedir.py +0 -62
- {supertable-2.3.4 → supertable-2.3.5}/LICENSE +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/README.md +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/requirements.txt +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/setup.cfg +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/admin.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/chain.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/consumers.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/crypto.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/events.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/export.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/logger.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/middleware.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/reader.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/retention.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_chain.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_crypto.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_emit.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_events.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_retention.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/writer_parquet.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/writer_redis.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/settings.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/test_defaults.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/test_homedir.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/test_settings.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/data_classes.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/data_reader.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/__main__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/check_filter_builder.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/controller.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/dummy_data.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/read_parquet_header.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/core.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/defaults.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/generate.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/load.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/topup.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/data_estimator.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/duckdb_lite.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/duckdb_pro.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/engine_config.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/engine_enum.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/executor.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/plan_stats.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/spark_thrift.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/conftest.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine_config.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine_routing.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine_spill.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/errors.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/file_lock.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/redis_lock.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/tests/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/tests/test_file_lock.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/tests/test_redis_lock.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/logging.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/meta_reader.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_delta.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_formats.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_iceberg.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_parquet.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/monitoring/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/monitoring/partitions.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/monitoring_writer.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/plan_extender.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/query_plan_manager.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/access_control.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/filter_builder.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/permissions.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/role_manager.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/row_column_security.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/tests/test_filter_builder.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/tests/test_rbac.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/user_manager.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_catalog.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_connector.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_infra.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_keys.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/staging_area.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/storage_factory.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/tests/test_storage.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/super_pipe.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/system_query.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_align_to_schema_fix.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_create_if_missing.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_reader.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_reader_preflight.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_writer.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_writer_compact.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_errors.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_meta_reader.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_monitoring_partitions.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_monitoring_sink_guard.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_newer_than.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_parquet_statistics.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_processing.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_processing_compact_resources.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_query_sql.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_read_pruning_differential.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_read_pruning_integration.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_redis_key_prefix.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_resolve_overwrite_writes.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_simple_table.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_stats_cache.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_stats_pruning.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_stats_schema_snapshot.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_super_table.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_supertable_all.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_system_query.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/__init__.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/helper.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/profiler.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/sql_parser.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/timer.py +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/SOURCES.txt +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/dependency_links.txt +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/entry_points.txt +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/requires.txt +0 -0
- {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/top_level.txt +0 -0
|
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
|
|
|
19
19
|
|
|
20
20
|
setup(
|
|
21
21
|
name="supertable",
|
|
22
|
-
version="2.3.4",
|
|
22
|
+
version="2.3.5",
|
|
23
23
|
description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
|
|
24
24
|
long_description=long_description,
|
|
25
25
|
long_description_content_type="text/markdown",
|
|
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
|
|
|
25
25
|
project documentation for the full API surface.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
__version__ = "2.3.4"
|
|
28
|
+
__version__ = "2.3.5"
|
|
29
29
|
|
|
30
30
|
# Re-export the core public surface so users can do ``from supertable import …``
|
|
31
31
|
# instead of remembering submodule paths.
|
|
@@ -17,6 +17,12 @@ handler.setFormatter(colorlog.ColoredFormatter(
|
|
|
17
17
|
logging.basicConfig(level=logging.INFO, handlers=[handler])
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
|
+
# Quiet noisy third-party HTTP client loggers. At DEBUG these emit one line
|
|
21
|
+
# per request (connection setup + every HEAD/GET/PUT), which drowns out
|
|
22
|
+
# SuperTable's own logs. WARNING keeps genuine connection problems visible.
|
|
23
|
+
for _noisy_logger in ("urllib3", "botocore", "boto3", "s3transfer", "boto"):
|
|
24
|
+
logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
|
|
25
|
+
|
|
20
26
|
_VALID_LOG_LEVELS = frozenset({"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"})
|
|
21
27
|
|
|
22
28
|
@dataclass(slots=True)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import tempfile
|
|
4
|
+
|
|
5
|
+
from supertable.config.settings import settings
|
|
6
|
+
from supertable.config.defaults import logger
|
|
7
|
+
|
|
8
|
+
# If this file is located in a subdirectory, adjust the path logic as needed.
|
|
9
|
+
# Currently appending ".." from __file__ to add the project root directory
|
|
10
|
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
11
|
+
|
|
12
|
+
# ---------- lazy home directory resolution ----------
|
|
13
|
+
_resolved_home: str | None = None
|
|
14
|
+
|
|
15
|
+
def _is_writable_dir(path: str) -> bool:
|
|
16
|
+
"""Create *path* if needed and verify we can actually write a file in it.
|
|
17
|
+
|
|
18
|
+
``os.access(..., W_OK)`` is unreliable under containers, ACLs and
|
|
19
|
+
root-squashed mounts, so probe with a real create+unlink: this is the
|
|
20
|
+
difference between a home that merely *resolves* and one DuckDB can root
|
|
21
|
+
its temp/spill, cache and extension dirs under.
|
|
22
|
+
"""
|
|
23
|
+
try:
|
|
24
|
+
os.makedirs(path, exist_ok=True)
|
|
25
|
+
with tempfile.NamedTemporaryFile(dir=path):
|
|
26
|
+
pass
|
|
27
|
+
return True
|
|
28
|
+
except OSError:
|
|
29
|
+
return False
|
|
30
|
+
|
|
31
|
+
def _resolve_app_home() -> str:
|
|
32
|
+
"""
|
|
33
|
+
Resolve, expand, and normalise the application home directory once.
|
|
34
|
+
|
|
35
|
+
The home must be *writable*, not just resolvable: DuckDB roots its
|
|
36
|
+
temp/spill, external-cache and extension directories here, so a
|
|
37
|
+
non-writable home silently breaks every write (the probe fails with
|
|
38
|
+
``errno 13`` and falls back to the slow full-read path). We therefore
|
|
39
|
+
verify writability and, when the configured home is not usable, fall back
|
|
40
|
+
to ``<tempdir>/supertable`` with a loud warning rather than returning a
|
|
41
|
+
path that only looks valid.
|
|
42
|
+
"""
|
|
43
|
+
global _resolved_home
|
|
44
|
+
if _resolved_home is not None:
|
|
45
|
+
return _resolved_home
|
|
46
|
+
|
|
47
|
+
raw = settings.SUPERTABLE_HOME
|
|
48
|
+
expanded = os.path.abspath(os.path.expanduser(raw))
|
|
49
|
+
|
|
50
|
+
if _is_writable_dir(expanded):
|
|
51
|
+
logger.debug(f"Ensured app home directory exists: {expanded}")
|
|
52
|
+
_resolved_home = expanded
|
|
53
|
+
return _resolved_home
|
|
54
|
+
|
|
55
|
+
fallback = os.path.join(tempfile.gettempdir(), "supertable")
|
|
56
|
+
if _is_writable_dir(fallback):
|
|
57
|
+
logger.warning(
|
|
58
|
+
f"SUPERTABLE_HOME={expanded!r} is not writable; falling back to "
|
|
59
|
+
f"{fallback!r}. Set SUPERTABLE_HOME to a writable directory to "
|
|
60
|
+
f"silence this — DuckDB temp/spill, cache and extensions live under it."
|
|
61
|
+
)
|
|
62
|
+
_resolved_home = fallback
|
|
63
|
+
return _resolved_home
|
|
64
|
+
|
|
65
|
+
raise RuntimeError(
|
|
66
|
+
f"No writable application home: tried SUPERTABLE_HOME={expanded!r} and "
|
|
67
|
+
f"fallback {fallback!r}. Set SUPERTABLE_HOME to a writable directory."
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
def change_to_app_home(home_dir: str | None = None) -> None:
|
|
71
|
+
"""
|
|
72
|
+
Attempts to change the current working directory to `home_dir`.
|
|
73
|
+
If home_dir is not provided, uses the resolved app home.
|
|
74
|
+
Logs the outcome.
|
|
75
|
+
"""
|
|
76
|
+
target = home_dir if home_dir else _resolve_app_home()
|
|
77
|
+
expanded_dir = os.path.expanduser(target)
|
|
78
|
+
try:
|
|
79
|
+
os.chdir(expanded_dir)
|
|
80
|
+
logger.debug(f"Changed working directory to {expanded_dir}")
|
|
81
|
+
except Exception as e:
|
|
82
|
+
logger.error(f"Failed to change working directory to {expanded_dir}: {e}")
|
|
83
|
+
|
|
84
|
+
# ---------- eager init (preserves original import-time behaviour) ----------
|
|
85
|
+
_app_home = _resolve_app_home()
|
|
86
|
+
change_to_app_home(_app_home)
|
|
87
|
+
logger.debug(f"Current working directory: {os.getcwd()}")
|
|
88
|
+
|
|
89
|
+
# ---------- public API ----------
|
|
90
|
+
|
|
91
|
+
# Kept for backward compatibility; prefer get_app_home() for the expanded path.
|
|
92
|
+
app_home = _app_home
|
|
93
|
+
|
|
94
|
+
def get_app_home() -> str:
|
|
95
|
+
"""Return the fully expanded, absolute application home directory."""
|
|
96
|
+
return _resolve_app_home()
|
|
@@ -371,6 +371,14 @@ class DataWriter:
|
|
|
371
371
|
profiler=profiler,
|
|
372
372
|
)
|
|
373
373
|
mark("overlap")
|
|
374
|
+
if overwrite_columns:
|
|
375
|
+
_snap_files = len(last_simple_table.get("resources") or [])
|
|
376
|
+
_cand = sum(1 for _, ov, _ in overlapping_files if ov)
|
|
377
|
+
logger.debug(lp(
|
|
378
|
+
f"step[overlap]: {_cand}/{_snap_files} existing file(s) are overwrite "
|
|
379
|
+
f"candidates on {overwrite_columns} "
|
|
380
|
+
f"(snapshot has no per-file key stats → every file is suspect)"
|
|
381
|
+
))
|
|
374
382
|
|
|
375
383
|
# --- Stats-driven file pruning (consumer 5a) ----------------------
|
|
376
384
|
# Narrow the overwrite/delete candidate set using the external stats
|
|
@@ -387,6 +395,10 @@ class DataWriter:
|
|
|
387
395
|
stored_stats_df = load_stats(stats_file, allow_cache=True, profiler=profiler)
|
|
388
396
|
if stored_stats_df is not None and stored_stats_df.height > 0:
|
|
389
397
|
probe = probe_ranges_from_df(dataframe, overwrite_columns)
|
|
398
|
+
_probe_desc = {
|
|
399
|
+
c: (f"{v[0]}[{v[1]}..{v[2]}]" if v else "unconstrained(null/unsupported)")
|
|
400
|
+
for c, v in probe.items()
|
|
401
|
+
}
|
|
390
402
|
before = len(overlapping_files)
|
|
391
403
|
overlapping_files = prune_overlapping_files_by_stats(
|
|
392
404
|
overlapping_files,
|
|
@@ -395,8 +407,21 @@ class DataWriter:
|
|
|
395
407
|
profiler=profiler,
|
|
396
408
|
)
|
|
397
409
|
pruned = before - len(overlapping_files)
|
|
410
|
+
logger.debug(lp(
|
|
411
|
+
f"step[stats-prune]: df-probe {_probe_desc} vs {stored_stats_df.height} "
|
|
412
|
+
f"stored stat row(s) → kept {len(overlapping_files)}/{before}, "
|
|
413
|
+
f"pruned {pruned} (no data file opened)"
|
|
414
|
+
))
|
|
398
415
|
if pruned > 0:
|
|
399
416
|
logger.info(lp(f"stats pruning: skipped {pruned}/{before} candidate files"))
|
|
417
|
+
else:
|
|
418
|
+
logger.debug(lp(
|
|
419
|
+
"step[stats-prune]: stats artifact empty → no pruning, all candidates retained"
|
|
420
|
+
))
|
|
421
|
+
else:
|
|
422
|
+
logger.debug(lp(
|
|
423
|
+
"step[stats-prune]: snapshot has no stats_file → no pruning, all candidates retained"
|
|
424
|
+
))
|
|
400
425
|
mark("stats_prune")
|
|
401
426
|
|
|
402
427
|
# File cache: used only by delete_only's identify_all_rowids below.
|
|
@@ -420,6 +445,16 @@ class DataWriter:
|
|
|
420
445
|
newer_than_col=newer_than,
|
|
421
446
|
profiler=profiler,
|
|
422
447
|
)
|
|
448
|
+
mark("resolve_overwrite")
|
|
449
|
+
_counts = profiler.counts
|
|
450
|
+
_fallback = bool(_counts.get("overwrite_resolve_fallback"))
|
|
451
|
+
logger.debug(lp(
|
|
452
|
+
f"step[probe-resolve] via {'polars-fallback' if _fallback else 'duckdb-pushdown'}: "
|
|
453
|
+
f"matched {_counts.get('probe_rows_matched', _counts.get('delete_rows_matched', 0))} "
|
|
454
|
+
f"existing row(s) on {overwrite_columns} → "
|
|
455
|
+
f"{len(resolved_delete_pairs or [])} (file,__rowid__) delete pair(s); "
|
|
456
|
+
f"{dataframe.height}/{pre_filter_count} incoming row(s) survive"
|
|
457
|
+
))
|
|
423
458
|
if newer_than:
|
|
424
459
|
skipped = pre_filter_count - dataframe.height
|
|
425
460
|
if skipped > 0:
|
|
@@ -515,6 +550,10 @@ class DataWriter:
|
|
|
515
550
|
]
|
|
516
551
|
deleted = len(new_delete_pairs)
|
|
517
552
|
mark("identify_deletes")
|
|
553
|
+
logger.debug(lp(
|
|
554
|
+
f"step[deletes]: tombstoning {deleted} live row(s) this write "
|
|
555
|
+
f"(excluded {len(prev_dv_rowids)} row(s) already in the deletion-vector)"
|
|
556
|
+
))
|
|
518
557
|
|
|
519
558
|
# 2. Write the incoming rows as a new file (insert/upsert side).
|
|
520
559
|
# delete_only carries only predicate columns — nothing to insert.
|
|
@@ -531,6 +570,10 @@ class DataWriter:
|
|
|
531
570
|
else:
|
|
532
571
|
inserted = 0
|
|
533
572
|
mark("write_parquet")
|
|
573
|
+
logger.debug(lp(
|
|
574
|
+
f"step[write]: appended {inserted} incoming row(s) as {len(new_resources)} "
|
|
575
|
+
f"new immutable file(s) (no existing data file rewritten)"
|
|
576
|
+
))
|
|
534
577
|
|
|
535
578
|
# 3. Carry forward + extend the deletion-vector tombstone file.
|
|
536
579
|
# No new deletes → reuse the previous file (combined_df=None).
|
|
@@ -554,6 +597,10 @@ class DataWriter:
|
|
|
554
597
|
else int(last_simple_table.get("tombstone_rows", 0) or 0)
|
|
555
598
|
)
|
|
556
599
|
mark("build_tombstone")
|
|
600
|
+
logger.debug(lp(
|
|
601
|
+
f"step[tombstone]: deletion-vector now {tombstone_rows} row(s) "
|
|
602
|
+
f"({'rewritten' if combined_tombstone_df is not None else 'carried forward unchanged'})"
|
|
603
|
+
))
|
|
557
604
|
|
|
558
605
|
# 3b. Eager reclamation of fully-dead files. Any existing data
|
|
559
606
|
# file whose every physical row is now tombstoned is 100%
|
|
@@ -812,7 +859,7 @@ class DataWriter:
|
|
|
812
859
|
schema_json = "{}"
|
|
813
860
|
_org, _sup = self.super_table.organization, self.super_table.super_name
|
|
814
861
|
self.catalog.r.set(RK.schema(_org, _sup, simple_name), schema_json)
|
|
815
|
-
self.catalog.r.sadd(RK.
|
|
862
|
+
self.catalog.r.sadd(RK.meta_table_names(_org, _sup), simple_name)
|
|
816
863
|
except Exception as e:
|
|
817
864
|
logger.debug(f"[data-writer] schema/table_names Redis write failed: {e}")
|
|
818
865
|
|
|
@@ -862,7 +909,7 @@ class DataWriter:
|
|
|
862
909
|
f"total={total_duration:.3f} | "
|
|
863
910
|
f"convert={timings.get('convert', 0):.3f} | dedup_ts={timings.get('dedup_ts', 0):.3f} | validate={timings.get('validate', 0):.3f} | "
|
|
864
911
|
f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
|
|
865
|
-
f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
|
|
912
|
+
f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | resolve_overwrite={timings.get('resolve_overwrite', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
|
|
866
913
|
f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
|
|
867
914
|
f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
|
|
868
915
|
f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
|
|
@@ -14,7 +14,9 @@ from enum import Enum
|
|
|
14
14
|
|
|
15
15
|
from supertable.config import defaults
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
# Follow the configured SUPERTABLE_LOG_LEVEL (resolved in supertable.config.defaults)
|
|
18
|
+
# instead of hard-pinning INFO, so DEBUG surfaces the detailed write step[...] logs.
|
|
19
|
+
logging.getLogger("supertable").setLevel(defaults.default.LOG_LEVEL)
|
|
18
20
|
|
|
19
21
|
defaults.default.IS_SHOW_TIMING = True
|
|
20
22
|
|
|
@@ -285,39 +285,35 @@ def configure_httpfs_and_s3(
|
|
|
285
285
|
"true" if meta_cache_on else "false",
|
|
286
286
|
)
|
|
287
287
|
|
|
288
|
-
# External file cache — an in-memory cache of
|
|
289
|
-
# repeated queries do not re-download the same row
|
|
290
|
-
#
|
|
291
|
-
#
|
|
292
|
-
#
|
|
293
|
-
#
|
|
288
|
+
# External file cache — an in-memory cache of external (e.g. remote
|
|
289
|
+
# Parquet) data blocks so repeated queries do not re-download the same row
|
|
290
|
+
# groups. Enabled whenever a cache size is configured. The cache is
|
|
291
|
+
# bounded by the global memory_limit (DuckDB enforces that bound), which is
|
|
292
|
+
# the effective cap; a dedicated per-cache cap is applied only on builds
|
|
293
|
+
# that expose external_file_cache_max_size (no released DuckDB through
|
|
294
|
+
# 1.5.x does), so in practice memory_limit is the bound.
|
|
294
295
|
cache_size = settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_SIZE
|
|
295
296
|
can_cap = "external_file_cache_max_size" in supported
|
|
296
|
-
if cache_size and can_cap:
|
|
297
|
+
if cache_size:
|
|
297
298
|
set_if_supported("enable_external_file_cache", "true")
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
299
|
+
if can_cap:
|
|
300
|
+
set_if_supported("external_file_cache_max_size", f"'{cache_size}'")
|
|
301
|
+
cache_dir_raw = settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_DIR
|
|
302
|
+
if not cache_dir_raw:
|
|
303
|
+
# Derive from SUPERTABLE_HOME — single env var controls all paths.
|
|
304
|
+
cache_dir_raw = os.path.join(get_app_home(), "duckdb_cache")
|
|
305
|
+
# Expand ~ so DuckDB receives an absolute path.
|
|
306
|
+
cache_dir = os.path.expanduser(cache_dir_raw)
|
|
307
|
+
os.makedirs(cache_dir, exist_ok=True)
|
|
308
|
+
set_if_supported("external_file_cache_directory", f"'{cache_dir}'")
|
|
309
|
+
logger.debug(
|
|
308
310
|
"[duckdb.cache] external file cache enabled"
|
|
309
|
-
+ (f"
|
|
310
|
-
|
|
311
|
+
+ (f", capped at {cache_size}" if can_cap
|
|
312
|
+
else f", bounded by memory_limit (size={cache_size}; "
|
|
313
|
+
"this DuckDB build has no dedicated cap)")
|
|
311
314
|
)
|
|
312
315
|
else:
|
|
313
|
-
# Uncappable (or disabled via empty size) — turn it off explicitly so
|
|
314
|
-
# the DuckDB 1.5.x default-on cache cannot accumulate in memory.
|
|
315
316
|
set_if_supported("enable_external_file_cache", "false")
|
|
316
|
-
if cache_size and not can_cap:
|
|
317
|
-
logger.info(
|
|
318
|
-
"[duckdb.cache] external file cache disabled: this DuckDB build "
|
|
319
|
-
"cannot cap it (no external_file_cache_max_size)"
|
|
320
|
-
)
|
|
321
317
|
|
|
322
318
|
|
|
323
319
|
# =========================================================
|
|
@@ -589,14 +585,13 @@ def rewrite_query_with_hashed_tables(
|
|
|
589
585
|
# =========================================================
|
|
590
586
|
|
|
591
587
|
def _external_file_cache_cappable(con: duckdb.DuckDBPyConnection) -> bool:
|
|
592
|
-
"""True when this DuckDB build
|
|
593
|
-
|
|
594
|
-
DuckDB 1.5.x
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
running unbounded.
|
|
588
|
+
"""True when this DuckDB build exposes a dedicated external-file-cache cap.
|
|
589
|
+
|
|
590
|
+
No released DuckDB (through 1.5.x) exposes ``external_file_cache_max_size``;
|
|
591
|
+
the cache is in-memory and bounded by ``memory_limit``, which DuckDB
|
|
592
|
+
enforces. This predicate gates only the *dedicated* per-cache size cap:
|
|
593
|
+
when it returns False the cache still runs, bounded by ``memory_limit``
|
|
594
|
+
rather than a separate cap.
|
|
600
595
|
"""
|
|
601
596
|
try:
|
|
602
597
|
return bool(con.execute(
|
|
@@ -679,16 +674,19 @@ def init_connection(
|
|
|
679
674
|
except Exception:
|
|
680
675
|
pass # older DuckDB builds may not support this setting
|
|
681
676
|
|
|
682
|
-
# External file cache baseline. DuckDB 1.
|
|
683
|
-
#
|
|
684
|
-
#
|
|
685
|
-
#
|
|
686
|
-
#
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
677
|
+
# External file cache baseline. DuckDB (>=1.3.0) ships an in-memory cache
|
|
678
|
+
# of external files, bounded by memory_limit (DuckDB enforces that bound).
|
|
679
|
+
# Honour the configured default here: enable when a cache size is set,
|
|
680
|
+
# otherwise off. configure_httpfs_and_s3 / apply_runtime_pragmas refine
|
|
681
|
+
# this (and apply a dedicated cap on builds that expose one).
|
|
682
|
+
try:
|
|
683
|
+
con.execute(
|
|
684
|
+
"SET enable_external_file_cache="
|
|
685
|
+
+ ("true" if settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_SIZE else "false")
|
|
686
|
+
+ ";"
|
|
687
|
+
)
|
|
688
|
+
except Exception:
|
|
689
|
+
pass
|
|
692
690
|
|
|
693
691
|
# Thread count.
|
|
694
692
|
# If SUPERTABLE_DUCKDB_THREADS is set explicitly, honour it exactly.
|
|
@@ -783,17 +781,19 @@ def apply_runtime_pragmas(con: duckdb.DuckDBPyConnection, cfg) -> None:
|
|
|
783
781
|
except Exception:
|
|
784
782
|
pass
|
|
785
783
|
|
|
786
|
-
# External file cache:
|
|
787
|
-
#
|
|
788
|
-
#
|
|
789
|
-
#
|
|
784
|
+
# External file cache: enable whenever the org configures a cache size.
|
|
785
|
+
# The cache is in-memory and bounded by memory_limit (DuckDB enforces that
|
|
786
|
+
# bound), which is the effective cap. A dedicated per-cache cap is applied
|
|
787
|
+
# only on builds that expose external_file_cache_max_size (no released
|
|
788
|
+
# DuckDB through 1.5.x does); otherwise memory_limit is the bound.
|
|
790
789
|
cache_size = normalize_memory_size(cfg.duckdb_external_cache_size, default="")
|
|
791
|
-
if cache_size
|
|
790
|
+
if cache_size:
|
|
792
791
|
try:
|
|
793
792
|
con.execute("SET enable_external_file_cache=true;")
|
|
794
|
-
con
|
|
795
|
-
|
|
796
|
-
|
|
793
|
+
if _external_file_cache_cappable(con):
|
|
794
|
+
con.execute(
|
|
795
|
+
f"SET external_file_cache_max_size='{sanitize_sql_string(cache_size)}';"
|
|
796
|
+
)
|
|
797
797
|
except Exception as e:
|
|
798
798
|
logger.warning(f"[duckdb.pragma] external file cache config failed: {e}")
|
|
799
799
|
else:
|
|
@@ -892,7 +892,7 @@ def run_engine_diagnostics(cfg=None, engine: str = "lite") -> Dict[str, Any]:
|
|
|
892
892
|
("external_file_cache_max_size supported — the file cache can be capped"
|
|
893
893
|
if cappable else
|
|
894
894
|
"this build has no external_file_cache_max_size — the file cache "
|
|
895
|
-
"
|
|
895
|
+
"runs bounded by memory_limit rather than a dedicated cap"),
|
|
896
896
|
version)
|
|
897
897
|
except Exception as e:
|
|
898
898
|
add("version", "DuckDB version", "warn", f"version() failed: {e}")
|
|
@@ -1039,18 +1039,19 @@ def run_engine_diagnostics(cfg=None, engine: str = "lite") -> Dict[str, Any]:
|
|
|
1039
1039
|
getattr(cfg, "duckdb_external_cache_size", ""), default=""
|
|
1040
1040
|
)
|
|
1041
1041
|
if efc_on and not cappable:
|
|
1042
|
-
add("cache", "External file cache", "
|
|
1043
|
-
"Cache is ON
|
|
1044
|
-
"
|
|
1042
|
+
add("cache", "External file cache", "ok",
|
|
1043
|
+
"Cache is ON, bounded by memory_limit — this build has no "
|
|
1044
|
+
"dedicated cap (external_file_cache_max_size), so memory_limit "
|
|
1045
|
+
"is the bound", "on · memory_limit")
|
|
1045
1046
|
elif efc_on and cappable:
|
|
1046
1047
|
add("cache", "External file cache", "ok",
|
|
1047
1048
|
f"Cache is ON and capped at {cache_cfg or 'the configured size'}",
|
|
1048
1049
|
"on · capped")
|
|
1049
1050
|
else:
|
|
1050
1051
|
add("cache", "External file cache", "ok",
|
|
1051
|
-
"Cache is OFF —
|
|
1052
|
-
"
|
|
1053
|
-
"off")
|
|
1052
|
+
"Cache is OFF — remote files are re-fetched per query (set "
|
|
1053
|
+
"SUPERTABLE_DUCKDB_EXTERNAL_CACHE_SIZE to enable, bounded by "
|
|
1054
|
+
"memory_limit)", "off")
|
|
1054
1055
|
except Exception as e:
|
|
1055
1056
|
add("cache", "External file cache", "warn", f"Could not read cache state: {e}")
|
|
1056
1057
|
|
|
@@ -225,6 +225,7 @@ def _read_parquet_safe(
|
|
|
225
225
|
path: str,
|
|
226
226
|
profiler: Optional[Profiler] = None,
|
|
227
227
|
file_size: int = 0,
|
|
228
|
+
columns: Optional[List[str]] = None,
|
|
228
229
|
) -> Optional[polars.DataFrame]:
|
|
229
230
|
p = profiler or get_null_profiler()
|
|
230
231
|
if not _safe_exists(path, profiler=p):
|
|
@@ -232,7 +233,13 @@ def _read_parquet_safe(
|
|
|
232
233
|
return None
|
|
233
234
|
try:
|
|
234
235
|
with p.span("io.read_parquet"):
|
|
235
|
-
|
|
236
|
+
# Project to *columns* when given so only those column chunks are
|
|
237
|
+
# read (memory-bound fallback); gated so storages/test doubles that
|
|
238
|
+
# only accept ``path`` keep working on the unprojected paths.
|
|
239
|
+
tbl = (
|
|
240
|
+
_get_storage().read_parquet(path, columns=columns)
|
|
241
|
+
if columns else _get_storage().read_parquet(path)
|
|
242
|
+
) # -> pyarrow.Table
|
|
236
243
|
with p.span("io.arrow_to_polars"):
|
|
237
244
|
df = polars.from_arrow(tbl)
|
|
238
245
|
p.add("files_read", 1)
|
|
@@ -627,11 +634,12 @@ def _write_single_parquet_file(
|
|
|
627
634
|
rows = write_df.shape[0]
|
|
628
635
|
columns = write_df.shape[1]
|
|
629
636
|
|
|
630
|
-
# Ensure target directory exists
|
|
637
|
+
# Ensure the target directory exists. makedirs is idempotent on local
|
|
638
|
+
# storage and a no-op on object storage; calling it directly avoids a
|
|
639
|
+
# pointless prefix HEAD (which always 404s) on object stores.
|
|
631
640
|
with p.span("write.ensure_dir"):
|
|
632
641
|
try:
|
|
633
|
-
|
|
634
|
-
_get_storage().makedirs(target_dir)
|
|
642
|
+
_get_storage().makedirs(target_dir)
|
|
635
643
|
except Exception:
|
|
636
644
|
pass
|
|
637
645
|
|
|
@@ -728,6 +736,7 @@ def filter_stale_incoming_rows(
|
|
|
728
736
|
newer_than_col: str,
|
|
729
737
|
file_cache: Optional[Dict[str, polars.DataFrame]] = None,
|
|
730
738
|
profiler: Optional[Profiler] = None,
|
|
739
|
+
read_columns: Optional[List[str]] = None,
|
|
731
740
|
) -> polars.DataFrame:
|
|
732
741
|
"""
|
|
733
742
|
Remove rows from *incoming_df* that are stale or already present in existing data.
|
|
@@ -762,7 +771,7 @@ def filter_stale_incoming_rows(
|
|
|
762
771
|
# Read and collect relevant rows from overlapping files
|
|
763
772
|
existing_parts: List[polars.DataFrame] = []
|
|
764
773
|
for file_path, file_size in overlap_true_files:
|
|
765
|
-
part = _read_parquet_safe(file_path, profiler=p, file_size=file_size)
|
|
774
|
+
part = _read_parquet_safe(file_path, profiler=p, file_size=file_size, columns=read_columns)
|
|
766
775
|
if part is None:
|
|
767
776
|
continue
|
|
768
777
|
# Cache the unfiltered DataFrame as read (only read_columns when a projection is given) for downstream reuse (avoids double-read)
|
|
@@ -880,6 +889,7 @@ def identify_deleted_rowids(
|
|
|
880
889
|
overwrite_columns: List[str],
|
|
881
890
|
file_cache: Optional[Dict[str, polars.DataFrame]] = None,
|
|
882
891
|
profiler: Optional[Profiler] = None,
|
|
892
|
+
read_columns: Optional[List[str]] = None,
|
|
883
893
|
) -> List[Tuple[str, int]]:
|
|
884
894
|
"""Find the ``(file, __rowid__)`` pairs of existing rows matching a delete predicate.
|
|
885
895
|
|
|
@@ -911,7 +921,7 @@ def identify_deleted_rowids(
|
|
|
911
921
|
if file_cache is not None and file in file_cache:
|
|
912
922
|
existing_df = file_cache.get(file)
|
|
913
923
|
else:
|
|
914
|
-
existing_df = _read_parquet_safe(file, profiler=p, file_size=file_size)
|
|
924
|
+
existing_df = _read_parquet_safe(file, profiler=p, file_size=file_size, columns=read_columns)
|
|
915
925
|
if existing_df is None:
|
|
916
926
|
continue
|
|
917
927
|
if ROWID_COL not in existing_df.columns:
|
|
@@ -1110,6 +1120,10 @@ def _duckdb_probe_overlap_matches(
|
|
|
1110
1120
|
f"filename=TRUE, hive_partitioning=FALSE) AS src "
|
|
1111
1121
|
f"SEMI JOIN {ik_name} AS k ON {join_cond}"
|
|
1112
1122
|
)
|
|
1123
|
+
logging.debug(
|
|
1124
|
+
f"[write-probe] duckdb scan: {len(paths)} file(s), "
|
|
1125
|
+
f"project={select_cols}, semi-join on {incoming_keys.height} key(s)"
|
|
1126
|
+
)
|
|
1113
1127
|
with p.span("io.duckdb_probe"):
|
|
1114
1128
|
return con.execute(sql).pl()
|
|
1115
1129
|
|
|
@@ -1168,6 +1182,11 @@ def _duckdb_probe_overlap_matches(
|
|
|
1168
1182
|
return None
|
|
1169
1183
|
p.add("probe_files", len(duck_paths))
|
|
1170
1184
|
p.add("probe_rows_matched", int(matched.height))
|
|
1185
|
+
logging.debug(
|
|
1186
|
+
f"[write-probe] duckdb scan matched {matched.height} existing row(s) "
|
|
1187
|
+
f"across {len(duck_paths)} file(s) (only key/__rowid__ columns read, "
|
|
1188
|
+
f"row groups skipped by footer min/max)"
|
|
1189
|
+
)
|
|
1171
1190
|
return matched
|
|
1172
1191
|
|
|
1173
1192
|
|
|
@@ -1273,6 +1292,11 @@ def resolve_overwrite_writes(
|
|
|
1273
1292
|
return incoming_df, []
|
|
1274
1293
|
|
|
1275
1294
|
incoming_keys = incoming_df.select(overwrite_columns).unique()
|
|
1295
|
+
logging.debug(
|
|
1296
|
+
f"[write-probe] resolve: {len(overlap_true)} overlapping file(s), "
|
|
1297
|
+
f"{incoming_keys.height} unique incoming key(s) on {overwrite_columns}, "
|
|
1298
|
+
f"newer_than={newer_than_col}"
|
|
1299
|
+
)
|
|
1276
1300
|
matched = _duckdb_probe_overlap_matches(
|
|
1277
1301
|
overlap_true, overwrite_columns, newer_than_col, incoming_keys, profiler=p,
|
|
1278
1302
|
)
|
|
@@ -1286,6 +1310,19 @@ def resolve_overwrite_writes(
|
|
|
1286
1310
|
|
|
1287
1311
|
# ---- Fallback: original polars full-read path (semantics oracle) ----
|
|
1288
1312
|
p.add("overwrite_resolve_fallback", 1)
|
|
1313
|
+
# Project reads to only the columns the fallback consumes — overwrite keys
|
|
1314
|
+
# (+ newer-than for stale filtering) + __rowid__ (for the delete vector) —
|
|
1315
|
+
# so wide tables are not fully materialised into memory. The shared
|
|
1316
|
+
# file_cache holds this projected union; each consumer selects its subset.
|
|
1317
|
+
read_columns = list(dict.fromkeys(
|
|
1318
|
+
list(overwrite_columns)
|
|
1319
|
+
+ ([newer_than_col] if newer_than_col else [])
|
|
1320
|
+
+ [ROWID_COL]
|
|
1321
|
+
))
|
|
1322
|
+
logging.debug(
|
|
1323
|
+
f"[write-probe] polars full-read fallback over {len(overlap_true)} file(s), "
|
|
1324
|
+
f"reading only {read_columns}"
|
|
1325
|
+
)
|
|
1289
1326
|
file_cache: Dict[str, polars.DataFrame] = {}
|
|
1290
1327
|
if newer_than_col:
|
|
1291
1328
|
filtered = filter_stale_incoming_rows(
|
|
@@ -1295,12 +1332,13 @@ def resolve_overwrite_writes(
|
|
|
1295
1332
|
newer_than_col=newer_than_col,
|
|
1296
1333
|
file_cache=file_cache,
|
|
1297
1334
|
profiler=p,
|
|
1335
|
+
read_columns=read_columns,
|
|
1298
1336
|
)
|
|
1299
1337
|
else:
|
|
1300
1338
|
filtered = incoming_df
|
|
1301
1339
|
pairs = identify_deleted_rowids(
|
|
1302
1340
|
filtered, overlapping_files, overwrite_columns,
|
|
1303
|
-
file_cache=file_cache, profiler=p,
|
|
1341
|
+
file_cache=file_cache, profiler=p, read_columns=read_columns,
|
|
1304
1342
|
)
|
|
1305
1343
|
return filtered, pairs
|
|
1306
1344
|
|
|
@@ -1354,8 +1392,8 @@ def build_tombstone_file(
|
|
|
1354
1392
|
combined = combined.unique(subset=[ROWID_COL], keep="first")
|
|
1355
1393
|
|
|
1356
1394
|
try:
|
|
1357
|
-
|
|
1358
|
-
|
|
1395
|
+
# Direct makedirs (idempotent local, no-op object) — avoids a 404 prefix HEAD.
|
|
1396
|
+
_get_storage().makedirs(tombstone_dir)
|
|
1359
1397
|
except Exception:
|
|
1360
1398
|
pass
|
|
1361
1399
|
|
|
@@ -1418,8 +1456,8 @@ def reclaim_fully_dead_files(
|
|
|
1418
1456
|
return fully_dead, None, None
|
|
1419
1457
|
|
|
1420
1458
|
try:
|
|
1421
|
-
|
|
1422
|
-
|
|
1459
|
+
# Direct makedirs (idempotent local, no-op object) — avoids a 404 prefix HEAD.
|
|
1460
|
+
_get_storage().makedirs(tombstone_dir)
|
|
1423
1461
|
except Exception:
|
|
1424
1462
|
pass
|
|
1425
1463
|
|
|
@@ -1688,8 +1726,8 @@ def build_stats_file(
|
|
|
1688
1726
|
combined = new_df
|
|
1689
1727
|
|
|
1690
1728
|
try:
|
|
1691
|
-
|
|
1692
|
-
|
|
1729
|
+
# Direct makedirs (idempotent local, no-op object) — avoids a 404 prefix HEAD.
|
|
1730
|
+
_get_storage().makedirs(stats_dir)
|
|
1693
1731
|
except Exception:
|
|
1694
1732
|
pass
|
|
1695
1733
|
|
|
@@ -128,19 +128,10 @@ class SimpleTable:
|
|
|
128
128
|
self.data_dir = os.path.join(self.simple_dir, "data")
|
|
129
129
|
self.snapshot_dir = os.path.join(self.simple_dir, "snapshots")
|
|
130
130
|
|
|
131
|
-
logger.debug(f"simple_dir: {self.simple_dir}")
|
|
132
|
-
logger.debug(f"data_dir: {self.data_dir}")
|
|
133
|
-
logger.debug(f"snapshot_dir: {self.snapshot_dir}")
|
|
134
|
-
|
|
135
131
|
# Fast path: if meta:leaf exists, don't touch storage
|
|
136
132
|
if self.catalog.leaf_exists(
|
|
137
133
|
self.super_table.organization, self.super_table.super_name, self.simple_name
|
|
138
134
|
):
|
|
139
|
-
logger.debug(
|
|
140
|
-
f"[SimpleTable] Leaf exists in Redis for "
|
|
141
|
-
f"{self.super_table.organization}/{self.super_table.super_name}/{self.simple_name}; "
|
|
142
|
-
f"skipping storage mkdirs and bootstrap."
|
|
143
|
-
)
|
|
144
135
|
return
|
|
145
136
|
|
|
146
137
|
# Read-only opt-out: refuse to bootstrap as a side effect. The
|