supertable 2.1.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supertable-2.1.0/supertable.egg-info → supertable-2.2.0}/PKG-INFO +1 -1
- {supertable-2.1.0 → supertable-2.2.0}/pyproject.toml +1 -1
- {supertable-2.1.0 → supertable-2.2.0}/setup.py +1 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable/__init__.py +9 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/writer_redis.py +16 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/settings.py +22 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/data_reader.py +49 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/data_writer.py +50 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/defaults.py +1 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/data_estimator.py +9 -1
- supertable-2.2.0/supertable/errors.py +75 -0
- supertable-2.2.0/supertable/gc/__init__.py +26 -0
- supertable-2.2.0/supertable/gc/cleaner.py +258 -0
- supertable-2.2.0/supertable/gc/daemon.py +280 -0
- supertable-2.2.0/supertable/gc/queue.py +221 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/meta_reader.py +34 -14
- supertable-2.2.0/supertable/monitoring/__init__.py +62 -0
- supertable-2.2.0/supertable/monitoring/partitions.py +572 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/monitoring_writer.py +39 -11
- {supertable-2.1.0 → supertable-2.2.0}/supertable/plan_extender.py +37 -10
- {supertable-2.1.0 → supertable-2.2.0}/supertable/redis_catalog.py +10 -2
- {supertable-2.1.0 → supertable-2.2.0}/supertable/redis_keys.py +178 -16
- {supertable-2.1.0 → supertable-2.2.0}/supertable/simple_table.py +32 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable/super_table.py +27 -1
- supertable-2.2.0/supertable/tests/test_create_if_missing.py +267 -0
- supertable-2.2.0/supertable/tests/test_data_reader_preflight.py +249 -0
- supertable-2.2.0/supertable/tests/test_errors.py +81 -0
- supertable-2.2.0/supertable/tests/test_gc_cleaner.py +330 -0
- supertable-2.2.0/supertable/tests/test_gc_daemon.py +251 -0
- supertable-2.2.0/supertable/tests/test_gc_queue.py +331 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_meta_reader.py +9 -1
- supertable-2.2.0/supertable/tests/test_monitoring_partitions.py +724 -0
- supertable-2.2.0/supertable/tests/test_monitoring_sink_guard.py +167 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_redis_key_prefix.py +84 -14
- {supertable-2.1.0 → supertable-2.2.0/supertable.egg-info}/PKG-INFO +1 -1
- {supertable-2.1.0 → supertable-2.2.0}/supertable.egg-info/SOURCES.txt +15 -0
- {supertable-2.1.0 → supertable-2.2.0}/LICENSE +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/README.md +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/requirements.txt +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/setup.cfg +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/admin.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/chain.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/consumers.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/crypto.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/events.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/export.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/logger.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/middleware.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/reader.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/retention.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/tests/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/tests/test_chain.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/tests/test_crypto.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/tests/test_emit.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/tests/test_events.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/tests/test_retention.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/audit/writer_parquet.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/defaults.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/homedir.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/tests/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/tests/test_defaults.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/tests/test_homedir.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/config/tests/test_settings.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/data_classes.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/__main__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/check_filter_builder.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/controller.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/dummy_data.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/read_parquet_header.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/webshop/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/webshop/core.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/webshop/defaults.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/webshop/generate.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/webshop/load.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/demo/webshop/topup.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/duckdb_lite.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/duckdb_pro.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/engine_common.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/engine_enum.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/executor.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/plan_stats.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/spark_thrift.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/tests/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/tests/conftest.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/tests/test_dedup_read.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/engine/tests/test_engine.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/benchmarks/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/file_lock.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/redis_lock.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/tests/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/tests/test_file_lock.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/locking/tests/test_redis_lock.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/logging.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/mirroring/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/mirroring/mirror_delta.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/mirroring/mirror_formats.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/mirroring/mirror_iceberg.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/mirroring/mirror_parquet.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/processing.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/query_plan_manager.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/access_control.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/filter_builder.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/permissions.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/role_manager.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/row_column_security.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/tests/test_filter_builder.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/tests/test_rbac.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/rbac/user_manager.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/redis_connector.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/redis_infra.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/staging_area.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/azure_storage.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/gcp_storage.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/local_storage.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/minio_storage.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/s3_storage.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/storage_factory.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/storage_interface.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/storage/tests/test_storage.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/super_pipe.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_align_to_schema_fix.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_data_reader.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_data_writer.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_data_writer_comprehensive.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_data_writer_tombstones.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_dedup_on_read_write.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_newer_than.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_process_delete_only.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_processing.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_query_sql.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_simple_table.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_small_file_compaction.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_super_table.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/tests/test_supertable_all.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/utils/__init__.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/utils/helper.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/utils/sql_parser.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable/utils/timer.py +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable.egg-info/dependency_links.txt +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable.egg-info/entry_points.txt +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable.egg-info/requires.txt +0 -0
- {supertable-2.1.0 → supertable-2.2.0}/supertable.egg-info/top_level.txt +0 -0
|
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
|
|
|
19
19
|
|
|
20
20
|
setup(
|
|
21
21
|
name="supertable",
|
|
22
|
-
version="2.1.0",
|
|
22
|
+
version="2.2.0",
|
|
23
23
|
description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
|
|
24
24
|
long_description=long_description,
|
|
25
25
|
long_description_content_type="text/markdown",
|
|
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
|
|
|
25
25
|
project documentation for the full API surface.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
__version__ = "2.1.0"
|
|
28
|
+
__version__ = "2.2.0"
|
|
29
29
|
|
|
30
30
|
# Re-export the core public surface so users can do ``from supertable import …``
|
|
31
31
|
# instead of remembering submodule paths.
|
|
@@ -39,6 +39,11 @@ from supertable.super_pipe import SuperPipe
|
|
|
39
39
|
from supertable.redis_catalog import RedisCatalog
|
|
40
40
|
from supertable.rbac.role_manager import RoleManager
|
|
41
41
|
from supertable.rbac.user_manager import UserManager
|
|
42
|
+
from supertable.errors import (
|
|
43
|
+
SupertableLookupError,
|
|
44
|
+
SuperTableNotFoundError,
|
|
45
|
+
TableNotFoundError,
|
|
46
|
+
)
|
|
42
47
|
|
|
43
48
|
__all__ = [
|
|
44
49
|
"__version__",
|
|
@@ -55,4 +60,7 @@ __all__ = [
|
|
|
55
60
|
"RedisCatalog",
|
|
56
61
|
"RoleManager",
|
|
57
62
|
"UserManager",
|
|
63
|
+
"SupertableLookupError",
|
|
64
|
+
"SuperTableNotFoundError",
|
|
65
|
+
"TableNotFoundError",
|
|
58
66
|
]
|
|
@@ -50,9 +50,24 @@ class RedisAuditWriter:
|
|
|
50
50
|
self._instance_id = instance_id
|
|
51
51
|
self._maxlen = maxlen
|
|
52
52
|
self._stream = RK.audit_stream(org)
|
|
53
|
-
|
|
53
|
+
# The chain-head key is lazy: read-only callers (e.g. the
|
|
54
|
+
# audit reader) instantiate this class purely to use
|
|
55
|
+
# ``query()`` and pass instance_id="". Building the
|
|
56
|
+
# chain-head key with an empty instance_id would fail v2's
|
|
57
|
+
# segment validator, so defer construction until a method
|
|
58
|
+
# that actually needs it is called.
|
|
54
59
|
self._ensure_stream()
|
|
55
60
|
|
|
61
|
+
@property
|
|
62
|
+
def _chain_key(self) -> str:
|
|
63
|
+
if not self._instance_id:
|
|
64
|
+
raise RuntimeError(
|
|
65
|
+
"RedisAuditWriter._chain_key requires a non-empty "
|
|
66
|
+
"instance_id; this writer was instantiated for "
|
|
67
|
+
"read-only use (query) and cannot save/load chain head."
|
|
68
|
+
)
|
|
69
|
+
return RK.audit_chain_head(self._org, self._instance_id)
|
|
70
|
+
|
|
56
71
|
def _ensure_stream(self) -> None:
|
|
57
72
|
"""Create the stream and archival consumer group if they don't exist."""
|
|
58
73
|
try:
|
|
@@ -282,6 +282,21 @@ class Settings:
|
|
|
282
282
|
SUPERTABLE_SHARE_PRESIGN_TTL: int = 14400 # SUPERTABLE_SHARE_PRESIGN_TTL (seconds, default 4h)
|
|
283
283
|
SUPERTABLE_SHARE_REFRESH_BUFFER: int = 600 # SUPERTABLE_SHARE_REFRESH_BUFFER (seconds, refresh 10min before expiry)
|
|
284
284
|
|
|
285
|
+
# ── Storage GC (deferred deletion of sunset parquets + old snapshot JSONs) ──
|
|
286
|
+
# Sunset parquet files and old snapshot JSONs are never deleted by the
|
|
287
|
+
# writer directly. When either flag is enabled, the writer XADDs paths
|
|
288
|
+
# to a per-table Redis STREAM after a successful leaf-CAS. A separate
|
|
289
|
+
# cleaner daemon (``python -m supertable.gc.daemon``) drains entries
|
|
290
|
+
# older than ``SUPERTABLE_GC_DELAY_SEC`` and calls ``storage.delete()``.
|
|
291
|
+
# The delay window avoids the race where an in-flight reader's
|
|
292
|
+
# ``parquet_scan([...])`` resolves a file path right before the writer
|
|
293
|
+
# deletes it. Defaults below preserve today's behaviour (off, unlimited).
|
|
294
|
+
SUPERTABLE_SNAPSHOT_RETENTION: int = 0 # SUPERTABLE_SNAPSHOT_RETENTION (0 = keep all snapshot JSONs)
|
|
295
|
+
SUPERTABLE_SUNSET_GC_ENABLED: bool = False # SUPERTABLE_SUNSET_GC_ENABLED (delete sunset parquets via queue)
|
|
296
|
+
SUPERTABLE_GC_DELAY_SEC: int = 1800 # SUPERTABLE_GC_DELAY_SEC (min age before cleaner deletes)
|
|
297
|
+
SUPERTABLE_GC_SLEEP_SEC: int = 60 # SUPERTABLE_GC_SLEEP_SEC (cleaner loop sleep)
|
|
298
|
+
SUPERTABLE_GC_BATCH_SIZE: int = 500 # SUPERTABLE_GC_BATCH_SIZE (max entries processed per table per tick)
|
|
299
|
+
|
|
285
300
|
|
|
286
301
|
# ── Convenience properties ───────────────────────────────────────
|
|
287
302
|
|
|
@@ -528,6 +543,13 @@ def _build_settings() -> Settings:
|
|
|
528
543
|
# Data Sharing
|
|
529
544
|
SUPERTABLE_SHARE_PRESIGN_TTL=_env_int("SUPERTABLE_SHARE_PRESIGN_TTL", 14400),
|
|
530
545
|
SUPERTABLE_SHARE_REFRESH_BUFFER=_env_int("SUPERTABLE_SHARE_REFRESH_BUFFER", 600),
|
|
546
|
+
|
|
547
|
+
# ── Storage GC ───────────────────────────────────────────────
|
|
548
|
+
SUPERTABLE_SNAPSHOT_RETENTION=_env_int("SUPERTABLE_SNAPSHOT_RETENTION", 0),
|
|
549
|
+
SUPERTABLE_SUNSET_GC_ENABLED=_env_bool("SUPERTABLE_SUNSET_GC_ENABLED", False),
|
|
550
|
+
SUPERTABLE_GC_DELAY_SEC=_env_int("SUPERTABLE_GC_DELAY_SEC", 1800),
|
|
551
|
+
SUPERTABLE_GC_SLEEP_SEC=_env_int("SUPERTABLE_GC_SLEEP_SEC", 60),
|
|
552
|
+
SUPERTABLE_GC_BATCH_SIZE=_env_int("SUPERTABLE_GC_BATCH_SIZE", 500),
|
|
531
553
|
)
|
|
532
554
|
|
|
533
555
|
|
|
@@ -10,6 +10,7 @@ from typing import Optional, Tuple, Any, List, Dict
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
|
|
12
12
|
from supertable.config.defaults import logger
|
|
13
|
+
from supertable.errors import SuperTableNotFoundError, TableNotFoundError
|
|
13
14
|
from supertable.storage.storage_factory import get_storage
|
|
14
15
|
from supertable.storage.storage_interface import StorageInterface
|
|
15
16
|
from supertable.utils.timer import Timer
|
|
@@ -59,6 +60,44 @@ class DataReader:
|
|
|
59
60
|
def _lp(self, msg: str) -> str:
|
|
60
61
|
return f"{self._log_ctx}{msg}"
|
|
61
62
|
|
|
63
|
+
def _assert_targets_exist(self, physical_tables) -> None:
|
|
64
|
+
"""Fail fast if any referenced (super, simple) is missing in Redis.
|
|
65
|
+
|
|
66
|
+
The read path must never create catalog entries as a side effect
|
|
67
|
+
of resolving a query. ``SuperTable`` / ``SimpleTable``
|
|
68
|
+
constructors used to do exactly that for callers that didn't pass
|
|
69
|
+
``create_if_missing=False`` — this guard is the SDK-level
|
|
70
|
+
invariant that says "reads cannot mint tables".
|
|
71
|
+
|
|
72
|
+
Raises:
|
|
73
|
+
SuperTableNotFoundError: when the supertable's
|
|
74
|
+
``meta:root`` pointer is missing.
|
|
75
|
+
TableNotFoundError: when the simple table's
|
|
76
|
+
``meta:leaf:doc:{simple}`` pointer is missing.
|
|
77
|
+
"""
|
|
78
|
+
if not physical_tables:
|
|
79
|
+
return
|
|
80
|
+
# One catalog handle for the whole loop — cheaper than letting
|
|
81
|
+
# each .exists() call open a fresh connection.
|
|
82
|
+
catalog = RedisCatalog()
|
|
83
|
+
# Dedup by (super, simple) — SQL may mention the same table
|
|
84
|
+
# multiple times via different aliases.
|
|
85
|
+
seen = set()
|
|
86
|
+
for td in physical_tables:
|
|
87
|
+
super_name = td.super_name
|
|
88
|
+
simple_name = td.simple_name
|
|
89
|
+
if not super_name or not simple_name:
|
|
90
|
+
continue
|
|
91
|
+
key = (super_name, simple_name)
|
|
92
|
+
if key in seen:
|
|
93
|
+
continue
|
|
94
|
+
seen.add(key)
|
|
95
|
+
if not catalog.root_exists(self.organization, super_name):
|
|
96
|
+
raise SuperTableNotFoundError(self.organization, super_name)
|
|
97
|
+
if not catalog.leaf_exists(self.organization, super_name, simple_name):
|
|
98
|
+
raise TableNotFoundError(
|
|
99
|
+
self.organization, super_name, simple_name
|
|
100
|
+
)
|
|
62
101
|
|
|
63
102
|
def execute(
|
|
64
103
|
self,
|
|
@@ -90,6 +129,16 @@ class DataReader:
|
|
|
90
129
|
)
|
|
91
130
|
|
|
92
131
|
try:
|
|
132
|
+
# Read-path policy: reads never create. Verify every
|
|
133
|
+
# referenced (super, simple) exists in the Redis catalog
|
|
134
|
+
# *before* the estimator / executor get a chance to
|
|
135
|
+
# side-effect-bootstrap them via SuperTable constructor
|
|
136
|
+
# calls. On a miss this raises SuperTableNotFoundError /
|
|
137
|
+
# TableNotFoundError, which the surrounding ``except`` turns
|
|
138
|
+
# into the same (empty_df, Status.ERROR, message) tuple as
|
|
139
|
+
# every other read failure — no state is touched.
|
|
140
|
+
self._assert_targets_exist(physical_tables)
|
|
141
|
+
|
|
93
142
|
# Make executor aware of storage for presign retry
|
|
94
143
|
executor = Executor(storage=self.storage, organization=self.organization)
|
|
95
144
|
|
|
@@ -11,6 +11,9 @@ import polars
|
|
|
11
11
|
from polars import DataFrame
|
|
12
12
|
|
|
13
13
|
from supertable.config.defaults import logger
|
|
14
|
+
from supertable.config.settings import settings
|
|
15
|
+
from supertable.gc.queue import collect_old_snapshot_paths, enqueue_deletions
|
|
16
|
+
from supertable.monitoring.partitions import MONITORING_SINK_TABLES
|
|
14
17
|
from supertable.monitoring_writer import MonitoringWriter # async monitoring
|
|
15
18
|
from supertable.super_table import SuperTable
|
|
16
19
|
from supertable import redis_keys as RK
|
|
@@ -442,6 +445,46 @@ class DataWriter:
|
|
|
442
445
|
self.catalog.bump_root(self.super_table.organization, self.super_table.super_name, now_ms=now_ms)
|
|
443
446
|
mark("bump_root")
|
|
444
447
|
|
|
448
|
+
# --- Enqueue deferred deletions (post-CAS, post-bump) ----------
|
|
449
|
+
# Strict ordering matters: leaf-CAS + root-bump have committed
|
|
450
|
+
# the new snapshot as the authoritative state. Only NOW is it
|
|
451
|
+
# safe to schedule physical deletion of files that the old
|
|
452
|
+
# snapshot still references — any reader that loaded the
|
|
453
|
+
# previous leaf payload will finish before the cleaner's delay
|
|
454
|
+
# window expires.
|
|
455
|
+
#
|
|
456
|
+
# Both blocks are best-effort: enqueue_deletions and
|
|
457
|
+
# collect_old_snapshot_paths swallow Redis/storage errors and
|
|
458
|
+
# log a warning. A GC failure must never fail the write.
|
|
459
|
+
try:
|
|
460
|
+
_org = self.super_table.organization
|
|
461
|
+
_sup = self.super_table.super_name
|
|
462
|
+
if settings.SUPERTABLE_SUNSET_GC_ENABLED and sunset_files:
|
|
463
|
+
enqueue_deletions(
|
|
464
|
+
self.catalog,
|
|
465
|
+
_org, _sup, simple_name,
|
|
466
|
+
"parquet",
|
|
467
|
+
list(sunset_files),
|
|
468
|
+
write_id=qid,
|
|
469
|
+
)
|
|
470
|
+
if settings.SUPERTABLE_SNAPSHOT_RETENTION > 0:
|
|
471
|
+
old_paths = collect_old_snapshot_paths(
|
|
472
|
+
new_snapshot_dict,
|
|
473
|
+
self.super_table.storage,
|
|
474
|
+
settings.SUPERTABLE_SNAPSHOT_RETENTION,
|
|
475
|
+
)
|
|
476
|
+
if old_paths:
|
|
477
|
+
enqueue_deletions(
|
|
478
|
+
self.catalog,
|
|
479
|
+
_org, _sup, simple_name,
|
|
480
|
+
"snapshot",
|
|
481
|
+
old_paths,
|
|
482
|
+
write_id=qid,
|
|
483
|
+
)
|
|
484
|
+
except Exception as e:
|
|
485
|
+
logger.warning(lp(f"GC enqueue failed (write still succeeded): {e}"))
|
|
486
|
+
mark("gc_enqueue")
|
|
487
|
+
|
|
445
488
|
# --- Store schema + table name in Redis (permanent, not cache) ---
|
|
446
489
|
try:
|
|
447
490
|
schema_raw = new_snapshot_dict.get("schema", {})
|
|
@@ -533,8 +576,14 @@ class DataWriter:
|
|
|
533
576
|
# Monitoring enqueue + flush is fully outside any data locks.
|
|
534
577
|
# MonitoringWriter.__exit__ calls request_flush() so the metric is
|
|
535
578
|
# guaranteed to reach Redis before this scope closes.
|
|
579
|
+
#
|
|
580
|
+
# Loop guard: writes targeted at a monitoring sink table
|
|
581
|
+
# (``__writes__``/``__reads__``/``__mcp__``/``__plans__``)
|
|
582
|
+
# are deliberately not measured — the external orchestrator
|
|
583
|
+
# that drained the partition is *writing back* the metric,
|
|
584
|
+
# and re-emitting it would create a 1:1 amplification cycle.
|
|
536
585
|
try:
|
|
537
|
-
if stats_payload is not None:
|
|
586
|
+
if stats_payload is not None and simple_name not in MONITORING_SINK_TABLES:
|
|
538
587
|
# Monitoring is org-wide as of SDK 2.2.0 — record the
|
|
539
588
|
# touched supertable in the payload's ``supertables``
|
|
540
589
|
# field. A DataWriter only touches one supertable, but
|
|
@@ -20,7 +20,7 @@ defaults.default.IS_SHOW_TIMING = True
|
|
|
20
20
|
|
|
21
21
|
# --- Identity ---------------------------------------------------------------
|
|
22
22
|
organization = "kladna-soft"
|
|
23
|
-
super_name = "
|
|
23
|
+
super_name = "demo"
|
|
24
24
|
simple_name = "facts"
|
|
25
25
|
|
|
26
26
|
# --- Roles (must match those created in 1.2. create_roles.py) ---------------
|
|
@@ -306,7 +306,15 @@ class DataEstimator:
|
|
|
306
306
|
|
|
307
307
|
for simple_name in tables:
|
|
308
308
|
snapshots = self._filter_snapshots(super_name, simple_name, all_snapshots)
|
|
309
|
-
|
|
309
|
+
# Defence in depth: the read path must never bootstrap a
|
|
310
|
+
# missing supertable. ``DataReader._assert_targets_exist``
|
|
311
|
+
# is the primary guard at the entry point; this kwarg
|
|
312
|
+
# ensures any other caller of ``DataEstimator`` (or any
|
|
313
|
+
# future code path) cannot accidentally side-effect a
|
|
314
|
+
# creation through the SuperTable constructor.
|
|
315
|
+
super_table = SuperTable(
|
|
316
|
+
super_name, self.organization, create_if_missing=False,
|
|
317
|
+
)
|
|
310
318
|
|
|
311
319
|
parquet_files: List[str] = []
|
|
312
320
|
schema: Set[str] = set()
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# route: supertable.errors
|
|
2
|
+
"""
|
|
3
|
+
Public exception hierarchy for SuperTable.
|
|
4
|
+
|
|
5
|
+
The exceptions defined here are part of the SDK's stable public surface
|
|
6
|
+
and are raised at well-known boundaries (read-path pre-flight checks,
|
|
7
|
+
constructor opt-outs, etc.). API layers wrap them into HTTP responses;
|
|
8
|
+
CLI layers print their messages directly.
|
|
9
|
+
|
|
10
|
+
Hierarchy
|
|
11
|
+
---------
|
|
12
|
+
|
|
13
|
+
LookupError (stdlib)
|
|
14
|
+
└── SupertableLookupError (this module)
|
|
15
|
+
├── SuperTableNotFoundError
|
|
16
|
+
└── TableNotFoundError
|
|
17
|
+
|
|
18
|
+
Inheriting from the stdlib ``LookupError`` means existing
|
|
19
|
+
``except LookupError`` callers keep working (``except KeyError`` will not catch these: ``KeyError`` is a sibling subclass of ``LookupError``) — every
|
|
20
|
+
SuperTable lookup failure is a "key not found" at heart.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SupertableLookupError(LookupError):
|
|
26
|
+
"""Base for catalog lookup failures on read paths.
|
|
27
|
+
|
|
28
|
+
Carries the ``organization`` it was raised against so API/CLI layers
|
|
29
|
+
can format the error without re-parsing the message.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, message: str, organization: str):
|
|
33
|
+
super().__init__(message)
|
|
34
|
+
self.organization = organization
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class SuperTableNotFoundError(SupertableLookupError):
|
|
38
|
+
"""Raised when a SuperTable name is referenced but no Redis ``meta:root``
|
|
39
|
+
pointer exists for it.
|
|
40
|
+
|
|
41
|
+
Read-side code (``DataReader``, ``MetaReader``, ``DataEstimator``)
|
|
42
|
+
raises this instead of silently bootstrapping a new supertable as a
|
|
43
|
+
side effect of constructing the Python object.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
def __init__(self, organization: str, super_name: str):
|
|
47
|
+
super().__init__(
|
|
48
|
+
f"SuperTable not found: {organization}/{super_name}",
|
|
49
|
+
organization=organization,
|
|
50
|
+
)
|
|
51
|
+
self.super_name = super_name
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class TableNotFoundError(SupertableLookupError):
|
|
55
|
+
"""Raised when a SimpleTable (``super.simple``) is referenced but no
|
|
56
|
+
Redis ``meta:leaf:doc:{simple}`` pointer exists for it.
|
|
57
|
+
|
|
58
|
+
Read-side code raises this instead of silently bootstrapping a new
|
|
59
|
+
empty table as a side effect of constructing the Python object.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
def __init__(self, organization: str, super_name: str, simple_name: str):
|
|
63
|
+
super().__init__(
|
|
64
|
+
f"Table not found: {organization}/{super_name}/{simple_name}",
|
|
65
|
+
organization=organization,
|
|
66
|
+
)
|
|
67
|
+
self.super_name = super_name
|
|
68
|
+
self.simple_name = simple_name
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
__all__ = [
|
|
72
|
+
"SupertableLookupError",
|
|
73
|
+
"SuperTableNotFoundError",
|
|
74
|
+
"TableNotFoundError",
|
|
75
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# route: supertable.gc
|
|
2
|
+
"""
|
|
3
|
+
Deferred-deletion garbage collection for SuperTable storage.
|
|
4
|
+
|
|
5
|
+
The data writer never deletes files inline. When the GC flags are enabled
|
|
6
|
+
(``SUPERTABLE_SUNSET_GC_ENABLED`` / ``SUPERTABLE_SNAPSHOT_RETENTION``) the
|
|
7
|
+
writer XADDs paths to a per-table Redis STREAM after a successful leaf-CAS.
|
|
8
|
+
A long-running ``GCCleaner`` daemon (one per org) drains entries older than
|
|
9
|
+
``SUPERTABLE_GC_DELAY_SEC`` and calls ``storage.delete()`` on each path.
|
|
10
|
+
|
|
11
|
+
The delay window is what makes deletion safe under concurrent reads: a
|
|
12
|
+
query that resolved the leaf payload right before the writer committed
|
|
13
|
+
finishes long before the cleaner touches the file.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from supertable.gc.queue import (
|
|
17
|
+
collect_old_snapshot_paths,
|
|
18
|
+
enqueue_deletions,
|
|
19
|
+
nuke_stream,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Names re-exported at package level from ``supertable.gc.queue``.
__all__ = ["collect_old_snapshot_paths", "enqueue_deletions", "nuke_stream"]
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# route: supertable.gc.cleaner
|
|
2
|
+
"""
|
|
3
|
+
GC orchestration primitives — the **library** surface.
|
|
4
|
+
|
|
5
|
+
This module exposes :class:`GCCleaner`, a pure orchestration object
|
|
6
|
+
whose only state-modifying operation is :meth:`GCCleaner.tick`. One
|
|
7
|
+
``tick()`` is one full pass over the org's per-table GC streams: drain
|
|
8
|
+
entries older than the configured delay, ``storage.delete()`` each
|
|
9
|
+
referenced path, ``XDEL`` the processed entries, return a stats dict.
|
|
10
|
+
|
|
11
|
+
The expected usage pattern is **caller-owned scheduling** — your
|
|
12
|
+
service decides when to call ``tick()``:
|
|
13
|
+
|
|
14
|
+
cleaner = GCCleaner(org="acme", catalog=catalog, storage=storage,
|
|
15
|
+
delay_sec=1800, batch_size=500)
|
|
16
|
+
while not service.shutdown_requested:
|
|
17
|
+
stats = cleaner.tick()
|
|
18
|
+
service.publish_metrics(stats)
|
|
19
|
+
service.sleep(60)
|
|
20
|
+
|
|
21
|
+
For deployments that don't already have a scheduler, the convenience
|
|
22
|
+
daemon in :mod:`supertable.gc.daemon` wraps this class with a
|
|
23
|
+
``run_forever`` loop and a CLI entrypoint. **Most deployments should
|
|
24
|
+
prefer calling ``tick()`` directly from their own scheduler** —
|
|
25
|
+
``run_forever`` exists only so that single-node installs
|
|
26
|
+
have a one-command setup path.
|
|
27
|
+
|
|
28
|
+
The delay window (``SUPERTABLE_GC_DELAY_SEC``) is the safety
|
|
29
|
+
guarantee: by the time the cleaner touches a file, any in-flight
|
|
30
|
+
reader that resolved the leaf payload before the writer committed has
|
|
31
|
+
long since finished. We never delete a file that's still reachable
|
|
32
|
+
from any reader's plan.
|
|
33
|
+
"""
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import logging
|
|
37
|
+
import threading
|
|
38
|
+
import time
|
|
39
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
40
|
+
|
|
41
|
+
from supertable import redis_keys as RK
|
|
42
|
+
from supertable.config.settings import settings
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _now_ms() -> int:
    """Return the current ``time.time()`` reading as whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1000)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _decode(v: Any) -> str:
    """Turn a redis-py value into text.

    ``bytes`` are decoded as UTF-8, with undecodable bytes replaced by
    U+FFFD instead of raising. Any non-``bytes`` value is returned
    unchanged.
    """
    if not isinstance(v, bytes):
        return v
    # For valid UTF-8 the "replace" handler yields exactly the strict
    # result, so a single lenient decode covers both cases.
    return v.decode("utf-8", errors="replace")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _decode_fields(fields: Dict[Any, Any]) -> Dict[str, str]:
    """Decode every key and value of a redis-py stream-entry field dict.

    A falsy ``fields`` (``None`` or an empty mapping) produces an empty
    dict.
    """
    decoded: Dict[str, str] = {}
    if fields:
        for raw_key, raw_value in fields.items():
            key = _decode(raw_key)
            decoded[key] = _decode(raw_value)
    return decoded
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class GCCleaner:
    """Orchestration primitive that cleans one organisation's GC streams.

    A single :meth:`tick` call walks every GC stream of the org once. The
    class itself runs no loop, starts no thread and installs no signal
    handler; scheduling, retries and shutdown belong to the caller.

    Deployments without an external scheduler can use the convenience
    wrapper in :mod:`supertable.gc.daemon`, which layers a ``run_forever``
    loop and a CLI entrypoint on top of this class.

    Parameters
    ----------
    org:
        Organisation name this cleaner is responsible for. Must be truthy.
    catalog:
        ``RedisCatalog`` (or any object exposing a redis-py client as ``.r``).
    storage:
        ``StorageInterface`` instance; its ``delete(path)`` removes files.
    delay_sec, sleep_sec, batch_size:
        Explicit overrides. When left as ``None`` the values come from
        ``settings.SUPERTABLE_GC_DELAY_SEC``, ``settings.SUPERTABLE_GC_SLEEP_SEC``
        and ``settings.SUPERTABLE_GC_BATCH_SIZE`` respectively. After integer
        conversion they are clamped to minimums of 0, 1 and 1. ``sleep_sec``
        is not read by :meth:`tick`; it is stored for the daemon wrapper.
    """

    def __init__(
        self,
        org: str,
        catalog: Any,
        storage: Any,
        *,
        delay_sec: Optional[int] = None,
        sleep_sec: Optional[int] = None,
        batch_size: Optional[int] = None,
    ) -> None:
        if not org:
            raise ValueError("GCCleaner: org is required")

        self.org = org
        self.catalog = catalog
        self.storage = storage

        # Resolve each knob: an explicit argument wins, otherwise settings.
        delay = int(settings.SUPERTABLE_GC_DELAY_SEC if delay_sec is None else delay_sec)
        pause = int(settings.SUPERTABLE_GC_SLEEP_SEC if sleep_sec is None else sleep_sec)
        batch = int(settings.SUPERTABLE_GC_BATCH_SIZE if batch_size is None else batch_size)

        # Clamp to sane minimums instead of rejecting bad values.
        self.delay_sec = max(delay, 0)
        self.sleep_sec = max(pause, 1)
        self.batch_size = max(batch, 1)

        # Checked by tick() before each stream and by _drain_stream() before
        # each entry, so a long pass can be interrupted via stop().
        self._stop = threading.Event()

    # ── Lifecycle ────────────────────────────────────────────────────

    def stop(self) -> None:
        """Ask a running (or future) :meth:`tick` to finish early.

        The flag is only ever set, never cleared, by this class. To keep
        cleaning after a stop, create a fresh ``GCCleaner``.
        """
        self._stop.set()

    # ── One pass ─────────────────────────────────────────────────────

    def tick(self) -> Dict[str, int]:
        """Run one pass over every discovered stream and return counters.

        The returned dict has the keys ``streams_processed``, ``deleted``,
        ``deleted_parquet``, ``deleted_snapshot`` and ``errors``. ``errors``
        counts streams whose drain raised; such streams are logged and
        skipped so the remaining streams are still processed.

        Only entries whose stream id is at or before
        ``<now_ms - delay_sec * 1000>-0`` are eligible in this pass.
        """
        totals: Dict[str, int] = dict.fromkeys(
            (
                "streams_processed",
                "deleted",
                "deleted_parquet",
                "deleted_snapshot",
                "errors",
            ),
            0,
        )

        threshold_ms = max(_now_ms() - self.delay_sec * 1000, 0)
        cutoff_id = f"{threshold_ms}-0"

        for stream_key in self._discover_streams():
            if self._stop.is_set():
                break
            try:
                processed, by_kind = self._drain_stream(stream_key, cutoff_id)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[gc-cleaner] org=%s stream=%s drain failed: %s",
                    self.org, stream_key, exc,
                )
                totals["errors"] += 1
            else:
                totals["streams_processed"] += 1
                totals["deleted"] += processed
                totals["deleted_parquet"] += by_kind.get("parquet", 0)
                totals["deleted_snapshot"] += by_kind.get("snapshot", 0)

        return totals

    # ── Internals ────────────────────────────────────────────────────

    def _discover_streams(self) -> Iterable[str]:
        """List the org's GC stream keys, decoded to ``str``.

        Returns an empty list when the catalog has no ``r`` client or when
        the key scan raises (the failure is logged, not propagated).
        """
        client = getattr(self.catalog, "r", None)
        if client is None:
            return []

        pattern = RK.gc_pending_pattern_for_org(self.org)
        try:
            keys: List[str] = []
            for raw_key in client.scan_iter(match=pattern, count=512):
                keys.append(_decode(raw_key))
            return keys
        except Exception as exc:  # noqa: BLE001
            logger.warning("[gc-cleaner] org=%s SCAN failed: %s", self.org, exc)
            return []

    def _drain_stream(
        self, stream_key: str, cutoff_id: str
    ) -> Tuple[int, Dict[str, int]]:
        """Process one batch of a stream: delete files, then XDEL entries.

        Reads at most ``batch_size`` entries with ids up to ``cutoff_id``.
        Returns ``(deleted_count, by_kind)`` where ``by_kind`` maps
        ``"parquet"`` / ``"snapshot"`` to the number of files deleted; an
        empty read returns ``(0, {})``. Exceptions from the ``xrange`` call
        propagate to the caller; ``storage.delete`` and ``xdel`` failures
        are logged and swallowed.
        """
        redis_client = self.catalog.r
        # Both bounds are inclusive: "-" is the lowest possible id and
        # cutoff_id is the smallest id within the cutoff millisecond.
        batch = redis_client.xrange(
            stream_key, min="-", max=cutoff_id, count=self.batch_size
        )
        if not batch:
            return 0, {}

        removed = 0
        per_kind: Dict[str, int] = {"parquet": 0, "snapshot": 0}
        finished_ids: List[bytes] = []

        for entry_id, raw_fields in batch:
            if self._stop.is_set():
                break

            fields = _decode_fields(raw_fields)
            kind = fields.get("kind", "")
            path = fields.get("path", "")

            well_formed = bool(path) and kind in ("parquet", "snapshot")
            if well_formed:
                try:
                    self.storage.delete(path)
                except FileNotFoundError:
                    # File is already gone; the entry is still retired below.
                    pass
                except Exception as exc:  # noqa: BLE001
                    logger.warning(
                        "[gc-cleaner] storage.delete failed for %s (kind=%s): %s",
                        path, kind, exc,
                    )
                    # Keep the entry on the stream so a later tick retries it.
                    continue
                else:
                    removed += 1
                    per_kind[kind] = per_kind.get(kind, 0) + 1

            # Reached for successful deletes, already-missing files and
            # malformed entries (the latter are dropped so they cannot
            # block progress).
            finished_ids.append(entry_id)

        if finished_ids:
            try:
                redis_client.xdel(stream_key, *finished_ids)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[gc-cleaner] XDEL failed on %s for %d entries: %s",
                    stream_key, len(finished_ids), exc,
                )

        return removed, per_kind
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Public surface of this module: only the cleaner class is exported.
__all__ = ["GCCleaner"]
|