supertable 2.3.2__tar.gz → 2.3.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. {supertable-2.3.2/supertable.egg-info → supertable-2.3.3}/PKG-INFO +1 -1
  2. {supertable-2.3.2 → supertable-2.3.3}/pyproject.toml +1 -1
  3. {supertable-2.3.2 → supertable-2.3.3}/setup.py +1 -1
  4. {supertable-2.3.2 → supertable-2.3.3}/supertable/__init__.py +1 -1
  5. {supertable-2.3.2 → supertable-2.3.3}/supertable/data_writer.py +97 -22
  6. {supertable-2.3.2 → supertable-2.3.3}/supertable/processing.py +27 -0
  7. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer.py +284 -0
  8. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing_compact_resources.py +93 -0
  9. {supertable-2.3.2 → supertable-2.3.3/supertable.egg-info}/PKG-INFO +1 -1
  10. {supertable-2.3.2 → supertable-2.3.3}/LICENSE +0 -0
  11. {supertable-2.3.2 → supertable-2.3.3}/README.md +0 -0
  12. {supertable-2.3.2 → supertable-2.3.3}/requirements.txt +0 -0
  13. {supertable-2.3.2 → supertable-2.3.3}/setup.cfg +0 -0
  14. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/__init__.py +0 -0
  15. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/admin.py +0 -0
  16. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/chain.py +0 -0
  17. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/consumers.py +0 -0
  18. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/crypto.py +0 -0
  19. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/events.py +0 -0
  20. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/export.py +0 -0
  21. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/logger.py +0 -0
  22. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/middleware.py +0 -0
  23. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/reader.py +0 -0
  24. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/retention.py +0 -0
  25. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/__init__.py +0 -0
  26. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_chain.py +0 -0
  27. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_crypto.py +0 -0
  28. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_emit.py +0 -0
  29. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_events.py +0 -0
  30. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/tests/test_retention.py +0 -0
  31. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/writer_parquet.py +0 -0
  32. {supertable-2.3.2 → supertable-2.3.3}/supertable/audit/writer_redis.py +0 -0
  33. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/__init__.py +0 -0
  34. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/defaults.py +0 -0
  35. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/homedir.py +0 -0
  36. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/settings.py +0 -0
  37. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/__init__.py +0 -0
  38. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/test_defaults.py +0 -0
  39. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/test_homedir.py +0 -0
  40. {supertable-2.3.2 → supertable-2.3.3}/supertable/config/tests/test_settings.py +0 -0
  41. {supertable-2.3.2 → supertable-2.3.3}/supertable/data_classes.py +0 -0
  42. {supertable-2.3.2 → supertable-2.3.3}/supertable/data_reader.py +0 -0
  43. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/__init__.py +0 -0
  44. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/__init__.py +0 -0
  45. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/__main__.py +0 -0
  46. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/check_filter_builder.py +0 -0
  47. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/controller.py +0 -0
  48. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
  49. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/defaults.py +0 -0
  50. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/dummy_data.py +0 -0
  51. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/read_parquet_header.py +0 -0
  52. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
  53. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
  54. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
  55. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
  56. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
  57. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
  58. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
  59. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
  60. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
  61. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
  62. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
  63. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
  64. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
  65. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
  66. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
  67. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
  68. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
  69. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
  70. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
  71. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
  72. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
  73. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
  74. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
  75. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
  76. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/__init__.py +0 -0
  77. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/core.py +0 -0
  78. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/defaults.py +0 -0
  79. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/generate.py +0 -0
  80. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/load.py +0 -0
  81. {supertable-2.3.2 → supertable-2.3.3}/supertable/demo/webshop/topup.py +0 -0
  82. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/__init__.py +0 -0
  83. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/data_estimator.py +0 -0
  84. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/duckdb_lite.py +0 -0
  85. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/duckdb_pro.py +0 -0
  86. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/engine_common.py +0 -0
  87. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/engine_config.py +0 -0
  88. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/engine_enum.py +0 -0
  89. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/executor.py +0 -0
  90. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/plan_stats.py +0 -0
  91. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/spark_thrift.py +0 -0
  92. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/__init__.py +0 -0
  93. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/conftest.py +0 -0
  94. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine.py +0 -0
  95. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine_config.py +0 -0
  96. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine_routing.py +0 -0
  97. {supertable-2.3.2 → supertable-2.3.3}/supertable/engine/tests/test_engine_spill.py +0 -0
  98. {supertable-2.3.2 → supertable-2.3.3}/supertable/errors.py +0 -0
  99. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/__init__.py +0 -0
  100. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/__init__.py +0 -0
  101. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
  102. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
  103. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
  104. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/file_lock.py +0 -0
  105. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/redis_lock.py +0 -0
  106. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/tests/__init__.py +0 -0
  107. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/tests/test_file_lock.py +0 -0
  108. {supertable-2.3.2 → supertable-2.3.3}/supertable/locking/tests/test_redis_lock.py +0 -0
  109. {supertable-2.3.2 → supertable-2.3.3}/supertable/logging.py +0 -0
  110. {supertable-2.3.2 → supertable-2.3.3}/supertable/meta_reader.py +0 -0
  111. {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/__init__.py +0 -0
  112. {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_delta.py +0 -0
  113. {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_formats.py +0 -0
  114. {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_iceberg.py +0 -0
  115. {supertable-2.3.2 → supertable-2.3.3}/supertable/mirroring/mirror_parquet.py +0 -0
  116. {supertable-2.3.2 → supertable-2.3.3}/supertable/monitoring/__init__.py +0 -0
  117. {supertable-2.3.2 → supertable-2.3.3}/supertable/monitoring/partitions.py +0 -0
  118. {supertable-2.3.2 → supertable-2.3.3}/supertable/monitoring_writer.py +0 -0
  119. {supertable-2.3.2 → supertable-2.3.3}/supertable/plan_extender.py +0 -0
  120. {supertable-2.3.2 → supertable-2.3.3}/supertable/query_plan_manager.py +0 -0
  121. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/__init__.py +0 -0
  122. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/access_control.py +0 -0
  123. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/filter_builder.py +0 -0
  124. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/permissions.py +0 -0
  125. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/role_manager.py +0 -0
  126. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/row_column_security.py +0 -0
  127. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/tests/test_filter_builder.py +0 -0
  128. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/tests/test_rbac.py +0 -0
  129. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
  130. {supertable-2.3.2 → supertable-2.3.3}/supertable/rbac/user_manager.py +0 -0
  131. {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_catalog.py +0 -0
  132. {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_connector.py +0 -0
  133. {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_infra.py +0 -0
  134. {supertable-2.3.2 → supertable-2.3.3}/supertable/redis_keys.py +0 -0
  135. {supertable-2.3.2 → supertable-2.3.3}/supertable/simple_table.py +0 -0
  136. {supertable-2.3.2 → supertable-2.3.3}/supertable/staging_area.py +0 -0
  137. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/__init__.py +0 -0
  138. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/azure_storage.py +0 -0
  139. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/gcp_storage.py +0 -0
  140. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/local_storage.py +0 -0
  141. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/minio_storage.py +0 -0
  142. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/s3_storage.py +0 -0
  143. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/storage_factory.py +0 -0
  144. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/storage_interface.py +0 -0
  145. {supertable-2.3.2 → supertable-2.3.3}/supertable/storage/tests/test_storage.py +0 -0
  146. {supertable-2.3.2 → supertable-2.3.3}/supertable/super_pipe.py +0 -0
  147. {supertable-2.3.2 → supertable-2.3.3}/supertable/super_table.py +0 -0
  148. {supertable-2.3.2 → supertable-2.3.3}/supertable/system_query.py +0 -0
  149. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/__init__.py +0 -0
  150. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_align_to_schema_fix.py +0 -0
  151. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_create_if_missing.py +0 -0
  152. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_reader.py +0 -0
  153. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_reader_preflight.py +0 -0
  154. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer_compact.py +0 -0
  155. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_data_writer_comprehensive.py +0 -0
  156. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_errors.py +0 -0
  157. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_meta_reader.py +0 -0
  158. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_monitoring_partitions.py +0 -0
  159. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_monitoring_sink_guard.py +0 -0
  160. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_newer_than.py +0 -0
  161. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_parquet_statistics.py +0 -0
  162. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing.py +0 -0
  163. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_processing_stats.py +0 -0
  164. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_query_sql.py +0 -0
  165. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_read_pruning_differential.py +0 -0
  166. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_read_pruning_integration.py +0 -0
  167. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_redis_key_prefix.py +0 -0
  168. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_resolve_overwrite_writes.py +0 -0
  169. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_simple_table.py +0 -0
  170. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_stats_cache.py +0 -0
  171. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_stats_pruning.py +0 -0
  172. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_stats_schema_snapshot.py +0 -0
  173. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_super_table.py +0 -0
  174. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_supertable_all.py +0 -0
  175. {supertable-2.3.2 → supertable-2.3.3}/supertable/tests/test_system_query.py +0 -0
  176. {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/__init__.py +0 -0
  177. {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/helper.py +0 -0
  178. {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/profiler.py +0 -0
  179. {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/sql_parser.py +0 -0
  180. {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
  181. {supertable-2.3.2 → supertable-2.3.3}/supertable/utils/timer.py +0 -0
  182. {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/SOURCES.txt +0 -0
  183. {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/dependency_links.txt +0 -0
  184. {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/entry_points.txt +0 -0
  185. {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/requires.txt +0 -0
  186. {supertable-2.3.2 → supertable-2.3.3}/supertable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.2
3
+ Version: 2.3.3
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "supertable"
7
- version = "2.3.2"
7
+ version = "2.3.3"
8
8
  description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
19
19
 
20
20
  setup(
21
21
  name="supertable",
22
- version="2.3.2",
22
+ version="2.3.3",
23
23
  description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
24
24
  long_description=long_description,
25
25
  long_description_content_type="text/markdown",
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
25
25
  project documentation for the full API surface.
26
26
  """
27
27
 
28
- __version__ = "2.3.2"
28
+ __version__ = "2.3.3"
29
29
 
30
30
  # Re-export the core public surface so users can do ``from supertable import …``
31
31
  # instead of remembering submodule paths.
@@ -35,6 +35,7 @@ from supertable.processing import (
35
35
  write_parquet_and_collect_resources,
36
36
  compact_resources,
37
37
  compact_tombstones,
38
+ should_compact_small_files,
38
39
  _max_tombstone_rows,
39
40
  _read_parquet_safe,
40
41
  )
@@ -527,28 +528,52 @@ class DataWriter:
527
528
  )
528
529
  mark("build_tombstone")
529
530
 
530
- # 4. Threshold compaction: physically drop dead rows once the
531
- # deletion-vector grows past max_tombstone_rows, then clear it.
532
- if (
531
+ # 4. Threshold compaction (two triggers, same physical step):
532
+ # (a) the deletion-vector grew past max_tombstone_rows, or
533
+ # (b) the small files tripped the auto-compaction gate.
534
+ # Both must FIRST physically drop tombstoned rows (Phase A)
535
+ # and only THEN merge small files (Phase B): compact_resources
536
+ # rewrites data files WITHOUT consulting the deletion-vector,
537
+ # so sunsetting a vector-referenced file would orphan its dead
538
+ # rows (hidden on read, never reclaimable). Draining first
539
+ # guarantees Phase B only ever sees vector-free survivors.
540
+ post_write_resources = (
541
+ (last_simple_table.get("resources") or []) + new_resources
542
+ )
543
+ compaction_gate = should_compact_small_files(
544
+ post_write_resources, table_config
545
+ )
546
+ tombstone_threshold_hit = (
533
547
  combined_tombstone_df is not None
534
548
  and combined_tombstone_df.height >= _max_tombstone_rows(table_config)
535
- ):
536
- removed, compact_new, compact_sunset = compact_tombstones(
537
- snapshot=last_simple_table,
538
- tombstone_df=combined_tombstone_df,
539
- data_dir=simple_table.data_dir,
540
- compression_level=compression_level,
541
- table_config=table_config,
542
- profiler=profiler,
543
- )
544
- new_resources.extend(compact_new)
545
- sunset_files |= compact_sunset
546
- tombstone_path = None # deletion-vector fully consumed
547
- tombstone_rows = 0
548
- logger.info(lp(
549
- f"tombstone compaction removed {removed} rows "
550
- f"from {len(compact_sunset)} files"
551
- ))
549
+ )
550
+
551
+ # Phase A — drain the deletion-vector when either trigger fires
552
+ # and a vector is actually live (freshly built this write OR
553
+ # carried forward from a prior one).
554
+ if tombstone_threshold_hit or compaction_gate:
555
+ dv_to_drain = combined_tombstone_df
556
+ if dv_to_drain is None and tombstone_path:
557
+ # Pure carry-forward: load the live vector so the merge
558
+ # below never sunsets a file it still references.
559
+ dv_to_drain = _read_parquet_safe(tombstone_path, profiler=profiler)
560
+ if dv_to_drain is not None and dv_to_drain.height > 0:
561
+ removed, tomb_new, tomb_sunset = compact_tombstones(
562
+ snapshot=last_simple_table,
563
+ tombstone_df=dv_to_drain,
564
+ data_dir=simple_table.data_dir,
565
+ compression_level=compression_level,
566
+ table_config=table_config,
567
+ profiler=profiler,
568
+ )
569
+ new_resources.extend(tomb_new)
570
+ sunset_files |= tomb_sunset
571
+ tombstone_path = None # deletion-vector fully consumed
572
+ tombstone_rows = 0
573
+ logger.info(lp(
574
+ f"tombstone compaction removed {removed} rows "
575
+ f"from {len(tomb_sunset)} files"
576
+ ))
552
577
 
553
578
  # 5. Pin the (carried-forward / new / cleared) tombstone pointer
554
579
  # and its row count.
@@ -556,6 +581,45 @@ class DataWriter:
556
581
  last_simple_table["tombstone_rows"] = tombstone_rows
557
582
  mark("compact_tombstones")
558
583
 
584
+ # Phase B — auto small-file compaction. Merge the accumulated
585
+ # small files (existing survivors + the file just written) once
586
+ # the gate is open so the file count stays bounded. The vector
587
+ # was drained above, so every surviving file is safe to sunset.
588
+ # Result folds into the SAME snapshot commit below (new_resources
589
+ # / sunset_files feed build_stats and simple_table.update).
590
+ compaction_ran = False
591
+ if compaction_gate:
592
+ live_resources = [
593
+ r for r in (last_simple_table.get("resources") or [])
594
+ if r.get("file") not in sunset_files
595
+ ]
596
+ live_resources += [
597
+ r for r in new_resources if r.get("file") not in sunset_files
598
+ ]
599
+ considered, comp_rows, comp_new, comp_sunset = compact_resources(
600
+ snapshot={"resources": live_resources},
601
+ data_dir=simple_table.data_dir,
602
+ compression_level=compression_level,
603
+ table_config=table_config,
604
+ small_only=True,
605
+ )
606
+ if comp_new or comp_sunset:
607
+ sunset_files |= comp_sunset
608
+ # A file written above (incoming or tombstone survivor)
609
+ # may have been re-merged here; drop any new_resources
610
+ # entry that is now sunset so the snapshot never lists a
611
+ # file as both live and gone.
612
+ new_resources = [
613
+ r for r in (new_resources + comp_new)
614
+ if r.get("file") not in sunset_files
615
+ ]
616
+ compaction_ran = True
617
+ logger.info(lp(
618
+ f"auto-compaction merged {considered} small files "
619
+ f"into {len(comp_new)} file(s) ({comp_rows} rows)"
620
+ ))
621
+ mark("compact_small")
622
+
559
623
  # 6. Carry forward + extend the external column-statistics parquet.
560
624
  # Read the footers of the newly written data files, drop the
561
625
  # rows of any sunset file, and append the new ones. No new
@@ -614,7 +678,18 @@ class DataWriter:
614
678
  # model_df would shrink schema / schemaString to that partial
615
679
  # shape even though all parquet files still have full schema.
616
680
  # See docs/03_data_model.md "Schema Field Semantics".
617
- schema_model_df = None if delete_only else dataframe
681
+ #
682
+ # When auto-compaction merged files this write, derive the
683
+ # schema from the compacted output instead: a merged file may
684
+ # union in columns from older files that the incoming frame
685
+ # lacks (schema-evolving tables), so `dataframe` would narrow
686
+ # the metadata even though the Parquet is wider.
687
+ if compaction_ran:
688
+ schema_model_df = self._build_compact_model_df(
689
+ new_resources, last_simple_table
690
+ )
691
+ else:
692
+ schema_model_df = None if delete_only else dataframe
618
693
  new_snapshot_dict, new_snapshot_path = simple_table.update(
619
694
  new_resources, sunset_files, schema_model_df,
620
695
  last_snapshot=last_simple_table,
@@ -720,7 +795,7 @@ class DataWriter:
720
795
  f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
721
796
  f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
722
797
  f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
723
- f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
798
+ f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
724
799
  f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
725
800
  f"mirror={timings.get('mirror', 0):.3f} | prepare_monitor={timings.get('prepare_monitor', 0):.3f}"
726
801
  )
@@ -292,6 +292,33 @@ def prune_not_overlapping_files_by_threshold(
292
292
  return result
293
293
 
294
294
 
295
+ def should_compact_small_files(
296
+ resources: List[Dict],
297
+ table_config: Optional[dict] = None,
298
+ ) -> bool:
299
+ """Return True when accumulated small files trip the auto-compaction gate.
300
+
301
+ Mirrors the threshold in ``prune_not_overlapping_files_by_threshold``: a
302
+ file is "small" when its ``file_size`` is strictly smaller than
303
+ ``max_memory_chunk_size``. The gate opens when EITHER the small-file count
304
+ reaches ``max_overlapping_files`` OR the combined small-file size exceeds
305
+ ``max_memory_chunk_size``. Files already at/above the chunk size are big
306
+ enough on their own and are never counted.
307
+
308
+ ``resources`` is a snapshot's resource list (dicts with ``file`` /
309
+ ``file_size``). Limits resolve per-table via ``_resolve_limits``.
310
+ """
311
+ max_mem, max_files = _resolve_limits(table_config)
312
+ small_sizes = [
313
+ int(r.get("file_size") or 0)
314
+ for r in (resources or [])
315
+ if r.get("file") and int(r.get("file_size") or 0) < max_mem
316
+ ]
317
+ if not small_sizes:
318
+ return False
319
+ return len(small_sizes) >= max_files or sum(small_sizes) > max_mem
320
+
321
+
295
322
  # =========================
296
323
  # Public API: Overlap selection (with compaction triggers)
297
324
  # =========================
@@ -48,6 +48,16 @@ _PATCH_BUILD_TOMBSTONE = f"{_MOD}.build_tombstone_file"
48
48
  _PATCH_MIRROR = f"{_MOD}.MirrorFormats"
49
49
  _PATCH_GET_MON_LOGGER = f"{_MOD}.MonitoringWriter"
50
50
  _PATCH_UUID4 = f"{_MOD}.uuid.uuid4"
51
+ # Auto-compaction step (Phase A drain + Phase B small-file merge) wired into
52
+ # write(). The gate (should_compact_small_files) is left UNMOCKED so tests
53
+ # drive the REAL threshold off the snapshot's resource list; the heavy merge
54
+ # helpers and the stats writers (which would otherwise touch storage once files
55
+ # are sunset) are mocked so the tests pin orchestration, not Parquet I/O.
56
+ _PATCH_COMPACT_RES = f"{_MOD}.compact_resources"
57
+ _PATCH_COMPACT_TOMB = f"{_MOD}.compact_tombstones"
58
+ _PATCH_READ_PARQUET = f"{_MOD}._read_parquet_safe"
59
+ _PATCH_EXTRACT_STATS = f"{_MOD}.extract_stats_rows"
60
+ _PATCH_BUILD_STATS = f"{_MOD}.build_stats_file"
51
61
 
52
62
 
53
63
  # ---------------------------------------------------------------------------
@@ -1814,3 +1824,277 @@ class TestWriteOverwriteResolution:
1814
1824
  assert kwargs["newer_than_col"] == "ts"
1815
1825
  # The single returned delete pair drives the deleted count.
1816
1826
  assert result[3] == 1
1827
+
1828
+
1829
+ # ====================================================================
1830
+ # 12. DataWriter.write — Inline Auto-Compaction (small-file gate)
1831
+ # ====================================================================
1832
+
1833
+ def _small_resources(n: int, *, size: int = 80 * 1024) -> List[Dict]:
1834
+ """N small-file resource dicts that trip should_compact_small_files'
1835
+ REAL count gate (default MAX_OVERLAPPING_FILES=100) once n >= 100.
1836
+
1837
+ Only ``file`` / ``file_size`` matter — the gate ignores everything else,
1838
+ and the merge helper is mocked so the files are never opened."""
1839
+ return [
1840
+ {"file": f"small_{i}.parquet", "file_size": size, "rows": 100}
1841
+ for i in range(n)
1842
+ ]
1843
+
1844
+
1845
+ def _mk_compaction_catalog():
1846
+ cat = MagicMock()
1847
+ cat.reserve_rowids.return_value = 0
1848
+ cat.get_table_config.return_value = None # → default limits (100 / 16MB)
1849
+ cat.acquire_simple_lock.return_value = "t"
1850
+ cat.release_simple_lock.return_value = True
1851
+ cat.set_leaf_payload_cas.return_value = 1
1852
+ cat.bump_root.return_value = 1
1853
+ return cat
1854
+
1855
+
1856
+ class TestWriteAutoCompaction:
1857
+ """The user's bug: small files accumulated forever because automatic
1858
+ compaction was never wired into write() — only the manual compact()
1859
+ entry point merged them. These tests pin the inline step: the gate is
1860
+ checked on every write, draining the deletion-vector FIRST (Phase A)
1861
+ so the small-file merge (Phase B) can never sunset a vector-referenced
1862
+ file, and the merged output folds into the SAME snapshot commit."""
1863
+
1864
+ @patch(_PATCH_COMPACT_RES)
1865
+ @patch(_PATCH_COMPACT_TOMB)
1866
+ @patch(_PATCH_READ_PARQUET)
1867
+ @patch(_PATCH_BUILD_STATS)
1868
+ @patch(_PATCH_EXTRACT_STATS)
1869
+ @patch(_PATCH_BUILD_TOMBSTONE)
1870
+ @patch(_PATCH_GET_MON_LOGGER)
1871
+ @patch(_PATCH_MIRROR)
1872
+ @patch(_PATCH_PROCESS_OVERLAP)
1873
+ @patch(_PATCH_RESOLVE)
1874
+ @patch(_PATCH_FIND_OVERLAP)
1875
+ @patch(_PATCH_SIMPLE_TABLE)
1876
+ @patch(_PATCH_CHECK_WRITE)
1877
+ @patch(_PATCH_POLARS_FROM_ARROW)
1878
+ @patch(_PATCH_REDIS_CATALOG)
1879
+ @patch(_PATCH_SUPER_TABLE)
1880
+ def test_gate_trips_append_merges_and_folds_into_snapshot(
1881
+ self,
1882
+ MockST, MockCat, mock_from_arrow, mock_check_write,
1883
+ MockSimple, mock_find_overlap, mock_resolve, mock_process,
1884
+ MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
1885
+ mock_build_stats, mock_read_parquet, mock_compact_tomb,
1886
+ mock_compact_res,
1887
+ ):
1888
+ """100 accumulated small files → REAL gate trips → compact_resources
1889
+ runs once and its merged output / sunset set fold into the single
1890
+ simple_table.update() commit. No deletes ⇒ tombstone drain is a
1891
+ no-op (nothing to orphan)."""
1892
+ mock_st = MagicMock(super_name="s", organization="o")
1893
+ MockST.return_value = mock_st
1894
+ MockCat.return_value = _mk_compaction_catalog()
1895
+
1896
+ df = _polars_df({"id": [1], "ts": [100]})
1897
+ mock_from_arrow.return_value = df
1898
+
1899
+ snap = {"resources": _small_resources(100)}
1900
+ mock_simple = MagicMock(data_dir="/d")
1901
+ mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
1902
+ mock_simple.update.return_value = ({}, "/np")
1903
+ MockSimple.return_value = mock_simple
1904
+ mock_find_overlap.return_value = set()
1905
+
1906
+ # Pure append: rows survive, no delete pairs, no carried-forward vector.
1907
+ mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
1908
+ mock_build_tomb.return_value = (None, None)
1909
+ # The just-written file lands in new_resources (the established pattern).
1910
+ mock_process.side_effect = lambda **kw: kw["new_resources"].append(
1911
+ {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
1912
+ )
1913
+ mock_extract_stats.return_value = MagicMock()
1914
+ mock_build_stats.return_value = (None, None)
1915
+ mock_get_mon.return_value = MagicMock()
1916
+
1917
+ # compact_resources merges EVERY live file into one and reports them
1918
+ # all as sunset (computed from the snapshot it actually received).
1919
+ def _merge(**kw):
1920
+ live = kw["snapshot"]["resources"]
1921
+ sunset = {r["file"] for r in live}
1922
+ return (len(live), 10_100, [{"file": "merged.parquet",
1923
+ "file_size": 8_000_000,
1924
+ "rows": 10_100}], sunset)
1925
+ mock_compact_res.side_effect = _merge
1926
+
1927
+ from supertable.data_writer import DataWriter
1928
+ dw = DataWriter("s", "o")
1929
+ result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
1930
+
1931
+ assert result is not None
1932
+ # Gate tripped → merge ran exactly once; append had no vector to drain.
1933
+ mock_compact_res.assert_called_once()
1934
+ mock_compact_tomb.assert_not_called()
1935
+ # The merge saw the 100 existing files plus the one just written.
1936
+ merged_snapshot = mock_compact_res.call_args.kwargs["snapshot"]
1937
+ assert len(merged_snapshot["resources"]) == 101
1938
+
1939
+ # Folded into the SAME commit: update() lists the merged file as the
1940
+ # sole survivor and every consumed file as sunset.
1941
+ new_resources_arg = mock_simple.update.call_args[0][0]
1942
+ sunset_arg = mock_simple.update.call_args[0][1]
1943
+ assert [r["file"] for r in new_resources_arg] == ["merged.parquet"]
1944
+ assert "new.parquet" in sunset_arg
1945
+ assert "small_0.parquet" in sunset_arg
1946
+ assert "merged.parquet" not in sunset_arg
1947
+
1948
+ @patch(_PATCH_COMPACT_RES)
1949
+ @patch(_PATCH_COMPACT_TOMB)
1950
+ @patch(_PATCH_READ_PARQUET)
1951
+ @patch(_PATCH_BUILD_STATS)
1952
+ @patch(_PATCH_EXTRACT_STATS)
1953
+ @patch(_PATCH_BUILD_TOMBSTONE)
1954
+ @patch(_PATCH_GET_MON_LOGGER)
1955
+ @patch(_PATCH_MIRROR)
1956
+ @patch(_PATCH_PROCESS_OVERLAP)
1957
+ @patch(_PATCH_RESOLVE)
1958
+ @patch(_PATCH_FIND_OVERLAP)
1959
+ @patch(_PATCH_SIMPLE_TABLE)
1960
+ @patch(_PATCH_CHECK_WRITE)
1961
+ @patch(_PATCH_POLARS_FROM_ARROW)
1962
+ @patch(_PATCH_REDIS_CATALOG)
1963
+ @patch(_PATCH_SUPER_TABLE)
1964
+ def test_below_threshold_does_not_compact(
1965
+ self,
1966
+ MockST, MockCat, mock_from_arrow, mock_check_write,
1967
+ MockSimple, mock_find_overlap, mock_resolve, mock_process,
1968
+ MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
1969
+ mock_build_stats, mock_read_parquet, mock_compact_tomb,
1970
+ mock_compact_res,
1971
+ ):
1972
+ """A handful of small files stays under both the count and size
1973
+ triggers, so the write commits without invoking either compaction
1974
+ helper — auto-compaction must not run on every write, only when the
1975
+ gate is open."""
1976
+ mock_st = MagicMock(super_name="s", organization="o")
1977
+ MockST.return_value = mock_st
1978
+ MockCat.return_value = _mk_compaction_catalog()
1979
+
1980
+ df = _polars_df({"id": [1], "ts": [100]})
1981
+ mock_from_arrow.return_value = df
1982
+
1983
+ snap = {"resources": _small_resources(5)}
1984
+ mock_simple = MagicMock(data_dir="/d")
1985
+ mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
1986
+ mock_simple.update.return_value = ({}, "/np")
1987
+ MockSimple.return_value = mock_simple
1988
+ mock_find_overlap.return_value = set()
1989
+
1990
+ mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
1991
+ mock_build_tomb.return_value = (None, None)
1992
+ mock_process.side_effect = lambda **kw: kw["new_resources"].append(
1993
+ {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
1994
+ )
1995
+ mock_extract_stats.return_value = MagicMock()
1996
+ mock_build_stats.return_value = (None, None)
1997
+ mock_get_mon.return_value = MagicMock()
1998
+
1999
+ from supertable.data_writer import DataWriter
2000
+ dw = DataWriter("s", "o")
2001
+ result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
2002
+
2003
+ assert result is not None
2004
+ mock_compact_res.assert_not_called()
2005
+ mock_compact_tomb.assert_not_called()
2006
+ # Write still committed the freshly written file untouched.
2007
+ new_resources_arg = mock_simple.update.call_args[0][0]
2008
+ assert [r["file"] for r in new_resources_arg] == ["new.parquet"]
2009
+ assert mock_simple.update.call_args[0][1] == set()
2010
+
2011
+ @patch(_PATCH_COMPACT_RES)
2012
+ @patch(_PATCH_COMPACT_TOMB)
2013
+ @patch(_PATCH_READ_PARQUET)
2014
+ @patch(_PATCH_BUILD_STATS)
2015
+ @patch(_PATCH_EXTRACT_STATS)
2016
+ @patch(_PATCH_BUILD_TOMBSTONE)
2017
+ @patch(_PATCH_GET_MON_LOGGER)
2018
+ @patch(_PATCH_MIRROR)
2019
+ @patch(_PATCH_PROCESS_OVERLAP)
2020
+ @patch(_PATCH_RESOLVE)
2021
+ @patch(_PATCH_FIND_OVERLAP)
2022
+ @patch(_PATCH_SIMPLE_TABLE)
2023
+ @patch(_PATCH_CHECK_WRITE)
2024
+ @patch(_PATCH_POLARS_FROM_ARROW)
2025
+ @patch(_PATCH_REDIS_CATALOG)
2026
+ @patch(_PATCH_SUPER_TABLE)
2027
+ def test_carried_forward_vector_drains_before_merge(
2028
+ self,
2029
+ MockST, MockCat, mock_from_arrow, mock_check_write,
2030
+ MockSimple, mock_find_overlap, mock_resolve, mock_process,
2031
+ MockMirror, mock_get_mon, mock_build_tomb, mock_extract_stats,
2032
+ mock_build_stats, mock_read_parquet, mock_compact_tomb,
2033
+ mock_compact_res,
2034
+ ):
2035
+ """The ordering invariant: when a live deletion-vector is carried
2036
+ forward (build_tombstone_file returns a path but no fresh frame) and
2037
+ the gate trips, Phase A must LOAD and drain that vector
2038
+ (compact_tombstones) BEFORE Phase B merges small files (compact_resources).
2039
+ Merging first could sunset a file the vector still references and
2040
+ permanently orphan its dead rows."""
2041
+ mock_st = MagicMock(super_name="s", organization="o")
2042
+ MockST.return_value = mock_st
2043
+ MockCat.return_value = _mk_compaction_catalog()
2044
+
2045
+ df = _polars_df({"id": [1], "ts": [100]})
2046
+ mock_from_arrow.return_value = df
2047
+
2048
+ snap = {
2049
+ "resources": _small_resources(100),
2050
+ "tombstone": "/d/tombstone/dv.parquet",
2051
+ "tombstone_rows": 50,
2052
+ }
2053
+ mock_simple = MagicMock(data_dir="/d")
2054
+ mock_simple.get_simple_table_snapshot.return_value = (snap, "/p")
2055
+ mock_simple.update.return_value = ({}, "/np")
2056
+ MockSimple.return_value = mock_simple
2057
+ mock_find_overlap.return_value = set()
2058
+
2059
+ mock_resolve.side_effect = lambda **kw: (kw["incoming_df"], [])
2060
+ # Carry-forward: pointer reused, no fresh combined frame this write.
2061
+ mock_build_tomb.return_value = ("/d/tombstone/dv.parquet", None)
2062
+ mock_process.side_effect = lambda **kw: kw["new_resources"].append(
2063
+ {"file": "new.parquet", "file_size": 80 * 1024, "rows": 1}
2064
+ )
2065
+ # Phase A loads the live vector off its pointer to drain it.
2066
+ mock_read_parquet.return_value = _polars_df(
2067
+ {"__rowid__": list(range(50))}
2068
+ )
2069
+ mock_extract_stats.return_value = MagicMock()
2070
+ mock_build_stats.return_value = (None, None)
2071
+ mock_get_mon.return_value = MagicMock()
2072
+
2073
+ order: List[str] = []
2074
+
2075
+ def _drain(**kw):
2076
+ order.append("tomb")
2077
+ return (50, [{"file": "survivor.parquet",
2078
+ "file_size": 70 * 1024, "rows": 50}],
2079
+ {"small_0.parquet"})
2080
+ mock_compact_tomb.side_effect = _drain
2081
+
2082
+ def _merge(**kw):
2083
+ order.append("res")
2084
+ return (10, 5_000, [{"file": "merged.parquet",
2085
+ "file_size": 4_000_000, "rows": 5_000}],
2086
+ set())
2087
+ mock_compact_res.side_effect = _merge
2088
+
2089
+ from supertable.data_writer import DataWriter
2090
+ dw = DataWriter("s", "o")
2091
+ result = dw.write("admin", "tbl", _arrow_table({"id": [1], "ts": [100]}), ["id"])
2092
+
2093
+ assert result is not None
2094
+ # The carried-forward vector was read off its pointer to drain it.
2095
+ mock_read_parquet.assert_called_once()
2096
+ assert mock_read_parquet.call_args[0][0] == "/d/tombstone/dv.parquet"
2097
+ # Both phases ran, drain strictly before merge.
2098
+ mock_compact_tomb.assert_called_once()
2099
+ mock_compact_res.assert_called_once()
2100
+ assert order == ["tomb", "res"]
@@ -573,3 +573,96 @@ class TestRaceTolerance:
573
573
  # Output has only the present file's rows
574
574
  after = _read_all(patched_storage, [r["file"] for r in new_res])
575
575
  assert _multiset(after) == _multiset(df)
576
+
577
+
578
+ # ---------------------------------------------------------------------------
579
+ # should_compact_small_files — the auto-compaction gate predicate
580
+ # ---------------------------------------------------------------------------
581
+ #
582
+ # Pure function (no I/O): decides whether write() should trigger an inline
583
+ # small-file merge. This is the gate the user's bug report was about —
584
+ # 170×80KB files never compacted because nothing was *checking* it on write.
585
+ # Two independent triggers, both measured over files SMALLER than the chunk
586
+ # size (large files are already "done" and must never force a merge):
587
+ # (a) the small-file COUNT reaches max_overlapping_files, or
588
+ # (b) the small-file total BYTES exceed max_memory_chunk_size.
589
+
590
+
591
+ def _res(file: str | None, size: int) -> dict:
592
+ return {"file": file, "file_size": size, "rows": 1}
593
+
594
+
595
+ class TestShouldCompactSmallFiles:
596
+
597
+ def _limits(self):
598
+ from supertable.processing import _resolve_limits
599
+ return _resolve_limits(None) # global defaults
600
+
601
+ def test_empty_resources_never_compacts(self):
602
+ from supertable.processing import should_compact_small_files
603
+ assert should_compact_small_files([]) is False
604
+ assert should_compact_small_files(None) is False
605
+
606
+ def test_count_trigger_at_threshold(self):
607
+ from supertable.processing import should_compact_small_files
608
+ max_mem, max_files = self._limits()
609
+ # Each file small enough that the BYTE trigger stays dormant, so this
610
+ # isolates the COUNT trigger: sum = max_files*s <= max_mem/2 < max_mem.
611
+ s = max_mem // (max_files * 2)
612
+ at = [_res(f"f{i}.parquet", s) for i in range(max_files)]
613
+ below = at[:-1]
614
+ assert should_compact_small_files(below) is False # max_files-1
615
+ assert should_compact_small_files(at) is True # == max_files
616
+
617
+ def test_size_trigger_below_count(self):
618
+ from supertable.processing import should_compact_small_files
619
+ max_mem, max_files = self._limits()
620
+ # 5 files, each a quarter-chunk (< chunk, so "small") → 1.25 chunks
621
+ # total: the BYTE trigger fires even though count is far below max_files.
622
+ s = max_mem // 4
623
+ res = [_res(f"f{i}.parquet", s) for i in range(5)]
624
+ assert len(res) < max_files
625
+ assert should_compact_small_files(res) is True
626
+
627
+ def test_size_trigger_is_strict_greater_than(self):
628
+ from supertable.processing import should_compact_small_files
629
+ max_mem, _ = self._limits()
630
+ # Exactly == max_mem must NOT trip (boundary): two half-chunk files.
631
+ res = [_res("a.parquet", max_mem // 2), _res("b.parquet", max_mem // 2)]
632
+ assert sum(r["file_size"] for r in res) == max_mem
633
+ assert should_compact_small_files(res) is False
634
+
635
+ def test_large_files_are_ignored(self):
636
+ from supertable.processing import should_compact_small_files
637
+ max_mem, max_files = self._limits()
638
+ # Files >= chunk size are "already compacted": even max_files+50 of
639
+ # them must NOT trigger a merge (they are not small).
640
+ big = [_res(f"b{i}.parquet", max_mem) for i in range(max_files + 50)]
641
+ assert should_compact_small_files(big) is False
642
+ # A handful of small files mixed in stays below both triggers.
643
+ mixed = big + [_res(f"s{i}.parquet", 80 * 1024) for i in range(5)]
644
+ assert should_compact_small_files(mixed) is False
645
+
646
+ def test_per_table_config_overrides_global_count(self):
647
+ from supertable.processing import should_compact_small_files
648
+ cfg = {"max_overlapping_files": 10}
649
+ small = [_res(f"f{i}.parquet", 80 * 1024) for i in range(10)]
650
+ assert should_compact_small_files(small, cfg) is True
651
+ assert should_compact_small_files(small[:-1], cfg) is False
652
+
653
+ def test_resource_without_file_key_is_skipped(self):
654
+ from supertable.processing import should_compact_small_files
655
+ _, max_files = self._limits()
656
+ # Entries lacking a ``file`` path are not real files → ignored, even
657
+ # at max_files of them (guards against directory/placeholder rows).
658
+ phantom = [_res(None, 80 * 1024) for _ in range(max_files)]
659
+ assert should_compact_small_files(phantom) is False
660
+
661
+ def test_missing_file_size_does_not_crash(self):
662
+ from supertable.processing import should_compact_small_files
663
+ # ``file_size`` absent/None coerces to 0 (counts toward the COUNT
664
+ # trigger but contributes no bytes) — must never raise.
665
+ _, max_files = self._limits()
666
+ no_size = [{"file": f"f{i}.parquet"} for i in range(max_files)]
667
+ assert should_compact_small_files(no_size) is True # count trigger
668
+ assert should_compact_small_files(no_size[:1]) is False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.2
3
+ Version: 2.3.3
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
File without changes
File without changes
File without changes
File without changes