supertable 2.3.5__tar.gz → 2.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. {supertable-2.3.5/supertable.egg-info → supertable-2.3.6}/PKG-INFO +1 -1
  2. {supertable-2.3.5 → supertable-2.3.6}/pyproject.toml +1 -1
  3. {supertable-2.3.5 → supertable-2.3.6}/setup.py +1 -1
  4. {supertable-2.3.5 → supertable-2.3.6}/supertable/__init__.py +1 -1
  5. {supertable-2.3.5 → supertable-2.3.6}/supertable/data_writer.py +121 -24
  6. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/engine_common.py +56 -2
  7. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/tests/conftest.py +15 -0
  8. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/tests/test_engine.py +32 -0
  9. {supertable-2.3.5 → supertable-2.3.6}/supertable/processing.py +92 -22
  10. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/local_storage.py +12 -1
  11. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_data_writer_compact.py +36 -5
  12. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_processing_stats.py +60 -0
  13. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_resolve_overwrite_writes.py +6 -5
  14. {supertable-2.3.5 → supertable-2.3.6/supertable.egg-info}/PKG-INFO +1 -1
  15. {supertable-2.3.5 → supertable-2.3.6}/LICENSE +0 -0
  16. {supertable-2.3.5 → supertable-2.3.6}/README.md +0 -0
  17. {supertable-2.3.5 → supertable-2.3.6}/requirements.txt +0 -0
  18. {supertable-2.3.5 → supertable-2.3.6}/setup.cfg +0 -0
  19. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/__init__.py +0 -0
  20. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/admin.py +0 -0
  21. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/chain.py +0 -0
  22. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/consumers.py +0 -0
  23. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/crypto.py +0 -0
  24. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/events.py +0 -0
  25. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/export.py +0 -0
  26. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/logger.py +0 -0
  27. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/middleware.py +0 -0
  28. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/reader.py +0 -0
  29. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/retention.py +0 -0
  30. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/tests/__init__.py +0 -0
  31. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/tests/test_chain.py +0 -0
  32. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/tests/test_crypto.py +0 -0
  33. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/tests/test_emit.py +0 -0
  34. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/tests/test_events.py +0 -0
  35. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/tests/test_retention.py +0 -0
  36. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/writer_parquet.py +0 -0
  37. {supertable-2.3.5 → supertable-2.3.6}/supertable/audit/writer_redis.py +0 -0
  38. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/__init__.py +0 -0
  39. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/defaults.py +0 -0
  40. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/homedir.py +0 -0
  41. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/settings.py +0 -0
  42. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/tests/__init__.py +0 -0
  43. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/tests/test_defaults.py +0 -0
  44. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/tests/test_homedir.py +0 -0
  45. {supertable-2.3.5 → supertable-2.3.6}/supertable/config/tests/test_settings.py +0 -0
  46. {supertable-2.3.5 → supertable-2.3.6}/supertable/data_classes.py +0 -0
  47. {supertable-2.3.5 → supertable-2.3.6}/supertable/data_reader.py +0 -0
  48. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/__init__.py +0 -0
  49. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/__init__.py +0 -0
  50. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/__main__.py +0 -0
  51. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/check_filter_builder.py +0 -0
  52. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/controller.py +0 -0
  53. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
  54. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/defaults.py +0 -0
  55. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/dummy_data.py +0 -0
  56. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/read_parquet_header.py +0 -0
  57. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
  58. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
  59. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
  60. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
  61. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
  62. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
  63. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
  64. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
  65. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
  66. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
  67. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
  68. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
  69. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
  70. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
  71. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
  72. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
  73. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
  74. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
  75. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
  76. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
  77. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
  78. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
  79. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
  80. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
  81. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/webshop/__init__.py +0 -0
  82. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/webshop/core.py +0 -0
  83. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/webshop/defaults.py +0 -0
  84. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/webshop/generate.py +0 -0
  85. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/webshop/load.py +0 -0
  86. {supertable-2.3.5 → supertable-2.3.6}/supertable/demo/webshop/topup.py +0 -0
  87. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/__init__.py +0 -0
  88. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/data_estimator.py +0 -0
  89. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/duckdb_lite.py +0 -0
  90. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/duckdb_pro.py +0 -0
  91. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/engine_config.py +0 -0
  92. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/engine_enum.py +0 -0
  93. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/executor.py +0 -0
  94. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/plan_stats.py +0 -0
  95. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/spark_thrift.py +0 -0
  96. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/tests/__init__.py +0 -0
  97. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/tests/test_engine_config.py +0 -0
  98. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/tests/test_engine_routing.py +0 -0
  99. {supertable-2.3.5 → supertable-2.3.6}/supertable/engine/tests/test_engine_spill.py +0 -0
  100. {supertable-2.3.5 → supertable-2.3.6}/supertable/errors.py +0 -0
  101. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/__init__.py +0 -0
  102. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/benchmarks/__init__.py +0 -0
  103. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
  104. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
  105. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
  106. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/file_lock.py +0 -0
  107. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/redis_lock.py +0 -0
  108. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/tests/__init__.py +0 -0
  109. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/tests/test_file_lock.py +0 -0
  110. {supertable-2.3.5 → supertable-2.3.6}/supertable/locking/tests/test_redis_lock.py +0 -0
  111. {supertable-2.3.5 → supertable-2.3.6}/supertable/logging.py +0 -0
  112. {supertable-2.3.5 → supertable-2.3.6}/supertable/meta_reader.py +0 -0
  113. {supertable-2.3.5 → supertable-2.3.6}/supertable/mirroring/__init__.py +0 -0
  114. {supertable-2.3.5 → supertable-2.3.6}/supertable/mirroring/mirror_delta.py +0 -0
  115. {supertable-2.3.5 → supertable-2.3.6}/supertable/mirroring/mirror_formats.py +0 -0
  116. {supertable-2.3.5 → supertable-2.3.6}/supertable/mirroring/mirror_iceberg.py +0 -0
  117. {supertable-2.3.5 → supertable-2.3.6}/supertable/mirroring/mirror_parquet.py +0 -0
  118. {supertable-2.3.5 → supertable-2.3.6}/supertable/monitoring/__init__.py +0 -0
  119. {supertable-2.3.5 → supertable-2.3.6}/supertable/monitoring/partitions.py +0 -0
  120. {supertable-2.3.5 → supertable-2.3.6}/supertable/monitoring_writer.py +0 -0
  121. {supertable-2.3.5 → supertable-2.3.6}/supertable/plan_extender.py +0 -0
  122. {supertable-2.3.5 → supertable-2.3.6}/supertable/query_plan_manager.py +0 -0
  123. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/__init__.py +0 -0
  124. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/access_control.py +0 -0
  125. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/filter_builder.py +0 -0
  126. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/permissions.py +0 -0
  127. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/role_manager.py +0 -0
  128. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/row_column_security.py +0 -0
  129. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/tests/test_filter_builder.py +0 -0
  130. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/tests/test_rbac.py +0 -0
  131. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
  132. {supertable-2.3.5 → supertable-2.3.6}/supertable/rbac/user_manager.py +0 -0
  133. {supertable-2.3.5 → supertable-2.3.6}/supertable/redis_catalog.py +0 -0
  134. {supertable-2.3.5 → supertable-2.3.6}/supertable/redis_connector.py +0 -0
  135. {supertable-2.3.5 → supertable-2.3.6}/supertable/redis_infra.py +0 -0
  136. {supertable-2.3.5 → supertable-2.3.6}/supertable/redis_keys.py +0 -0
  137. {supertable-2.3.5 → supertable-2.3.6}/supertable/simple_table.py +0 -0
  138. {supertable-2.3.5 → supertable-2.3.6}/supertable/staging_area.py +0 -0
  139. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/__init__.py +0 -0
  140. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/azure_storage.py +0 -0
  141. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/gcp_storage.py +0 -0
  142. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/minio_storage.py +0 -0
  143. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/s3_storage.py +0 -0
  144. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/storage_factory.py +0 -0
  145. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/storage_interface.py +0 -0
  146. {supertable-2.3.5 → supertable-2.3.6}/supertable/storage/tests/test_storage.py +0 -0
  147. {supertable-2.3.5 → supertable-2.3.6}/supertable/super_pipe.py +0 -0
  148. {supertable-2.3.5 → supertable-2.3.6}/supertable/super_table.py +0 -0
  149. {supertable-2.3.5 → supertable-2.3.6}/supertable/system_query.py +0 -0
  150. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/__init__.py +0 -0
  151. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_align_to_schema_fix.py +0 -0
  152. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_create_if_missing.py +0 -0
  153. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_data_reader.py +0 -0
  154. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_data_reader_preflight.py +0 -0
  155. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_data_writer.py +0 -0
  156. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_data_writer_comprehensive.py +0 -0
  157. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_errors.py +0 -0
  158. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_meta_reader.py +0 -0
  159. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_monitoring_partitions.py +0 -0
  160. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_monitoring_sink_guard.py +0 -0
  161. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_newer_than.py +0 -0
  162. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_parquet_statistics.py +0 -0
  163. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_processing.py +0 -0
  164. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_processing_compact_resources.py +0 -0
  165. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_query_sql.py +0 -0
  166. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_read_pruning_differential.py +0 -0
  167. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_read_pruning_integration.py +0 -0
  168. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_redis_key_prefix.py +0 -0
  169. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_simple_table.py +0 -0
  170. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_stats_cache.py +0 -0
  171. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_stats_pruning.py +0 -0
  172. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_stats_schema_snapshot.py +0 -0
  173. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_super_table.py +0 -0
  174. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_supertable_all.py +0 -0
  175. {supertable-2.3.5 → supertable-2.3.6}/supertable/tests/test_system_query.py +0 -0
  176. {supertable-2.3.5 → supertable-2.3.6}/supertable/utils/__init__.py +0 -0
  177. {supertable-2.3.5 → supertable-2.3.6}/supertable/utils/helper.py +0 -0
  178. {supertable-2.3.5 → supertable-2.3.6}/supertable/utils/profiler.py +0 -0
  179. {supertable-2.3.5 → supertable-2.3.6}/supertable/utils/sql_parser.py +0 -0
  180. {supertable-2.3.5 → supertable-2.3.6}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
  181. {supertable-2.3.5 → supertable-2.3.6}/supertable/utils/timer.py +0 -0
  182. {supertable-2.3.5 → supertable-2.3.6}/supertable.egg-info/SOURCES.txt +0 -0
  183. {supertable-2.3.5 → supertable-2.3.6}/supertable.egg-info/dependency_links.txt +0 -0
  184. {supertable-2.3.5 → supertable-2.3.6}/supertable.egg-info/entry_points.txt +0 -0
  185. {supertable-2.3.5 → supertable-2.3.6}/supertable.egg-info/requires.txt +0 -0
  186. {supertable-2.3.5 → supertable-2.3.6}/supertable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.5
3
+ Version: 2.3.6
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "supertable"
7
- version = "2.3.5"
7
+ version = "2.3.6"
8
8
  description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
19
19
 
20
20
  setup(
21
21
  name="supertable",
22
- version="2.3.5",
22
+ version="2.3.6",
23
23
  description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
24
24
  long_description=long_description,
25
25
  long_description_content_type="text/markdown",
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
25
25
  project documentation for the full API surface.
26
26
  """
27
27
 
28
- __version__ = "2.3.5"
28
+ __version__ = "2.3.6"
29
29
 
30
30
  # Re-export the core public surface so users can do ``from supertable import …``
31
31
  # instead of remembering submodule paths.
@@ -5,6 +5,7 @@ import json
5
5
  import os
6
6
  import time
7
7
  import uuid
8
+ from concurrent.futures import ThreadPoolExecutor
8
9
  from datetime import datetime, timezone
9
10
  import re
10
11
 
@@ -343,8 +344,18 @@ class DataWriter:
343
344
  # layout and tight row-group zonemaps). Together with __rowid__ it
344
345
  # is hidden from query output by the read view's
345
346
  # ``EXCLUDE (__rowid__, __timestamp__)`` projection.
347
+ #
348
+ # System-owned, exactly like __rowid__ above: ALWAYS overwrite any
349
+ # caller-supplied __timestamp__ instead of preserving it. It is a
350
+ # reserved internal column that is both the dedup ORDER BY key (newest
351
+ # per key wins) and the source of the __p_year__/month/day partition
352
+ # derivation (processing.py); letting a caller inject an arbitrary value
353
+ # (wrong dtype, non-UTC, or chosen to game which row wins) would
354
+ # silently corrupt partitioning and dedup. ``newer_than`` is the
355
+ # supported, explicit mechanism for caller-controlled conflict
356
+ # resolution.
346
357
  table_config = self._get_table_config(simple_name)
347
- if not delete_only and "__timestamp__" not in dataframe.columns:
358
+ if not delete_only:
348
359
  dataframe = dataframe.with_columns(
349
360
  polars.lit(datetime.now(timezone.utc)).alias("__timestamp__")
350
361
  )
@@ -511,12 +522,21 @@ class DataWriter:
511
522
  # Load the current deletion-vector once: used both to exclude
512
523
  # already-tombstoned rows from this write's deletes (below) and,
513
524
  # via prev_df, to extend the vector without a second read.
525
+ # required=True: a DV that exists but cannot be read must abort
526
+ # the write, never be treated as empty — silently dropping the
527
+ # carried-forward vector would resurrect previously deleted rows.
514
528
  prev_dv_df = (
515
- _read_parquet_safe(prev_tombstone_path, profiler=profiler)
529
+ _read_parquet_safe(prev_tombstone_path, profiler=profiler, required=True)
516
530
  if prev_tombstone_path else None
517
531
  )
532
+ # The rowid set is consumed only by the idempotency filter below,
533
+ # which runs only when this write actually tombstones rows
534
+ # (overwrite or delete_only). Pure appends tombstone nothing, so
535
+ # skip materialising the whole deletion-vector as a Python set —
536
+ # prev_dv_df is still carried forward into build_tombstone_file.
518
537
  prev_dv_rowids = set()
519
- if prev_dv_df is not None and "__rowid__" in prev_dv_df.columns:
538
+ if (overwrite_columns or delete_only) and prev_dv_df is not None \
539
+ and "__rowid__" in prev_dv_df.columns:
520
540
  prev_dv_rowids = set(prev_dv_df.get_column("__rowid__").to_list())
521
541
 
522
542
  # 1. Identify which existing rows this write deletes/replaces.
@@ -555,38 +575,87 @@ class DataWriter:
555
575
  f"(excluded {len(prev_dv_rowids)} row(s) already in the deletion-vector)"
556
576
  ))
557
577
 
558
- # 2. Write the incoming rows as a new file (insert/upsert side).
559
- # delete_only carries only predicate columns — nothing to insert.
560
- if not delete_only and dataframe.height > 0:
578
+ # 2. + 3. Write the incoming rows as a new data file (insert/
579
+ # upsert side) AND carry-forward/extend the deletion-vector
580
+ # tombstone file. These two object-store PUTs are independent:
581
+ # neither reads the other's output and they write to disjoint
582
+ # dirs (data/ vs tombstone/), so they run concurrently to
583
+ # overlap the two round-trips. delete_only carries only
584
+ # predicate columns → nothing to insert. No new deletes →
585
+ # build_tombstone reuses the previous file (combined_df=None).
586
+ #
587
+ # Profiler is NOT thread-safe, so each branch records into its
588
+ # own sub-profiler which the parent merges after the join;
589
+ # each branch also measures its own wall time so the per-phase
590
+ # monitoring timings stay meaningful despite the overlap.
591
+ # Footers of files written via the write_bytes path are captured
592
+ # in footer_md_cache so stats extraction (step 6) reuses them
593
+ # instead of re-downloading each freshly-written file.
594
+ footer_md_cache = {}
595
+ tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
596
+ do_insert = (not delete_only and dataframe.height > 0)
597
+
598
+ def _write_data_branch():
599
+ sub = Profiler()
600
+ t = time.perf_counter()
561
601
  write_parquet_and_collect_resources(
562
602
  write_df=dataframe,
563
603
  overwrite_columns=[],
564
604
  data_dir=simple_table.data_dir,
565
605
  new_resources=new_resources,
566
606
  compression_level=compression_level,
567
- profiler=profiler,
607
+ profiler=sub,
608
+ footer_md_out=footer_md_cache,
568
609
  )
610
+ return sub, time.perf_counter() - t
611
+
612
+ def _write_tombstone_branch():
613
+ sub = Profiler()
614
+ t = time.perf_counter()
615
+ tp, cdf = build_tombstone_file(
616
+ tombstone_dir=tombstone_dir,
617
+ prev_tombstone_path=prev_tombstone_path,
618
+ new_pairs=new_delete_pairs,
619
+ compression_level=compression_level,
620
+ profiler=sub,
621
+ prev_df=prev_dv_df,
622
+ )
623
+ return tp, cdf, sub, time.perf_counter() - t
624
+
625
+ if do_insert:
626
+ with ThreadPoolExecutor(max_workers=2) as _ex:
627
+ _f_data = _ex.submit(_write_data_branch)
628
+ _f_tomb = _ex.submit(_write_tombstone_branch)
629
+ # .result() re-raises in the parent: a failure in either
630
+ # PUT aborts the write before any snapshot commit, exactly
631
+ # as the former sequential path did (an orphaned immutable
632
+ # file no snapshot references is harmless garbage).
633
+ data_sub, data_secs = _f_data.result()
634
+ tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
635
+ _f_tomb.result()
636
+ )
637
+ profiler.merge(data_sub)
638
+ profiler.merge(tomb_sub)
569
639
  inserted = dataframe.height
570
640
  else:
641
+ tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
642
+ _write_tombstone_branch()
643
+ )
644
+ profiler.merge(tomb_sub)
645
+ data_secs = 0.0
571
646
  inserted = 0
572
- mark("write_parquet")
647
+
648
+ # Assign the two per-phase timings from each branch's own measured
649
+ # wall time (they overlapped, so the serial mark() deltas would
650
+ # misattribute the time), then advance the mark() baseline.
651
+ timings["write_parquet"] = data_secs
652
+ timings["build_tombstone"] = tomb_secs
653
+ t_last = time.time()
573
654
  logger.debug(lp(
574
655
  f"step[write]: appended {inserted} incoming row(s) as {len(new_resources)} "
575
656
  f"new immutable file(s) (no existing data file rewritten)"
576
657
  ))
577
658
 
578
- # 3. Carry forward + extend the deletion-vector tombstone file.
579
- # No new deletes → reuse the previous file (combined_df=None).
580
- tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
581
- tombstone_path, combined_tombstone_df = build_tombstone_file(
582
- tombstone_dir=tombstone_dir,
583
- prev_tombstone_path=prev_tombstone_path,
584
- new_pairs=new_delete_pairs,
585
- compression_level=compression_level,
586
- profiler=profiler,
587
- prev_df=prev_dv_df,
588
- )
589
-
590
659
  # Track the live deletion-vector row count so meta reads can
591
660
  # deduct dead rows from the physical resource row totals.
592
661
  # New deletes → combined_tombstone_df is the full deduped DV
@@ -596,7 +665,6 @@ class DataWriter:
596
665
  if combined_tombstone_df is not None
597
666
  else int(last_simple_table.get("tombstone_rows", 0) or 0)
598
667
  )
599
- mark("build_tombstone")
600
668
  logger.debug(lp(
601
669
  f"step[tombstone]: deletion-vector now {tombstone_rows} row(s) "
602
670
  f"({'rewritten' if combined_tombstone_df is not None else 'carried forward unchanged'})"
@@ -745,7 +813,9 @@ class DataWriter:
745
813
  r.get("file") for r in new_resources
746
814
  if isinstance(r, dict) and r.get("file")
747
815
  ]
748
- new_stats_rows = extract_stats_rows(new_data_files, profiler=profiler)
816
+ new_stats_rows = extract_stats_rows(
817
+ new_data_files, profiler=profiler, footer_md_cache=footer_md_cache
818
+ )
749
819
  stats_path, combined_stats_df = build_stats_file(
750
820
  stats_dir=stats_dir,
751
821
  prev_stats_path=last_simple_table.get("stats_file"),
@@ -1181,8 +1251,17 @@ class DataWriter:
1181
1251
  # the *write* path; compact() is explicit maintenance and always
1182
1252
  # consumes the vector.
1183
1253
  tombstone_path = last_simple_table.get("tombstone")
1254
+ # required=True: a DV that exists but cannot be read must abort the
1255
+ # compaction, never be treated as empty. A swallowed read here would
1256
+ # set should_run_tombstones=False, skipping both Phase A and the
1257
+ # pointer-clear below, so Phase B would carry the dead rows into the
1258
+ # new file while the vector kept pointing at the sunset __file__ —
1259
+ # leaving them permanently unreclaimable. Failing loud leaves the
1260
+ # prior snapshot + vector intact for a retry, and matches the
1261
+ # write-path carry-forward read (required=True) above.
1184
1262
  tombstone_df = (
1185
- _read_parquet_safe(tombstone_path) if tombstone_path else None
1263
+ _read_parquet_safe(tombstone_path, required=True)
1264
+ if tombstone_path else None
1186
1265
  )
1187
1266
  tombstone_rows = (
1188
1267
  tombstone_df.height if tombstone_df is not None else 0
@@ -1246,6 +1325,24 @@ class DataWriter:
1246
1325
  r for r in (list(tomb_new_resources) + list(small_new_resources))
1247
1326
  if r.get("file") not in all_sunset
1248
1327
  ]
1328
+ # ``all_new_resources`` is the full set of files written by THIS
1329
+ # compaction; it feeds stats extraction, the schema model_df and the
1330
+ # result metrics below, all of which need every new file.
1331
+ #
1332
+ # For ``simple_table.update`` it must NOT be reused verbatim, though:
1333
+ # Phase A's outputs were already spliced into
1334
+ # ``last_simple_table["resources"]`` (the in-memory baseline that
1335
+ # ``update`` starts from) right after Phase A ran. ``update`` does
1336
+ # ``(baseline - sunset) + new_resources`` with no dedup, so any
1337
+ # Phase-A output that Phase B did NOT consume (left un-sunset because
1338
+ # it exceeded the ``small_only`` threshold, or its read failed) would
1339
+ # be counted once from the baseline AND once from new_resources —
1340
+ # i.e. the same file listed twice in the new snapshot. Hand ``update``
1341
+ # only Phase B's brand-new files, which are the only resources genuinely
1342
+ # absent from that baseline.
1343
+ update_new_resources = [
1344
+ r for r in small_new_resources if r.get("file") not in all_sunset
1345
+ ]
1249
1346
  result["files_compacted"] = considered
1250
1347
  result["new_resources"] = len(all_new_resources)
1251
1348
  result["sunset_files"] = len(all_sunset)
@@ -1338,7 +1435,7 @@ class DataWriter:
1338
1435
  )
1339
1436
 
1340
1437
  new_snapshot_dict, new_snapshot_path = simple_table.update(
1341
- all_new_resources,
1438
+ update_new_resources,
1342
1439
  all_sunset,
1343
1440
  model_df,
1344
1441
  last_snapshot=last_simple_table,
@@ -731,12 +731,66 @@ def new_duckdb_connection(
731
731
  purely local scans.
732
732
  """
733
733
  con = duckdb.connect()
734
- init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
735
- if for_paths and any("://" in str(p) for p in for_paths):
734
+ try:
735
+ init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
736
+ if for_paths and any("://" in str(p) for p in for_paths):
737
+ configure_httpfs_and_s3(con, for_paths)
738
+ except Exception:
739
+ # Don't leak the half-initialised connection if a pragma / httpfs load
740
+ # raises; re-raise so callers still fall back exactly as before.
741
+ con.close()
742
+ raise
743
+ return con
744
+
745
+
746
+ # Thread-local pool for the write-side probe connection. DuckDB connections are
747
+ # NOT thread-safe, so each thread keeps its own; reusing it amortises the
748
+ # ~150 ms init/warmup across writes on the same thread — the same reason the
749
+ # read executors hold a persistent connection.
750
+ _probe_pool = threading.local()
751
+
752
+
753
+ def get_pooled_duckdb_connection(
754
+ temp_dir: str,
755
+ for_paths: Optional[List[str]] = None,
756
+ memory_limit: str = "1GB",
757
+ ) -> duckdb.DuckDBPyConnection:
758
+ """Return this thread's pooled probe connection, building it on first use.
759
+
760
+ The cold build goes through ``new_duckdb_connection`` so the pinned
761
+ ``home_directory`` / pragma contract is byte-for-byte identical to a
762
+ transient connection. On a *warm* connection httpfs/S3 is re-applied for
763
+ remote paths so a connection first built for local paths can still serve a
764
+ later remote probe and credentials always reflect the current environment
765
+ (``configure_httpfs_and_s3`` re-reads env each call and is idempotent).
766
+ """
767
+ con = getattr(_probe_pool, "con", None)
768
+ if con is None:
769
+ con = new_duckdb_connection(
770
+ temp_dir=temp_dir, for_paths=for_paths, memory_limit=memory_limit
771
+ )
772
+ _probe_pool.con = con
773
+ elif for_paths and any("://" in str(p) for p in for_paths):
736
774
  configure_httpfs_and_s3(con, for_paths)
737
775
  return con
738
776
 
739
777
 
778
+ def reset_pooled_duckdb_connections() -> None:
779
+ """Close and drop the calling thread's pooled probe connection.
780
+
781
+ A no-op when the thread has none. Used for test determinism and as an
782
+ eviction hook; the pool slot is cleared before the close so a failing close
783
+ still leaves the thread ready to rebuild.
784
+ """
785
+ con = getattr(_probe_pool, "con", None)
786
+ if con is not None:
787
+ _probe_pool.con = None
788
+ try:
789
+ con.close()
790
+ except Exception:
791
+ pass
792
+
793
+
740
794
  def apply_runtime_pragmas(con: duckdb.DuckDBPyConnection, cfg) -> None:
741
795
  """Re-apply the session-settable DuckDB pragmas from a live engine config.
742
796
 
@@ -86,6 +86,21 @@ def _mock_redis_catalog():
86
86
  yield
87
87
 
88
88
 
89
@pytest.fixture(autouse=True)
def _reset_probe_pool():
    """Give every test a cold thread-local write-probe connection pool.

    The write probe reuses a pooled connection across writes. Tests that count
    how often ``new_duckdb_connection`` is built therefore need an empty pool
    going in, and the pool is emptied again on the way out so no connection
    leaks into the following test.
    """
    from supertable.engine.engine_common import (
        reset_pooled_duckdb_connections as _drop_pooled_connection,
    )

    # Setup: start from a cold pool.
    _drop_pooled_connection()
    yield
    # Teardown: do not let this test's connection outlive it.
    _drop_pooled_connection()
102
+
103
+
89
104
  @pytest.fixture()
90
105
  def duckdb_con():
91
106
  """Provide a real in-memory DuckDB connection, closed after each test."""
@@ -587,6 +587,38 @@ class TestReadWriteDuckDBParity:
587
587
  # for_paths forwarded so httpfs is loaded for remote scans.
588
588
  assert "for_paths" in calls[0][1]
589
589
 
590
def test_probe_reuses_pooled_connection(self, tmp_path, monkeypatch):
    """Two probes on one thread must share a single pooled connection.

    ``new_duckdb_connection`` is wrapped with a recording pass-through; after
    a cold and a warm probe it must have been invoked exactly once, i.e. the
    connection warmup is paid on the first probe and amortised afterwards.
    """
    import polars
    from supertable import processing as _processing

    monkeypatch.setattr(_processing, "_get_storage", lambda: object())

    parquet_path = str(tmp_path / "f1.parquet")
    existing = polars.DataFrame({"__rowid__": [10, 20], "id": [1, 2]})
    existing.write_parquet(parquet_path)

    calls = []
    real = _engine_common.new_duckdb_connection

    def _recording_builder(*args, **kwargs):
        # Record the build request, then delegate to the real builder.
        calls.append((args, kwargs))
        return real(*args, **kwargs)

    monkeypatch.setattr(_engine_common, "new_duckdb_connection", _recording_builder)

    def _probe():
        return _processing._duckdb_probe_overlap_matches(
            overlap_true_files=[(parquet_path, 0)],
            overwrite_columns=["id"],
            newer_than_col=None,
            incoming_keys=polars.DataFrame({"id": [2]}),
        )

    # First iteration is the cold probe, second is the warm one.
    for _ in range(2):
        assert _probe() is not None

    # Built on the cold probe, reused on the warm one.
    assert len(calls) == 1
621
+
590
622
  def test_probe_matches_rows_on_local_parquet(self, tmp_path, monkeypatch):
591
623
  import polars
592
624
  from supertable import processing as _processing
@@ -212,12 +212,17 @@ def concat_many_with_union(frames: List[polars.DataFrame]) -> polars.DataFrame:
212
212
  # Safe storage I/O helpers
213
213
  # =========================
214
214
 
215
def _safe_exists(path: str, profiler: Optional[Profiler] = None, strict: bool = False) -> bool:
    """Report whether *path* exists in storage, timed under the ``io.exists`` span.

    By default the check is lenient: any exception raised by the existence
    probe is treated as "absent" and ``False`` is returned. With
    ``strict=True`` the exception is re-raised instead, so callers such as
    carry-forward reads do not mistake a backend error for a genuine absence.
    """
    prof = profiler or get_null_profiler()
    try:
        with prof.span("io.exists"):
            return _get_storage().exists(path)
    except Exception:
        if not strict:
            # Lenient mode: a failed probe counts as "not there".
            return False
        # Strict mode: surface the backend error to the caller.
        raise
222
227
 
223
228
 
@@ -226,9 +231,21 @@ def _read_parquet_safe(
226
231
  profiler: Optional[Profiler] = None,
227
232
  file_size: int = 0,
228
233
  columns: Optional[List[str]] = None,
234
+ required: bool = False,
229
235
  ) -> Optional[polars.DataFrame]:
236
+ """Read a parquet object into polars, or ``None`` when it is absent.
237
+
238
+ When *required* is True a genuine read failure — the object exists but cannot
239
+ be read (corrupt body, transient/persistent backend error) — is re-raised
240
+ instead of being swallowed to ``None``. Absence still returns ``None`` even
241
+ when required (a missing object, or one sunset by a concurrent writer, is a
242
+ legitimate "no previous artifact" signal). Carry-forward callers that would
243
+ otherwise silently drop a still-referenced artifact — the deletion-vector —
244
+ must pass ``required=True`` so a failed read aborts the write rather than
245
+ persisting a truncated successor (which would resurrect deleted rows).
246
+ """
230
247
  p = profiler or get_null_profiler()
231
- if not _safe_exists(path, profiler=p):
248
+ if not _safe_exists(path, profiler=p, strict=required):
232
249
  logging.info(f"[race] file already sunset by another writer: {path}")
233
250
  return None
234
251
  try:
@@ -251,6 +268,8 @@ def _read_parquet_safe(
251
268
  return None
252
269
  except Exception as e:
253
270
  logging.warning(f"[read] failed to read parquet at {path}: {e}")
271
+ if required:
272
+ raise
254
273
  return None
255
274
 
256
275
 
@@ -553,6 +572,7 @@ def compact_resources(
553
572
  def write_parquet_and_collect_resources(
554
573
  write_df, overwrite_columns, data_dir, new_resources, compression_level=10,
555
574
  profiler: Optional[Profiler] = None,
575
+ footer_md_out: Optional[Dict] = None,
556
576
  ):
557
577
  """Write a DataFrame as one or more Parquet files and append resource dicts.
558
578
 
@@ -593,7 +613,7 @@ def write_parquet_and_collect_resources(
593
613
 
594
614
  if has_nulls:
595
615
  null_df = partitioned.filter(null_mask).drop(["__p_year__", "__p_month__", "__p_day__"])
596
- _write_single_parquet_file(null_df, overwrite_columns, data_dir, new_resources, compression_level, profiler=profiler)
616
+ _write_single_parquet_file(null_df, overwrite_columns, data_dir, new_resources, compression_level, profiler=profiler, footer_md_out=footer_md_out)
597
617
  partitioned = partitioned.filter(~null_mask)
598
618
 
599
619
  if partitioned.height > 0:
@@ -613,16 +633,17 @@ def write_parquet_and_collect_resources(
613
633
  )
614
634
  _write_single_parquet_file(
615
635
  group_df, overwrite_columns, partition_dir, new_resources, compression_level,
616
- profiler=profiler,
636
+ profiler=profiler, footer_md_out=footer_md_out,
617
637
  )
618
638
  else:
619
639
  # --- Flat write path (no __timestamp__) — backward compatible ---
620
- _write_single_parquet_file(write_df, overwrite_columns, data_dir, new_resources, compression_level, profiler=profiler)
640
+ _write_single_parquet_file(write_df, overwrite_columns, data_dir, new_resources, compression_level, profiler=profiler, footer_md_out=footer_md_out)
621
641
 
622
642
 
623
643
  def _write_single_parquet_file(
624
644
  write_df, overwrite_columns, target_dir, new_resources, compression_level=10,
625
645
  profiler: Optional[Profiler] = None,
646
+ footer_md_out: Optional[Dict] = None,
626
647
  ):
627
648
  """Write a single Parquet file into *target_dir* and append a resource entry.
628
649
 
@@ -676,6 +697,17 @@ def _write_single_parquet_file(
676
697
  if hasattr(_get_storage(), "write_bytes"):
677
698
  with p.span("write.upload_bytes"):
678
699
  _get_storage().write_bytes(new_parquet_path, data)
700
+ # The uploaded bytes ARE ``data`` here, so parse the footer in memory
701
+ # (footer-only, no decode, no network round-trip) for stats reuse.
702
+ # ONLY on this path: the write_parquet / polars fallbacks below
703
+ # re-encode via a different writer, so their on-disk row-group layout
704
+ # and statistics need not match ``data`` — reusing it there could
705
+ # mis-prune row groups on read.
706
+ if footer_md_out is not None:
707
+ try:
708
+ footer_md_out[new_parquet_path] = pq.read_metadata(io.BytesIO(data))
709
+ except Exception:
710
+ pass
679
711
  elif hasattr(_get_storage(), "write_parquet"):
680
712
  with p.span("write.upload_parquet"):
681
713
  _get_storage().write_parquet(arrow_tbl, new_parquet_path)
@@ -799,9 +831,14 @@ def filter_stale_incoming_rows(
799
831
  polars.col(newer_than_col).max().alias("__existing_max__")
800
832
  )
801
833
 
802
- # Left join incoming against existing max
834
+ # Left join incoming against existing max. nulls_equal=True so a NULL key
835
+ # compares against the existing NULL group's max, consistent with the
836
+ # null-safe delete semi-join — otherwise an older NULL-keyed row would skip
837
+ # the stale filter yet still tombstone the newer existing NULL-keyed row.
803
838
  with p.span("newer_than.join_filter"):
804
- joined = incoming_df.join(existing_max, on=overwrite_columns, how="left")
839
+ joined = incoming_df.join(
840
+ existing_max, on=overwrite_columns, how="left", nulls_equal=True
841
+ )
805
842
 
806
843
  # Keep rows where:
807
844
  # - no existing data for this key (null max → new key)
@@ -970,7 +1007,12 @@ def identify_all_rowids(
970
1007
  if file_cache is not None and file in file_cache:
971
1008
  existing_df = file_cache.get(file)
972
1009
  else:
973
- existing_df = _read_parquet_safe(file, profiler=p, file_size=file_size)
1010
+ # Only __rowid__ is consumed below, so read just that column chunk.
1011
+ # A delete-all can touch every file; a full-width read would pull all
1012
+ # columns of every file into memory for nothing.
1013
+ existing_df = _read_parquet_safe(
1014
+ file, profiler=p, file_size=file_size, columns=[ROWID_COL]
1015
+ )
974
1016
  if existing_df is None or ROWID_COL not in existing_df.columns:
975
1017
  continue
976
1018
  rowids = existing_df.get_column(ROWID_COL).drop_nulls().to_list()
@@ -1072,7 +1114,7 @@ def _duckdb_probe_overlap_matches(
1072
1114
  try:
1073
1115
  import duckdb # noqa: F401 (imported for availability check / errors)
1074
1116
  from supertable.engine.engine_common import (
1075
- new_duckdb_connection,
1117
+ get_pooled_duckdb_connection,
1076
1118
  configure_httpfs_and_s3,
1077
1119
  escape_parquet_path,
1078
1120
  quote_if_needed,
@@ -1129,11 +1171,12 @@ def _duckdb_probe_overlap_matches(
1129
1171
 
1130
1172
  con = None
1131
1173
  try:
1132
- # Build the connection exactly like the read path (same pragmas, and a
1133
- # pinned home_directory) so the probe never falls back to the OS home —
1134
- # which is absent under a restricted service user. httpfs/S3 is loaded
1135
- # by the helper only when duck_paths contain a remote URL.
1136
- con = new_duckdb_connection(temp_dir="write_probe", for_paths=duck_paths)
1174
+ # Reuse this thread's pooled connection (cold-built exactly like the
1175
+ # read path: same pragmas, pinned home_directory so the probe never
1176
+ # falls back to the OS home, which is absent under a restricted service
1177
+ # user). The pool re-applies httpfs/S3 for remote paths, so a warm
1178
+ # connection is configured for the current probe's object store.
1179
+ con = get_pooled_duckdb_connection(temp_dir="write_probe", for_paths=duck_paths)
1137
1180
  con.register(ik_name, incoming_keys.to_arrow())
1138
1181
  try:
1139
1182
  matched = _run(duck_paths)
@@ -1157,14 +1200,13 @@ def _duckdb_probe_overlap_matches(
1157
1200
  return None
1158
1201
  finally:
1159
1202
  if con is not None:
1203
+ # Return the connection to the thread-local pool (do NOT close it);
1204
+ # only drop the per-probe registered relation so the uuid-named
1205
+ # keys table can't accumulate across reuses.
1160
1206
  try:
1161
1207
  con.unregister(ik_name)
1162
1208
  except Exception:
1163
1209
  pass
1164
- try:
1165
- con.close()
1166
- except Exception:
1167
- pass
1168
1210
 
1169
1211
  if matched is None or "filename" not in matched.columns:
1170
1212
  return None
@@ -1240,7 +1282,11 @@ def _derive_stale_and_deletes(
1240
1282
  polars.col(newer_than_col).max().alias("__existing_max__")
1241
1283
  )
1242
1284
  with p.span("newer_than.join_filter"):
1243
- joined = incoming_df.join(existing_max, on=overwrite_columns, how="left")
1285
+ # nulls_equal=True keeps this consistent with the null-safe delete
1286
+ # semi-join below and the polars fallback oracle.
1287
+ joined = incoming_df.join(
1288
+ existing_max, on=overwrite_columns, how="left", nulls_equal=True
1289
+ )
1244
1290
  filtered = joined.filter(
1245
1291
  polars.col("__existing_max__").is_null()
1246
1292
  | (polars.col(newer_than_col) > polars.col("__existing_max__"))
@@ -1380,7 +1426,9 @@ def build_tombstone_file(
1380
1426
  )
1381
1427
 
1382
1428
  if prev_df is None and prev_tombstone_path:
1383
- prev_df = _read_parquet_safe(prev_tombstone_path, profiler=p)
1429
+ # required=True: refuse to build a truncated deletion-vector if the
1430
+ # previous one exists but cannot be read (would resurrect dead rows).
1431
+ prev_df = _read_parquet_safe(prev_tombstone_path, profiler=p, required=True)
1384
1432
  if prev_df is not None and prev_df.height > 0 and ROWID_COL in prev_df.columns:
1385
1433
  combined = polars.concat(
1386
1434
  [prev_df.select([TOMBSTONE_FILE_COL, ROWID_COL]), new_df],
@@ -1658,6 +1706,7 @@ def _empty_stats_df() -> polars.DataFrame:
1658
1706
  def extract_stats_rows(
1659
1707
  file_paths: List[str],
1660
1708
  profiler: Optional[Profiler] = None,
1709
+ footer_md_cache: Optional[Dict] = None,
1661
1710
  ) -> polars.DataFrame:
1662
1711
  """Read the footers of *file_paths* and return their stats rows.
1663
1712
 
@@ -1665,13 +1714,23 @@ def extract_stats_rows(
1665
1714
  ``__rowid__`` / ``__timestamp__`` columns. Files whose footer cannot be
1666
1715
  read (race / corruption) are skipped. Returns a frame with ``STATS_SCHEMA``
1667
1716
  (possibly empty).
1717
+
1718
+ *footer_md_cache* (optional) maps a file path to a parquet ``FileMetaData``
1719
+ already parsed in memory at write time (from the exact bytes that were
1720
+ uploaded). When a path is present its footer is reused directly, skipping a
1721
+ full-file re-download; otherwise the footer is read back from storage.
1668
1722
  """
1669
1723
  p = profiler or get_null_profiler()
1724
+ cache = footer_md_cache or {}
1670
1725
  all_rows: List[dict] = []
1671
1726
  for path in file_paths:
1672
1727
  if not path:
1673
1728
  continue
1674
- md = _read_footer_metadata(path, profiler=p)
1729
+ md = cache.get(path)
1730
+ if md is None:
1731
+ md = _read_footer_metadata(path, profiler=p)
1732
+ else:
1733
+ p.add("stats_footer_cache_hit", 1)
1675
1734
  if md is None:
1676
1735
  continue
1677
1736
  all_rows.extend(_stats_rows_for_metadata(path, md))
@@ -2244,7 +2303,18 @@ def compact_tombstones(
2244
2303
  # File already sunset by an earlier compaction — skip.
2245
2304
  continue
2246
2305
  file_size = int(resource.get("file_size") or 0)
2247
- existing_df = _read_parquet_safe(file_path, profiler=p, file_size=file_size)
2306
+ # required=True: this is the ONLY physical drain (Phase B is row-preserving
2307
+ # and never re-drops these rows), and the callers clear the deletion-vector
2308
+ # pointer unconditionally once the vector was non-empty. If a transient
2309
+ # backend error here were swallowed to None, this file's tombstoned rows
2310
+ # would be silently skipped yet the pointer cleared -> the rows RESURRECT on
2311
+ # read. Failing loud aborts the write/compact with the prior snapshot +
2312
+ # vector intact for retry (matches the carry-forward DV-pointer reads). A
2313
+ # genuine absence still returns None (file already sunset/raced -> its rows
2314
+ # are gone, so skipping it is correct).
2315
+ existing_df = _read_parquet_safe(
2316
+ file_path, profiler=p, file_size=file_size, required=True
2317
+ )
2248
2318
  if existing_df is None or ROWID_COL not in existing_df.columns:
2249
2319
  continue
2250
2320