supertable 2.3.5__tar.gz → 2.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. {supertable-2.3.5/supertable.egg-info → supertable-2.3.7}/PKG-INFO +1 -1
  2. {supertable-2.3.5 → supertable-2.3.7}/pyproject.toml +1 -1
  3. {supertable-2.3.5 → supertable-2.3.7}/setup.py +1 -1
  4. {supertable-2.3.5 → supertable-2.3.7}/supertable/__init__.py +1 -1
  5. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/settings.py +8 -0
  6. {supertable-2.3.5 → supertable-2.3.7}/supertable/data_writer.py +121 -24
  7. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_common.py +56 -2
  8. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/conftest.py +15 -0
  9. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine.py +32 -0
  10. {supertable-2.3.5 → supertable-2.3.7}/supertable/processing.py +107 -29
  11. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/local_storage.py +12 -1
  12. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_writer_compact.py +36 -5
  13. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_processing_stats.py +60 -0
  14. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_resolve_overwrite_writes.py +22 -5
  15. supertable-2.3.7/supertable/tests/test_write_probe_gate.py +130 -0
  16. {supertable-2.3.5 → supertable-2.3.7/supertable.egg-info}/PKG-INFO +1 -1
  17. {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/SOURCES.txt +1 -0
  18. {supertable-2.3.5 → supertable-2.3.7}/LICENSE +0 -0
  19. {supertable-2.3.5 → supertable-2.3.7}/README.md +0 -0
  20. {supertable-2.3.5 → supertable-2.3.7}/requirements.txt +0 -0
  21. {supertable-2.3.5 → supertable-2.3.7}/setup.cfg +0 -0
  22. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/__init__.py +0 -0
  23. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/admin.py +0 -0
  24. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/chain.py +0 -0
  25. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/consumers.py +0 -0
  26. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/crypto.py +0 -0
  27. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/events.py +0 -0
  28. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/export.py +0 -0
  29. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/logger.py +0 -0
  30. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/middleware.py +0 -0
  31. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/reader.py +0 -0
  32. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/retention.py +0 -0
  33. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/__init__.py +0 -0
  34. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_chain.py +0 -0
  35. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_crypto.py +0 -0
  36. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_emit.py +0 -0
  37. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_events.py +0 -0
  38. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/tests/test_retention.py +0 -0
  39. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/writer_parquet.py +0 -0
  40. {supertable-2.3.5 → supertable-2.3.7}/supertable/audit/writer_redis.py +0 -0
  41. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/__init__.py +0 -0
  42. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/defaults.py +0 -0
  43. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/homedir.py +0 -0
  44. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/__init__.py +0 -0
  45. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/test_defaults.py +0 -0
  46. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/test_homedir.py +0 -0
  47. {supertable-2.3.5 → supertable-2.3.7}/supertable/config/tests/test_settings.py +0 -0
  48. {supertable-2.3.5 → supertable-2.3.7}/supertable/data_classes.py +0 -0
  49. {supertable-2.3.5 → supertable-2.3.7}/supertable/data_reader.py +0 -0
  50. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/__init__.py +0 -0
  51. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/__init__.py +0 -0
  52. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/__main__.py +0 -0
  53. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/check_filter_builder.py +0 -0
  54. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/controller.py +0 -0
  55. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
  56. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/defaults.py +0 -0
  57. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/dummy_data.py +0 -0
  58. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/read_parquet_header.py +0 -0
  59. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
  60. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
  61. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
  62. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
  63. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
  64. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
  65. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
  66. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
  67. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
  68. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
  69. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
  70. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
  71. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
  72. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
  73. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
  74. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
  75. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
  76. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
  77. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
  78. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
  79. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
  80. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
  81. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
  82. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
  83. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/__init__.py +0 -0
  84. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/core.py +0 -0
  85. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/defaults.py +0 -0
  86. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/generate.py +0 -0
  87. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/load.py +0 -0
  88. {supertable-2.3.5 → supertable-2.3.7}/supertable/demo/webshop/topup.py +0 -0
  89. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/__init__.py +0 -0
  90. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/data_estimator.py +0 -0
  91. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/duckdb_lite.py +0 -0
  92. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/duckdb_pro.py +0 -0
  93. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_config.py +0 -0
  94. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/engine_enum.py +0 -0
  95. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/executor.py +0 -0
  96. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/plan_stats.py +0 -0
  97. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/spark_thrift.py +0 -0
  98. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/__init__.py +0 -0
  99. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine_config.py +0 -0
  100. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine_routing.py +0 -0
  101. {supertable-2.3.5 → supertable-2.3.7}/supertable/engine/tests/test_engine_spill.py +0 -0
  102. {supertable-2.3.5 → supertable-2.3.7}/supertable/errors.py +0 -0
  103. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/__init__.py +0 -0
  104. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/__init__.py +0 -0
  105. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
  106. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
  107. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
  108. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/file_lock.py +0 -0
  109. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/redis_lock.py +0 -0
  110. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/tests/__init__.py +0 -0
  111. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/tests/test_file_lock.py +0 -0
  112. {supertable-2.3.5 → supertable-2.3.7}/supertable/locking/tests/test_redis_lock.py +0 -0
  113. {supertable-2.3.5 → supertable-2.3.7}/supertable/logging.py +0 -0
  114. {supertable-2.3.5 → supertable-2.3.7}/supertable/meta_reader.py +0 -0
  115. {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/__init__.py +0 -0
  116. {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_delta.py +0 -0
  117. {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_formats.py +0 -0
  118. {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_iceberg.py +0 -0
  119. {supertable-2.3.5 → supertable-2.3.7}/supertable/mirroring/mirror_parquet.py +0 -0
  120. {supertable-2.3.5 → supertable-2.3.7}/supertable/monitoring/__init__.py +0 -0
  121. {supertable-2.3.5 → supertable-2.3.7}/supertable/monitoring/partitions.py +0 -0
  122. {supertable-2.3.5 → supertable-2.3.7}/supertable/monitoring_writer.py +0 -0
  123. {supertable-2.3.5 → supertable-2.3.7}/supertable/plan_extender.py +0 -0
  124. {supertable-2.3.5 → supertable-2.3.7}/supertable/query_plan_manager.py +0 -0
  125. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/__init__.py +0 -0
  126. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/access_control.py +0 -0
  127. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/filter_builder.py +0 -0
  128. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/permissions.py +0 -0
  129. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/role_manager.py +0 -0
  130. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/row_column_security.py +0 -0
  131. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/tests/test_filter_builder.py +0 -0
  132. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/tests/test_rbac.py +0 -0
  133. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
  134. {supertable-2.3.5 → supertable-2.3.7}/supertable/rbac/user_manager.py +0 -0
  135. {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_catalog.py +0 -0
  136. {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_connector.py +0 -0
  137. {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_infra.py +0 -0
  138. {supertable-2.3.5 → supertable-2.3.7}/supertable/redis_keys.py +0 -0
  139. {supertable-2.3.5 → supertable-2.3.7}/supertable/simple_table.py +0 -0
  140. {supertable-2.3.5 → supertable-2.3.7}/supertable/staging_area.py +0 -0
  141. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/__init__.py +0 -0
  142. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/azure_storage.py +0 -0
  143. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/gcp_storage.py +0 -0
  144. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/minio_storage.py +0 -0
  145. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/s3_storage.py +0 -0
  146. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/storage_factory.py +0 -0
  147. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/storage_interface.py +0 -0
  148. {supertable-2.3.5 → supertable-2.3.7}/supertable/storage/tests/test_storage.py +0 -0
  149. {supertable-2.3.5 → supertable-2.3.7}/supertable/super_pipe.py +0 -0
  150. {supertable-2.3.5 → supertable-2.3.7}/supertable/super_table.py +0 -0
  151. {supertable-2.3.5 → supertable-2.3.7}/supertable/system_query.py +0 -0
  152. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/__init__.py +0 -0
  153. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_align_to_schema_fix.py +0 -0
  154. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_create_if_missing.py +0 -0
  155. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_reader.py +0 -0
  156. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_reader_preflight.py +0 -0
  157. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_writer.py +0 -0
  158. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_data_writer_comprehensive.py +0 -0
  159. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_errors.py +0 -0
  160. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_meta_reader.py +0 -0
  161. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_monitoring_partitions.py +0 -0
  162. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_monitoring_sink_guard.py +0 -0
  163. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_newer_than.py +0 -0
  164. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_parquet_statistics.py +0 -0
  165. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_processing.py +0 -0
  166. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_processing_compact_resources.py +0 -0
  167. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_query_sql.py +0 -0
  168. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_read_pruning_differential.py +0 -0
  169. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_read_pruning_integration.py +0 -0
  170. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_redis_key_prefix.py +0 -0
  171. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_simple_table.py +0 -0
  172. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_stats_cache.py +0 -0
  173. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_stats_pruning.py +0 -0
  174. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_stats_schema_snapshot.py +0 -0
  175. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_super_table.py +0 -0
  176. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_supertable_all.py +0 -0
  177. {supertable-2.3.5 → supertable-2.3.7}/supertable/tests/test_system_query.py +0 -0
  178. {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/__init__.py +0 -0
  179. {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/helper.py +0 -0
  180. {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/profiler.py +0 -0
  181. {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/sql_parser.py +0 -0
  182. {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
  183. {supertable-2.3.5 → supertable-2.3.7}/supertable/utils/timer.py +0 -0
  184. {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/dependency_links.txt +0 -0
  185. {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/entry_points.txt +0 -0
  186. {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/requires.txt +0 -0
  187. {supertable-2.3.5 → supertable-2.3.7}/supertable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.5
3
+ Version: 2.3.7
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "supertable"
7
- version = "2.3.5"
7
+ version = "2.3.7"
8
8
  description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
19
19
 
20
20
  setup(
21
21
  name="supertable",
22
- version="2.3.5",
22
+ version="2.3.7",
23
23
  description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
24
24
  long_description=long_description,
25
25
  long_description_content_type="text/markdown",
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
25
25
  project documentation for the full API surface.
26
26
  """
27
27
 
28
- __version__ = "2.3.5"
28
+ __version__ = "2.3.7"
29
29
 
30
30
  # Re-export the core public surface so users can do ``from supertable import …``
31
31
  # instead of remembering submodule paths.
@@ -157,6 +157,13 @@ class Settings:
157
157
  SUPERTABLE_DUCKDB_MATERIALIZE: str = "view" # SUPERTABLE_DUCKDB_MATERIALIZE
158
158
  SUPERTABLE_DUCKDB_PRESIGNED: bool = False # SUPERTABLE_DUCKDB_PRESIGNED
159
159
  SUPERTABLE_DUCKDB_USE_HTTPFS: bool = False # SUPERTABLE_DUCKDB_USE_HTTPFS
160
+ # Write-path overwrite/delete resolution via the DuckDB pushdown probe.
161
+ # Disabled by default: the polars fallback reads only the projected key
162
+ # columns through the storage SDK and needs no httpfs extension, so it works
163
+ # in environments without one (or without internet to install it). Enable
164
+ # only where httpfs is available and the probe's row-group skipping is worth
165
+ # it (e.g. very wide tables / many overlapping files).
166
+ SUPERTABLE_DUCKDB_WRITE_PROBE: bool = False # SUPERTABLE_DUCKDB_WRITE_PROBE
160
167
  # Deletion-vector (tombstone) table cache. Each entry is a small
161
168
  # `DISTINCT __rowid__` table keyed by the stable tombstone path; the
162
169
  # tombstone view ANTI JOINs it instead of re-reading the parquet every
@@ -437,6 +444,7 @@ def _build_settings() -> Settings:
437
444
  SUPERTABLE_DUCKDB_MATERIALIZE=_env_str("SUPERTABLE_DUCKDB_MATERIALIZE", "view"),
438
445
  SUPERTABLE_DUCKDB_PRESIGNED=_env_bool("SUPERTABLE_DUCKDB_PRESIGNED", False),
439
446
  SUPERTABLE_DUCKDB_USE_HTTPFS=_env_bool("SUPERTABLE_DUCKDB_USE_HTTPFS", False),
447
+ SUPERTABLE_DUCKDB_WRITE_PROBE=_env_bool("SUPERTABLE_DUCKDB_WRITE_PROBE", False),
440
448
  SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_MAX_PER_TABLE=_env_int("SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_MAX_PER_TABLE", 8),
441
449
  SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_TTL_SEC=_env_int("SUPERTABLE_DUCKDB_TOMBSTONE_CACHE_TTL_SEC", 300),
442
450
  SUPERTABLE_DEBUG_TIMINGS=_env_bool("SUPERTABLE_DEBUG_TIMINGS", False),
@@ -5,6 +5,7 @@ import json
5
5
  import os
6
6
  import time
7
7
  import uuid
8
+ from concurrent.futures import ThreadPoolExecutor
8
9
  from datetime import datetime, timezone
9
10
  import re
10
11
 
@@ -343,8 +344,18 @@ class DataWriter:
343
344
  # layout and tight row-group zonemaps). Together with __rowid__ it
344
345
  # is hidden from query output by the read view's
345
346
  # ``EXCLUDE (__rowid__, __timestamp__)`` projection.
347
+ #
348
+ # System-owned, exactly like __rowid__ above: ALWAYS overwrite any
349
+ # caller-supplied __timestamp__ instead of preserving it. It is a
350
+ # reserved internal column that is both the dedup ORDER BY key (newest
351
+ # per key wins) and the source of the __p_year__/month/day partition
352
+ # derivation (processing.py); letting a caller inject an arbitrary value
353
+ # (wrong dtype, non-UTC, or chosen to game which row wins) would
354
+ # silently corrupt partitioning and dedup. ``newer_than`` is the
355
+ # supported, explicit mechanism for caller-controlled conflict
356
+ # resolution.
346
357
  table_config = self._get_table_config(simple_name)
347
- if not delete_only and "__timestamp__" not in dataframe.columns:
358
+ if not delete_only:
348
359
  dataframe = dataframe.with_columns(
349
360
  polars.lit(datetime.now(timezone.utc)).alias("__timestamp__")
350
361
  )
@@ -511,12 +522,21 @@ class DataWriter:
511
522
  # Load the current deletion-vector once: used both to exclude
512
523
  # already-tombstoned rows from this write's deletes (below) and,
513
524
  # via prev_df, to extend the vector without a second read.
525
+ # required=True: a DV that exists but cannot be read must abort
526
+ # the write, never be treated as empty — silently dropping the
527
+ # carried-forward vector would resurrect previously deleted rows.
514
528
  prev_dv_df = (
515
- _read_parquet_safe(prev_tombstone_path, profiler=profiler)
529
+ _read_parquet_safe(prev_tombstone_path, profiler=profiler, required=True)
516
530
  if prev_tombstone_path else None
517
531
  )
532
+ # The rowid set is consumed only by the idempotency filter below,
533
+ # which runs only when this write actually tombstones rows
534
+ # (overwrite or delete_only). Pure appends tombstone nothing, so
535
+ # skip materialising the whole deletion-vector as a Python set —
536
+ # prev_dv_df is still carried forward into build_tombstone_file.
518
537
  prev_dv_rowids = set()
519
- if prev_dv_df is not None and "__rowid__" in prev_dv_df.columns:
538
+ if (overwrite_columns or delete_only) and prev_dv_df is not None \
539
+ and "__rowid__" in prev_dv_df.columns:
520
540
  prev_dv_rowids = set(prev_dv_df.get_column("__rowid__").to_list())
521
541
 
522
542
  # 1. Identify which existing rows this write deletes/replaces.
@@ -555,38 +575,87 @@ class DataWriter:
555
575
  f"(excluded {len(prev_dv_rowids)} row(s) already in the deletion-vector)"
556
576
  ))
557
577
 
558
- # 2. Write the incoming rows as a new file (insert/upsert side).
559
- # delete_only carries only predicate columns — nothing to insert.
560
- if not delete_only and dataframe.height > 0:
578
+ # 2. + 3. Write the incoming rows as a new data file (insert/
579
+ # upsert side) AND carry-forward/extend the deletion-vector
580
+ # tombstone file. These two object-store PUTs are independent:
581
+ # neither reads the other's output and they write to disjoint
582
+ # dirs (data/ vs tombstone/), so they run concurrently to
583
+ # overlap the two round-trips. delete_only carries only
584
+ # predicate columns → nothing to insert. No new deletes →
585
+ # build_tombstone reuses the previous file (combined_df=None).
586
+ #
587
+ # Profiler is NOT thread-safe, so each branch records into its
588
+ # own sub-profiler which the parent merges after the join;
589
+ # each branch also measures its own wall time so the per-phase
590
+ # monitoring timings stay meaningful despite the overlap.
591
+ # Footers of files written via the write_bytes path are captured
592
+ # in footer_md_cache so stats extraction (step 6) reuses them
593
+ # instead of re-downloading each freshly-written file.
594
+ footer_md_cache = {}
595
+ tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
596
+ do_insert = (not delete_only and dataframe.height > 0)
597
+
598
+ def _write_data_branch():
599
+ sub = Profiler()
600
+ t = time.perf_counter()
561
601
  write_parquet_and_collect_resources(
562
602
  write_df=dataframe,
563
603
  overwrite_columns=[],
564
604
  data_dir=simple_table.data_dir,
565
605
  new_resources=new_resources,
566
606
  compression_level=compression_level,
567
- profiler=profiler,
607
+ profiler=sub,
608
+ footer_md_out=footer_md_cache,
568
609
  )
610
+ return sub, time.perf_counter() - t
611
+
612
+ def _write_tombstone_branch():
613
+ sub = Profiler()
614
+ t = time.perf_counter()
615
+ tp, cdf = build_tombstone_file(
616
+ tombstone_dir=tombstone_dir,
617
+ prev_tombstone_path=prev_tombstone_path,
618
+ new_pairs=new_delete_pairs,
619
+ compression_level=compression_level,
620
+ profiler=sub,
621
+ prev_df=prev_dv_df,
622
+ )
623
+ return tp, cdf, sub, time.perf_counter() - t
624
+
625
+ if do_insert:
626
+ with ThreadPoolExecutor(max_workers=2) as _ex:
627
+ _f_data = _ex.submit(_write_data_branch)
628
+ _f_tomb = _ex.submit(_write_tombstone_branch)
629
+ # .result() re-raises in the parent: a failure in either
630
+ # PUT aborts the write before any snapshot commit, exactly
631
+ # as the former sequential path did (an orphaned immutable
632
+ # file no snapshot references is harmless garbage).
633
+ data_sub, data_secs = _f_data.result()
634
+ tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
635
+ _f_tomb.result()
636
+ )
637
+ profiler.merge(data_sub)
638
+ profiler.merge(tomb_sub)
569
639
  inserted = dataframe.height
570
640
  else:
641
+ tombstone_path, combined_tombstone_df, tomb_sub, tomb_secs = (
642
+ _write_tombstone_branch()
643
+ )
644
+ profiler.merge(tomb_sub)
645
+ data_secs = 0.0
571
646
  inserted = 0
572
- mark("write_parquet")
647
+
648
+ # Assign the two per-phase timings from each branch's own measured
649
+ # wall time (they overlapped, so the serial mark() deltas would
650
+ # misattribute the time), then advance the mark() baseline.
651
+ timings["write_parquet"] = data_secs
652
+ timings["build_tombstone"] = tomb_secs
653
+ t_last = time.time()
573
654
  logger.debug(lp(
574
655
  f"step[write]: appended {inserted} incoming row(s) as {len(new_resources)} "
575
656
  f"new immutable file(s) (no existing data file rewritten)"
576
657
  ))
577
658
 
578
- # 3. Carry forward + extend the deletion-vector tombstone file.
579
- # No new deletes → reuse the previous file (combined_df=None).
580
- tombstone_dir = os.path.join(simple_table.simple_dir, "tombstone")
581
- tombstone_path, combined_tombstone_df = build_tombstone_file(
582
- tombstone_dir=tombstone_dir,
583
- prev_tombstone_path=prev_tombstone_path,
584
- new_pairs=new_delete_pairs,
585
- compression_level=compression_level,
586
- profiler=profiler,
587
- prev_df=prev_dv_df,
588
- )
589
-
590
659
  # Track the live deletion-vector row count so meta reads can
591
660
  # deduct dead rows from the physical resource row totals.
592
661
  # New deletes → combined_tombstone_df is the full deduped DV
@@ -596,7 +665,6 @@ class DataWriter:
596
665
  if combined_tombstone_df is not None
597
666
  else int(last_simple_table.get("tombstone_rows", 0) or 0)
598
667
  )
599
- mark("build_tombstone")
600
668
  logger.debug(lp(
601
669
  f"step[tombstone]: deletion-vector now {tombstone_rows} row(s) "
602
670
  f"({'rewritten' if combined_tombstone_df is not None else 'carried forward unchanged'})"
@@ -745,7 +813,9 @@ class DataWriter:
745
813
  r.get("file") for r in new_resources
746
814
  if isinstance(r, dict) and r.get("file")
747
815
  ]
748
- new_stats_rows = extract_stats_rows(new_data_files, profiler=profiler)
816
+ new_stats_rows = extract_stats_rows(
817
+ new_data_files, profiler=profiler, footer_md_cache=footer_md_cache
818
+ )
749
819
  stats_path, combined_stats_df = build_stats_file(
750
820
  stats_dir=stats_dir,
751
821
  prev_stats_path=last_simple_table.get("stats_file"),
@@ -1181,8 +1251,17 @@ class DataWriter:
1181
1251
  # the *write* path; compact() is explicit maintenance and always
1182
1252
  # consumes the vector.
1183
1253
  tombstone_path = last_simple_table.get("tombstone")
1254
+ # required=True: a DV that exists but cannot be read must abort the
1255
+ # compaction, never be treated as empty. A swallowed read here would
1256
+ # set should_run_tombstones=False, skipping both Phase A and the
1257
+ # pointer-clear below, so Phase B would carry the dead rows into the
1258
+ # new file while the vector kept pointing at the sunset __file__ —
1259
+ # leaving them permanently unreclaimable. Failing loud leaves the
1260
+ # prior snapshot + vector intact for a retry, and matches the
1261
+ # write-path carry-forward read (required=True) above.
1184
1262
  tombstone_df = (
1185
- _read_parquet_safe(tombstone_path) if tombstone_path else None
1263
+ _read_parquet_safe(tombstone_path, required=True)
1264
+ if tombstone_path else None
1186
1265
  )
1187
1266
  tombstone_rows = (
1188
1267
  tombstone_df.height if tombstone_df is not None else 0
@@ -1246,6 +1325,24 @@ class DataWriter:
1246
1325
  r for r in (list(tomb_new_resources) + list(small_new_resources))
1247
1326
  if r.get("file") not in all_sunset
1248
1327
  ]
1328
+ # ``all_new_resources`` is the full set of files written by THIS
1329
+ # compaction; it feeds stats extraction, the schema model_df and the
1330
+ # result metrics below, all of which need every new file.
1331
+ #
1332
+ # For ``simple_table.update`` it must NOT be reused verbatim, though:
1333
+ # Phase A's outputs were already spliced into
1334
+ # ``last_simple_table["resources"]`` (the in-memory baseline that
1335
+ # ``update`` starts from) right after Phase A ran. ``update`` does
1336
+ # ``(baseline - sunset) + new_resources`` with no dedup, so any
1337
+ # Phase-A output that Phase B did NOT consume (left un-sunset because
1338
+ # it exceeded the ``small_only`` threshold, or its read failed) would
1339
+ # be counted once from the baseline AND once from new_resources —
1340
+ # i.e. the same file listed twice in the new snapshot. Hand ``update``
1341
+ # only Phase B's brand-new files, which are the only resources genuinely
1342
+ # absent from that baseline.
1343
+ update_new_resources = [
1344
+ r for r in small_new_resources if r.get("file") not in all_sunset
1345
+ ]
1249
1346
  result["files_compacted"] = considered
1250
1347
  result["new_resources"] = len(all_new_resources)
1251
1348
  result["sunset_files"] = len(all_sunset)
@@ -1338,7 +1435,7 @@ class DataWriter:
1338
1435
  )
1339
1436
 
1340
1437
  new_snapshot_dict, new_snapshot_path = simple_table.update(
1341
- all_new_resources,
1438
+ update_new_resources,
1342
1439
  all_sunset,
1343
1440
  model_df,
1344
1441
  last_snapshot=last_simple_table,
@@ -731,12 +731,66 @@ def new_duckdb_connection(
731
731
  purely local scans.
732
732
  """
733
733
  con = duckdb.connect()
734
- init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
735
- if for_paths and any("://" in str(p) for p in for_paths):
734
+ try:
735
+ init_connection(con, temp_dir=temp_dir, memory_limit=memory_limit)
736
+ if for_paths and any("://" in str(p) for p in for_paths):
737
+ configure_httpfs_and_s3(con, for_paths)
738
+ except Exception:
739
+ # Don't leak the half-initialised connection if a pragma / httpfs load
740
+ # raises; re-raise so callers still fall back exactly as before.
741
+ con.close()
742
+ raise
743
+ return con
744
+
745
+
746
+ # Thread-local pool for the write-side probe connection. DuckDB connections are
747
+ # NOT thread-safe, so each thread keeps its own; reusing it amortises the
748
+ # ~150 ms init/warmup across writes on the same thread — the same reason the
749
+ # read executors hold a persistent connection.
750
+ _probe_pool = threading.local()
751
+
752
+
753
def get_pooled_duckdb_connection(
    temp_dir: str,
    for_paths: Optional[List[str]] = None,
    memory_limit: str = "1GB",
) -> duckdb.DuckDBPyConnection:
    """Hand back the calling thread's reusable probe connection.

    The connection lives in the thread-local ``_probe_pool`` slot, so every
    thread gets (and keeps) its own instance.

    Cold path (no connection stored for this thread yet): the connection is
    built by ``new_duckdb_connection`` with the given ``temp_dir``,
    ``for_paths`` and ``memory_limit``, then stored in the pool.

    Warm path (a connection is already stored): the stored connection is
    returned as-is, except that ``configure_httpfs_and_s3`` is re-applied
    whenever any entry of ``for_paths`` contains ``"://"``. This lets a
    connection first built for local paths serve a later remote probe.
    ``temp_dir`` and ``memory_limit`` are NOT consulted on this path; the
    values from the cold build stay in effect.

    NOTE(review): the original author states ``configure_httpfs_and_s3``
    re-reads the environment on each call and is idempotent; it is defined
    elsewhere, so confirm there before relying on it.

    Args:
        temp_dir: Temp directory forwarded to the cold build.
        for_paths: Paths the caller intends to scan; only inspected for a
            ``"://"`` marker on the warm path.
        memory_limit: Memory limit forwarded to the cold build.

    Returns:
        The thread's pooled DuckDB connection.
    """
    pooled = getattr(_probe_pool, "con", None)

    if pooled is not None:
        # Warm hit: only remote paths need extra (re)configuration.
        if for_paths and any("://" in str(path) for path in for_paths):
            configure_httpfs_and_s3(pooled, for_paths)
        return pooled

    # Cold miss: build through the shared factory, then remember it.
    fresh = new_duckdb_connection(
        temp_dir=temp_dir, for_paths=for_paths, memory_limit=memory_limit
    )
    _probe_pool.con = fresh
    return fresh
738
776
 
739
777
 
778
def reset_pooled_duckdb_connections() -> None:
    """Evict and close the probe connection pooled for the calling thread.

    Does nothing when the thread has no pooled connection. Only the calling
    thread's slot in ``_probe_pool`` is touched.

    The slot is emptied *before* ``close()`` is attempted, and any exception
    raised by ``close()`` is deliberately swallowed, so the thread can always
    build a fresh connection on its next request.
    """
    pooled = getattr(_probe_pool, "con", None)
    if pooled is None:
        return

    # Drop the reference first so a failing close cannot leave a dead
    # connection parked in the pool.
    _probe_pool.con = None
    try:
        pooled.close()
    except Exception:
        # Best-effort teardown: the slot is already cleared.
        pass
792
+
793
+
740
794
  def apply_runtime_pragmas(con: duckdb.DuckDBPyConnection, cfg) -> None:
741
795
  """Re-apply the session-settable DuckDB pragmas from a live engine config.
742
796
 
@@ -86,6 +86,21 @@ def _mock_redis_catalog():
86
86
  yield
87
87
 
88
88
 
89
@pytest.fixture(autouse=True)
def _reset_probe_pool():
    """Give every test a cold write-probe connection pool, and clean up after.

    The write probe reuses a pooled connection across writes. Tests that
    count ``new_duckdb_connection`` builds therefore need an empty pool on
    entry, and the reset after the test stops a connection from leaking into
    the next one.
    """
    # Imported inside the fixture, as in the original.
    from supertable.engine.engine_common import (
        reset_pooled_duckdb_connections as _drop_pooled_connection,
    )

    _drop_pooled_connection()  # setup: start cold
    yield
    _drop_pooled_connection()  # teardown: leave nothing behind
102
+
103
+
89
104
  @pytest.fixture()
90
105
  def duckdb_con():
91
106
  """Provide a real in-memory DuckDB connection, closed after each test."""
@@ -587,6 +587,38 @@ class TestReadWriteDuckDBParity:
587
587
  # for_paths forwarded so httpfs is loaded for remote scans.
588
588
  assert "for_paths" in calls[0][1]
589
589
 
590
def test_probe_reuses_pooled_connection(self, tmp_path, monkeypatch):
    """Two probes on one thread must build the DuckDB connection only once.

    The first (cold) probe pays the connection build; the second (warm)
    probe is expected to reuse the pooled connection, so the wrapped
    ``new_duckdb_connection`` is recorded exactly one time.
    """
    import polars
    from supertable import processing as _processing

    monkeypatch.setattr(_processing, "_get_storage", lambda: object())

    parquet_path = str(tmp_path / "f1.parquet")
    polars.DataFrame(
        {"__rowid__": [10, 20], "id": [1, 2]}
    ).write_parquet(parquet_path)

    # Spy on the factory: record every build, then delegate to the real one.
    build_calls = []
    original_builder = _engine_common.new_duckdb_connection

    def _recording_builder(*args, **kwargs):
        build_calls.append((args, kwargs))
        return original_builder(*args, **kwargs)

    monkeypatch.setattr(
        _engine_common, "new_duckdb_connection", _recording_builder
    )

    # Cold probe followed by warm probe, with identical arguments.
    for _ in range(2):
        outcome = _processing._duckdb_probe_overlap_matches(
            overlap_true_files=[(parquet_path, 0)],
            overwrite_columns=["id"],
            newer_than_col=None,
            incoming_keys=polars.DataFrame({"id": [2]}),
        )
        assert outcome is not None

    # Built on the cold probe, reused on the warm one.
    assert len(build_calls) == 1
621
+
590
622
  def test_probe_matches_rows_on_local_parquet(self, tmp_path, monkeypatch):
591
623
  import polars
592
624
  from supertable import processing as _processing