supertable 2.3.1__tar.gz → 2.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. {supertable-2.3.1/supertable.egg-info → supertable-2.3.3}/PKG-INFO +1 -1
  2. {supertable-2.3.1 → supertable-2.3.3}/pyproject.toml +1 -1
  3. {supertable-2.3.1 → supertable-2.3.3}/setup.py +1 -1
  4. {supertable-2.3.1 → supertable-2.3.3}/supertable/__init__.py +1 -1
  5. {supertable-2.3.1 → supertable-2.3.3}/supertable/data_writer.py +154 -77
  6. {supertable-2.3.1 → supertable-2.3.3}/supertable/processing.py +300 -0
  7. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_data_writer.py +343 -66
  8. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_data_writer_comprehensive.py +34 -35
  9. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_processing_compact_resources.py +93 -0
  10. supertable-2.3.3/supertable/tests/test_resolve_overwrite_writes.py +239 -0
  11. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_supertable_all.py +18 -18
  12. {supertable-2.3.1 → supertable-2.3.3/supertable.egg-info}/PKG-INFO +1 -1
  13. {supertable-2.3.1 → supertable-2.3.3}/supertable.egg-info/SOURCES.txt +1 -0
  14. {supertable-2.3.1 → supertable-2.3.3}/LICENSE +0 -0
  15. {supertable-2.3.1 → supertable-2.3.3}/README.md +0 -0
  16. {supertable-2.3.1 → supertable-2.3.3}/requirements.txt +0 -0
  17. {supertable-2.3.1 → supertable-2.3.3}/setup.cfg +0 -0
  18. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/__init__.py +0 -0
  19. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/admin.py +0 -0
  20. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/chain.py +0 -0
  21. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/consumers.py +0 -0
  22. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/crypto.py +0 -0
  23. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/events.py +0 -0
  24. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/export.py +0 -0
  25. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/logger.py +0 -0
  26. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/middleware.py +0 -0
  27. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/reader.py +0 -0
  28. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/retention.py +0 -0
  29. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/tests/__init__.py +0 -0
  30. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/tests/test_chain.py +0 -0
  31. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/tests/test_crypto.py +0 -0
  32. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/tests/test_emit.py +0 -0
  33. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/tests/test_events.py +0 -0
  34. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/tests/test_retention.py +0 -0
  35. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/writer_parquet.py +0 -0
  36. {supertable-2.3.1 → supertable-2.3.3}/supertable/audit/writer_redis.py +0 -0
  37. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/__init__.py +0 -0
  38. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/defaults.py +0 -0
  39. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/homedir.py +0 -0
  40. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/settings.py +0 -0
  41. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/tests/__init__.py +0 -0
  42. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/tests/test_defaults.py +0 -0
  43. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/tests/test_homedir.py +0 -0
  44. {supertable-2.3.1 → supertable-2.3.3}/supertable/config/tests/test_settings.py +0 -0
  45. {supertable-2.3.1 → supertable-2.3.3}/supertable/data_classes.py +0 -0
  46. {supertable-2.3.1 → supertable-2.3.3}/supertable/data_reader.py +0 -0
  47. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/__init__.py +0 -0
  48. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/__init__.py +0 -0
  49. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/__main__.py +0 -0
  50. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/check_filter_builder.py +0 -0
  51. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/controller.py +0 -0
  52. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
  53. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/defaults.py +0 -0
  54. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/dummy_data.py +0 -0
  55. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/read_parquet_header.py +0 -0
  56. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
  57. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
  58. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
  59. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
  60. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
  61. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
  62. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
  63. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
  64. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
  65. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
  66. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
  67. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
  68. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
  69. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
  70. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
  71. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
  72. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
  73. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
  74. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
  75. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
  76. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
  77. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
  78. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
  79. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
  80. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/webshop/__init__.py +0 -0
  81. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/webshop/core.py +0 -0
  82. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/webshop/defaults.py +0 -0
  83. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/webshop/generate.py +0 -0
  84. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/webshop/load.py +0 -0
  85. {supertable-2.3.1 → supertable-2.3.3}/supertable/demo/webshop/topup.py +0 -0
  86. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/__init__.py +0 -0
  87. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/data_estimator.py +0 -0
  88. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/duckdb_lite.py +0 -0
  89. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/duckdb_pro.py +0 -0
  90. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/engine_common.py +0 -0
  91. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/engine_config.py +0 -0
  92. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/engine_enum.py +0 -0
  93. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/executor.py +0 -0
  94. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/plan_stats.py +0 -0
  95. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/spark_thrift.py +0 -0
  96. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/tests/__init__.py +0 -0
  97. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/tests/conftest.py +0 -0
  98. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/tests/test_engine.py +0 -0
  99. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/tests/test_engine_config.py +0 -0
  100. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/tests/test_engine_routing.py +0 -0
  101. {supertable-2.3.1 → supertable-2.3.3}/supertable/engine/tests/test_engine_spill.py +0 -0
  102. {supertable-2.3.1 → supertable-2.3.3}/supertable/errors.py +0 -0
  103. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/__init__.py +0 -0
  104. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/benchmarks/__init__.py +0 -0
  105. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
  106. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
  107. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
  108. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/file_lock.py +0 -0
  109. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/redis_lock.py +0 -0
  110. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/tests/__init__.py +0 -0
  111. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/tests/test_file_lock.py +0 -0
  112. {supertable-2.3.1 → supertable-2.3.3}/supertable/locking/tests/test_redis_lock.py +0 -0
  113. {supertable-2.3.1 → supertable-2.3.3}/supertable/logging.py +0 -0
  114. {supertable-2.3.1 → supertable-2.3.3}/supertable/meta_reader.py +0 -0
  115. {supertable-2.3.1 → supertable-2.3.3}/supertable/mirroring/__init__.py +0 -0
  116. {supertable-2.3.1 → supertable-2.3.3}/supertable/mirroring/mirror_delta.py +0 -0
  117. {supertable-2.3.1 → supertable-2.3.3}/supertable/mirroring/mirror_formats.py +0 -0
  118. {supertable-2.3.1 → supertable-2.3.3}/supertable/mirroring/mirror_iceberg.py +0 -0
  119. {supertable-2.3.1 → supertable-2.3.3}/supertable/mirroring/mirror_parquet.py +0 -0
  120. {supertable-2.3.1 → supertable-2.3.3}/supertable/monitoring/__init__.py +0 -0
  121. {supertable-2.3.1 → supertable-2.3.3}/supertable/monitoring/partitions.py +0 -0
  122. {supertable-2.3.1 → supertable-2.3.3}/supertable/monitoring_writer.py +0 -0
  123. {supertable-2.3.1 → supertable-2.3.3}/supertable/plan_extender.py +0 -0
  124. {supertable-2.3.1 → supertable-2.3.3}/supertable/query_plan_manager.py +0 -0
  125. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/__init__.py +0 -0
  126. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/access_control.py +0 -0
  127. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/filter_builder.py +0 -0
  128. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/permissions.py +0 -0
  129. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/role_manager.py +0 -0
  130. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/row_column_security.py +0 -0
  131. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/tests/test_filter_builder.py +0 -0
  132. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/tests/test_rbac.py +0 -0
  133. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
  134. {supertable-2.3.1 → supertable-2.3.3}/supertable/rbac/user_manager.py +0 -0
  135. {supertable-2.3.1 → supertable-2.3.3}/supertable/redis_catalog.py +0 -0
  136. {supertable-2.3.1 → supertable-2.3.3}/supertable/redis_connector.py +0 -0
  137. {supertable-2.3.1 → supertable-2.3.3}/supertable/redis_infra.py +0 -0
  138. {supertable-2.3.1 → supertable-2.3.3}/supertable/redis_keys.py +0 -0
  139. {supertable-2.3.1 → supertable-2.3.3}/supertable/simple_table.py +0 -0
  140. {supertable-2.3.1 → supertable-2.3.3}/supertable/staging_area.py +0 -0
  141. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/__init__.py +0 -0
  142. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/azure_storage.py +0 -0
  143. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/gcp_storage.py +0 -0
  144. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/local_storage.py +0 -0
  145. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/minio_storage.py +0 -0
  146. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/s3_storage.py +0 -0
  147. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/storage_factory.py +0 -0
  148. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/storage_interface.py +0 -0
  149. {supertable-2.3.1 → supertable-2.3.3}/supertable/storage/tests/test_storage.py +0 -0
  150. {supertable-2.3.1 → supertable-2.3.3}/supertable/super_pipe.py +0 -0
  151. {supertable-2.3.1 → supertable-2.3.3}/supertable/super_table.py +0 -0
  152. {supertable-2.3.1 → supertable-2.3.3}/supertable/system_query.py +0 -0
  153. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/__init__.py +0 -0
  154. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_align_to_schema_fix.py +0 -0
  155. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_create_if_missing.py +0 -0
  156. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_data_reader.py +0 -0
  157. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_data_reader_preflight.py +0 -0
  158. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_data_writer_compact.py +0 -0
  159. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_errors.py +0 -0
  160. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_meta_reader.py +0 -0
  161. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_monitoring_partitions.py +0 -0
  162. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_monitoring_sink_guard.py +0 -0
  163. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_newer_than.py +0 -0
  164. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_parquet_statistics.py +0 -0
  165. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_processing.py +0 -0
  166. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_processing_stats.py +0 -0
  167. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_query_sql.py +0 -0
  168. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_read_pruning_differential.py +0 -0
  169. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_read_pruning_integration.py +0 -0
  170. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_redis_key_prefix.py +0 -0
  171. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_simple_table.py +0 -0
  172. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_stats_cache.py +0 -0
  173. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_stats_pruning.py +0 -0
  174. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_stats_schema_snapshot.py +0 -0
  175. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_super_table.py +0 -0
  176. {supertable-2.3.1 → supertable-2.3.3}/supertable/tests/test_system_query.py +0 -0
  177. {supertable-2.3.1 → supertable-2.3.3}/supertable/utils/__init__.py +0 -0
  178. {supertable-2.3.1 → supertable-2.3.3}/supertable/utils/helper.py +0 -0
  179. {supertable-2.3.1 → supertable-2.3.3}/supertable/utils/profiler.py +0 -0
  180. {supertable-2.3.1 → supertable-2.3.3}/supertable/utils/sql_parser.py +0 -0
  181. {supertable-2.3.1 → supertable-2.3.3}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
  182. {supertable-2.3.1 → supertable-2.3.3}/supertable/utils/timer.py +0 -0
  183. {supertable-2.3.1 → supertable-2.3.3}/supertable.egg-info/dependency_links.txt +0 -0
  184. {supertable-2.3.1 → supertable-2.3.3}/supertable.egg-info/entry_points.txt +0 -0
  185. {supertable-2.3.1 → supertable-2.3.3}/supertable.egg-info/requires.txt +0 -0
  186. {supertable-2.3.1 → supertable-2.3.3}/supertable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.1
3
+ Version: 2.3.3
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "supertable"
7
- version = "2.3.1"
7
+ version = "2.3.3"
8
8
  description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
19
19
 
20
20
  setup(
21
21
  name="supertable",
22
- version="2.3.1",
22
+ version="2.3.3",
23
23
  description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
24
24
  long_description=long_description,
25
25
  long_description_content_type="text/markdown",
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
25
25
  project documentation for the full API surface.
26
26
  """
27
27
 
28
- __version__ = "2.3.1"
28
+ __version__ = "2.3.3"
29
29
 
30
30
  # Re-export the core public surface so users can do ``from supertable import …``
31
31
  # instead of remembering submodule paths.
@@ -23,8 +23,7 @@ from supertable.utils.timer import Timer
23
23
  from supertable.utils.profiler import Profiler
24
24
  from supertable.processing import (
25
25
  find_overlapping_files,
26
- filter_stale_incoming_rows,
27
- identify_deleted_rowids,
26
+ resolve_overwrite_writes,
28
27
  identify_all_rowids,
29
28
  build_tombstone_file,
30
29
  build_stats_file,
@@ -36,6 +35,7 @@ from supertable.processing import (
36
35
  write_parquet_and_collect_resources,
37
36
  compact_resources,
38
37
  compact_tombstones,
38
+ should_compact_small_files,
39
39
  _max_tombstone_rows,
40
40
  _read_parquet_safe,
41
41
  )
@@ -398,60 +398,67 @@ class DataWriter:
398
398
  logger.info(lp(f"stats pruning: skipped {pruned}/{before} candidate files"))
399
399
  mark("stats_prune")
400
400
 
401
- # File cache: populated by newer-than filtering, reused by process step
402
- # to avoid double-reading overlapping parquet files from storage.
401
+ # File cache: used only by delete_only's identify_all_rowids below.
403
402
  file_cache = {}
404
403
 
405
- # --- Newer-than filtering (skip stale/replayed rows) ---------------
406
- if newer_than and overwrite_columns:
404
+ # --- Overwrite resolution: stale-row filtering + delete-pair -------
405
+ # identification in one DuckDB-pushdown probe over the overlapping
406
+ # files (column projection, row-group skipping, ranged GETs, native
407
+ # null-safe SEMI JOIN) instead of full-file polars reads. Returns
408
+ # the stale-filtered incoming df plus the (file, __rowid__) delete
409
+ # pairs derived from the surviving keys; falls back to the polars
410
+ # oracle on any probe/derive failure. delete_only (no
411
+ # overwrite_columns) is handled separately in the deletion block.
412
+ resolved_delete_pairs = None
413
+ if overwrite_columns:
407
414
  pre_filter_count = dataframe.height
408
- dataframe = filter_stale_incoming_rows(
415
+ dataframe, resolved_delete_pairs = resolve_overwrite_writes(
409
416
  incoming_df=dataframe,
410
417
  overlapping_files=overlapping_files,
411
418
  overwrite_columns=overwrite_columns,
412
419
  newer_than_col=newer_than,
413
- file_cache=file_cache,
414
420
  profiler=profiler,
415
421
  )
416
- skipped = pre_filter_count - dataframe.height
417
- if skipped > 0:
418
- logger.info(lp(f"newer_than={newer_than}: skipped {skipped}/{pre_filter_count} stale rows"))
419
- if dataframe.height == 0:
420
- logger.info(lp("newer_than: all incoming rows are stale — skipping write"))
421
- mark("newer_than")
422
- total_columns = incoming_columns
423
- result_tuple = (total_columns, 0, 0, 0)
424
- stats_payload = {
425
- "query_id": qid,
426
- "recorded_at": datetime.now(timezone.utc).isoformat(),
427
- "organization": self.super_table.organization,
428
- "super_name": self.super_table.super_name,
429
- "role_name": role_name,
430
- "table_name": simple_name,
431
- "overwrite_columns": overwrite_columns,
432
- "compression_level": compression_level,
433
- "newer_than": newer_than,
434
- "delete_only": delete_only,
435
- "incoming_rows": incoming_rows,
436
- "incoming_columns": incoming_columns,
437
- "inserted": 0,
438
- "deleted": 0,
439
- "total_rows": 0,
440
- "total_columns": total_columns,
441
- "new_resources": 0,
442
- "sunset_files": 0,
443
- "skipped_stale": skipped,
444
- "lineage": _safe_json(lineage or {}),
445
- "duration": round(time.time() - t0, 6),
446
- "timings": profiler.emit_timings(),
447
- "counts": profiler.emit_counts(),
448
- }
449
- # Don't return here — fall through to finally (lock release)
450
- # and the post-finally monitoring block. Returning inside the
451
- # try block would either skip monitoring or run it while the
452
- # Redis data lock is still held.
453
- else:
454
- mark("newer_than")
422
+ if newer_than:
423
+ skipped = pre_filter_count - dataframe.height
424
+ if skipped > 0:
425
+ logger.info(lp(f"newer_than={newer_than}: skipped {skipped}/{pre_filter_count} stale rows"))
426
+ if dataframe.height == 0:
427
+ logger.info(lp("newer_than: all incoming rows are stale — skipping write"))
428
+ mark("newer_than")
429
+ total_columns = incoming_columns
430
+ result_tuple = (total_columns, 0, 0, 0)
431
+ stats_payload = {
432
+ "query_id": qid,
433
+ "recorded_at": datetime.now(timezone.utc).isoformat(),
434
+ "organization": self.super_table.organization,
435
+ "super_name": self.super_table.super_name,
436
+ "role_name": role_name,
437
+ "table_name": simple_name,
438
+ "overwrite_columns": overwrite_columns,
439
+ "compression_level": compression_level,
440
+ "newer_than": newer_than,
441
+ "delete_only": delete_only,
442
+ "incoming_rows": incoming_rows,
443
+ "incoming_columns": incoming_columns,
444
+ "inserted": 0,
445
+ "deleted": 0,
446
+ "total_rows": 0,
447
+ "total_columns": total_columns,
448
+ "new_resources": 0,
449
+ "sunset_files": 0,
450
+ "skipped_stale": skipped,
451
+ "lineage": _safe_json(lineage or {}),
452
+ "duration": round(time.time() - t0, 6),
453
+ "timings": profiler.emit_timings(),
454
+ "counts": profiler.emit_counts(),
455
+ }
456
+ # Don't return here — fall through to finally (lock release)
457
+ # and the post-finally monitoring block. Returning inside the
458
+ # try block would either skip monitoring or run it while the
459
+ # Redis data lock is still held.
460
+ else:
461
+ mark("newer_than")
455
462
 
456
463
  # --- Deletion-vector (tombstone) logic ----------------------------
457
464
  # Merge-on-read model: every write tombstones the __rowid__s of the
@@ -467,16 +474,12 @@ class DataWriter:
467
474
 
468
475
  # 1. Identify which existing rows this write deletes/replaces.
469
476
  # overwrite_columns drives the anti-join key (delete + upsert);
470
- # pure appends (no overwrite_columns) tombstone nothing.
477
+ # pure appends (no overwrite_columns) tombstone nothing. The
478
+ # pairs were already derived (from the surviving keys) by the
479
+ # resolve_overwrite_writes probe above.
471
480
  new_delete_pairs = []
472
481
  if overwrite_columns:
473
- new_delete_pairs = identify_deleted_rowids(
474
- dataframe,
475
- overlapping_files,
476
- overwrite_columns,
477
- file_cache=file_cache,
478
- profiler=profiler,
479
- )
482
+ new_delete_pairs = resolved_delete_pairs or []
480
483
  elif delete_only:
481
484
  # delete-all: no overwrite_columns → tombstone every row.
482
485
  new_delete_pairs = identify_all_rowids(
@@ -525,28 +528,52 @@ class DataWriter:
525
528
  )
526
529
  mark("build_tombstone")
527
530
 
528
- # 4. Threshold compaction: physically drop dead rows once the
529
- # deletion-vector grows past max_tombstone_rows, then clear it.
530
- if (
531
+ # 4. Threshold compaction (two triggers, same physical step):
532
+ # (a) the deletion-vector grew past max_tombstone_rows, or
533
+ # (b) the small files tripped the auto-compaction gate.
534
+ # Both must FIRST physically drop tombstoned rows (Phase A)
535
+ # and only THEN merge small files (Phase B): compact_resources
536
+ # rewrites data files WITHOUT consulting the deletion-vector,
537
+ # so sunsetting a vector-referenced file would orphan its dead
538
+ # rows (hidden on read, never reclaimable). Draining first
539
+ # guarantees Phase B only ever sees vector-free survivors.
540
+ post_write_resources = (
541
+ (last_simple_table.get("resources") or []) + new_resources
542
+ )
543
+ compaction_gate = should_compact_small_files(
544
+ post_write_resources, table_config
545
+ )
546
+ tombstone_threshold_hit = (
531
547
  combined_tombstone_df is not None
532
548
  and combined_tombstone_df.height >= _max_tombstone_rows(table_config)
533
- ):
534
- removed, compact_new, compact_sunset = compact_tombstones(
535
- snapshot=last_simple_table,
536
- tombstone_df=combined_tombstone_df,
537
- data_dir=simple_table.data_dir,
538
- compression_level=compression_level,
539
- table_config=table_config,
540
- profiler=profiler,
541
- )
542
- new_resources.extend(compact_new)
543
- sunset_files |= compact_sunset
544
- tombstone_path = None # deletion-vector fully consumed
545
- tombstone_rows = 0
546
- logger.info(lp(
547
- f"tombstone compaction removed {removed} rows "
548
- f"from {len(compact_sunset)} files"
549
- ))
549
+ )
550
+
551
+ # Phase A — drain the deletion-vector when either trigger fires
552
+ # and a vector is actually live (freshly built this write OR
553
+ # carried forward from a prior one).
554
+ if tombstone_threshold_hit or compaction_gate:
555
+ dv_to_drain = combined_tombstone_df
556
+ if dv_to_drain is None and tombstone_path:
557
+ # Pure carry-forward: load the live vector so the merge
558
+ # below never sunsets a file it still references.
559
+ dv_to_drain = _read_parquet_safe(tombstone_path, profiler=profiler)
560
+ if dv_to_drain is not None and dv_to_drain.height > 0:
561
+ removed, tomb_new, tomb_sunset = compact_tombstones(
562
+ snapshot=last_simple_table,
563
+ tombstone_df=dv_to_drain,
564
+ data_dir=simple_table.data_dir,
565
+ compression_level=compression_level,
566
+ table_config=table_config,
567
+ profiler=profiler,
568
+ )
569
+ new_resources.extend(tomb_new)
570
+ sunset_files |= tomb_sunset
571
+ tombstone_path = None # deletion-vector fully consumed
572
+ tombstone_rows = 0
573
+ logger.info(lp(
574
+ f"tombstone compaction removed {removed} rows "
575
+ f"from {len(tomb_sunset)} files"
576
+ ))
550
577
 
551
578
  # 5. Pin the (carried-forward / new / cleared) tombstone pointer
552
579
  # and its row count.
@@ -554,6 +581,45 @@ class DataWriter:
554
581
  last_simple_table["tombstone_rows"] = tombstone_rows
555
582
  mark("compact_tombstones")
556
583
 
584
+ # Phase B — auto small-file compaction. Merge the accumulated
585
+ # small files (existing survivors + the file just written) once
586
+ # the gate is open so the file count stays bounded. The vector
587
+ # was drained above, so every surviving file is safe to sunset.
588
+ # Result folds into the SAME snapshot commit below (new_resources
589
+ # / sunset_files feed build_stats and simple_table.update).
590
+ compaction_ran = False
591
+ if compaction_gate:
592
+ live_resources = [
593
+ r for r in (last_simple_table.get("resources") or [])
594
+ if r.get("file") not in sunset_files
595
+ ]
596
+ live_resources += [
597
+ r for r in new_resources if r.get("file") not in sunset_files
598
+ ]
599
+ considered, comp_rows, comp_new, comp_sunset = compact_resources(
600
+ snapshot={"resources": live_resources},
601
+ data_dir=simple_table.data_dir,
602
+ compression_level=compression_level,
603
+ table_config=table_config,
604
+ small_only=True,
605
+ )
606
+ if comp_new or comp_sunset:
607
+ sunset_files |= comp_sunset
608
+ # A file written above (incoming or tombstone survivor)
609
+ # may have been re-merged here; drop any new_resources
610
+ # entry that is now sunset so the snapshot never lists a
611
+ # file as both live and gone.
612
+ new_resources = [
613
+ r for r in (new_resources + comp_new)
614
+ if r.get("file") not in sunset_files
615
+ ]
616
+ compaction_ran = True
617
+ logger.info(lp(
618
+ f"auto-compaction merged {considered} small files "
619
+ f"into {len(comp_new)} file(s) ({comp_rows} rows)"
620
+ ))
621
+ mark("compact_small")
622
+
557
623
  # 6. Carry forward + extend the external column-statistics parquet.
558
624
  # Read the footers of the newly written data files, drop the
559
625
  # rows of any sunset file, and append the new ones. No new
@@ -612,7 +678,18 @@ class DataWriter:
612
678
  # model_df would shrink schema / schemaString to that partial
613
679
  # shape even though all parquet files still have full schema.
614
680
  # See docs/03_data_model.md "Schema Field Semantics".
615
- schema_model_df = None if delete_only else dataframe
681
+ #
682
+ # When auto-compaction merged files this write, derive the
683
+ # schema from the compacted output instead: a merged file may
684
+ # union in columns from older files that the incoming frame
685
+ # lacks (schema-evolving tables), so `dataframe` would narrow
686
+ # the metadata even though the Parquet is wider.
687
+ if compaction_ran:
688
+ schema_model_df = self._build_compact_model_df(
689
+ new_resources, last_simple_table
690
+ )
691
+ else:
692
+ schema_model_df = None if delete_only else dataframe
616
693
  new_snapshot_dict, new_snapshot_path = simple_table.update(
617
694
  new_resources, sunset_files, schema_model_df,
618
695
  last_snapshot=last_simple_table,
@@ -718,7 +795,7 @@ class DataWriter:
718
795
  f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
719
796
  f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
720
797
  f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
721
- f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
798
+ f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
722
799
  f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
723
800
  f"mirror={timings.get('mirror', 0):.3f} | prepare_monitor={timings.get('prepare_monitor', 0):.3f}"
724
801
  )
@@ -6,6 +6,7 @@ import os
6
6
  import io
7
7
  import time
8
8
  import threading
9
+ import uuid
9
10
  from collections import OrderedDict
10
11
  from datetime import datetime, date, timezone
11
12
  from typing import Dict, List, Set, Tuple, Optional
@@ -291,6 +292,33 @@ def prune_not_overlapping_files_by_threshold(
291
292
  return result
292
293
 
293
294
 
295
def should_compact_small_files(
    resources: List[Dict],
    table_config: Optional[dict] = None,
) -> bool:
    """Decide whether the small files in *resources* warrant auto-compaction.

    Limits come from ``_resolve_limits(table_config)`` as
    ``(max_mem, max_files)``. An entry counts as "small" when it has a truthy
    ``file`` value and its ``file_size`` (a missing/falsy size is read as 0)
    is strictly below ``max_mem``.

    Returns ``True`` when at least one small file exists AND either the number
    of small files is >= ``max_files`` or their summed size is > ``max_mem``.
    A ``None`` or empty *resources* yields ``False``.
    """
    max_mem, max_files = _resolve_limits(table_config)

    small_count = 0
    small_total = 0
    for entry in resources or []:
        if not entry.get("file"):
            continue
        size = int(entry.get("file_size") or 0)
        if size < max_mem:
            small_count += 1
            small_total += size

    if small_count == 0:
        return False
    return small_count >= max_files or small_total > max_mem
320
+
321
+
294
322
  # =========================
295
323
  # Public API: Overlap selection (with compaction triggers)
296
324
  # =========================
@@ -942,6 +970,278 @@ def identify_all_rowids(
942
970
  return pairs
943
971
 
944
972
 
973
+ # =========================
974
+ # Pushdown overwrite resolution (DuckDB probe, polars fallback)
975
+ # =========================
976
+ #
977
+ # The legacy path (``filter_stale_incoming_rows`` + ``identify_deleted_rowids``)
978
+ # reads EVERY overlapping data file FULLY (all columns, all rows) into polars,
979
+ # then group/join over the whole table — cost O(table size), independent of how
980
+ # few rows are actually written. ``resolve_overwrite_writes`` replaces both with
981
+ # ONE column-projected DuckDB ``parquet_scan`` that reads only the key /
982
+ # ``__rowid__`` / newer-than columns and only the rows whose key matches an
983
+ # incoming key (null-safe SEMI JOIN), then derives both results in-memory from
984
+ # that small matched set. The two legacy functions are retained as the exact
985
+ # semantic oracle and the fallback for any environment/schema the probe can't
986
+ # handle.
987
+
988
+
989
def _storage_duckdb_path(storage, key: str) -> str:
    """Translate a storage key into the path string handed to DuckDB.

    * A falsy *key*, or one that already contains ``://``, is returned as-is.
    * If *storage* exposes a callable ``to_duckdb_path``, its result is used
      when it is a non-empty ``str``.
    * In every other case (no such method, ``NotImplementedError``, any other
      exception — logged at debug level — or an empty / non-string result)
      the original *key* is returned unchanged.
    """
    already_resolved = (not key) or ("://" in key)
    if already_resolved:
        return key

    resolver = getattr(storage, "to_duckdb_path", None)
    if not callable(resolver):
        return key

    try:
        candidate = resolver(key)
    except NotImplementedError:
        # Backend explicitly has no DuckDB-addressable form: keep the raw key.
        return key
    except Exception as e:
        logging.debug(f"[write-probe] to_duckdb_path failed for {key}: {e}")
        return key

    if isinstance(candidate, str) and candidate:
        return candidate
    return key
1009
+
1010
+
1011
def _duckdb_probe_overlap_matches(
    overlap_true_files: List[Tuple[str, int]],
    overwrite_columns: List[str],
    newer_than_col: Optional[str],
    incoming_keys: polars.DataFrame,
    profiler: Optional[Profiler] = None,
) -> Optional[polars.DataFrame]:
    """Probe the overlapping files with DuckDB for rows matching incoming keys.

    Builds a single ``parquet_scan`` over all candidate files
    (``union_by_name=TRUE``, ``filename=TRUE``, ``hive_partitioning=FALSE``)
    and ``SEMI JOIN``s it against *incoming_keys* using
    ``IS NOT DISTINCT FROM`` on every overwrite column. Only ``filename``,
    the row-id column, the overwrite columns and (when given)
    *newer_than_col* are selected.

    Returns a polars frame in which DuckDB's ``filename`` has been replaced by
    ``TOMBSTONE_FILE_COL`` holding the original storage key, or ``None`` when:

    * *overlap_true_files* or *overwrite_columns* is empty;
    * ``duckdb`` / the engine helpers cannot be imported;
    * connecting, registering or executing raises;
    * the result has no ``filename`` column, or some returned filename does
      not map back to a storage key.

    ``None`` tells the caller to use its non-DuckDB path instead.
    """
    prof = profiler or get_null_profiler()
    if not (overlap_true_files and overwrite_columns):
        return None

    try:
        import duckdb
        from supertable.engine.engine_common import (
            configure_httpfs_and_s3,
            escape_parquet_path,
            quote_if_needed,
        )
    except Exception as e:
        logging.info(f"[write-probe] duckdb unavailable, using polars path: {e}")
        return None

    storage = _get_storage()
    duck_paths: List[str] = [
        _storage_duckdb_path(storage, file_key)
        for file_key, _size in overlap_true_files
    ]
    # Later entries win on a path collision, same as assigning in a loop.
    path_to_key: Dict[str, str] = dict(
        zip(duck_paths, (file_key for file_key, _size in overlap_true_files))
    )

    select_cols = ["filename", quote_if_needed(ROWID_COL)]
    select_cols.extend(quote_if_needed(c) for c in overwrite_columns)
    if newer_than_col:
        select_cols.append(quote_if_needed(newer_than_col))

    predicates = [
        f"src.{quote_if_needed(c)} IS NOT DISTINCT FROM k.{quote_if_needed(c)}"
        for c in overwrite_columns
    ]
    join_cond = " AND ".join(predicates)

    quoted_files = [f"'{escape_parquet_path(dp)}'" for dp in duck_paths]
    files_sql = ", ".join(quoted_files)
    ik_name = f"__st_ik_{uuid.uuid4().hex}"

    con = None
    try:
        con = duckdb.connect()
        touches_remote = any("://" in dp for dp in duck_paths)
        if touches_remote:
            configure_httpfs_and_s3(con, duck_paths)
        con.register(ik_name, incoming_keys.to_arrow())
        sql = (
            f"SELECT {', '.join(select_cols)} "
            f"FROM parquet_scan([{files_sql}], union_by_name=TRUE, "
            f"filename=TRUE, hive_partitioning=FALSE) AS src "
            f"SEMI JOIN {ik_name} AS k ON {join_cond}"
        )
        with prof.span("io.duckdb_probe"):
            matched = con.execute(sql).pl()
    except Exception as e:
        logging.info(f"[write-probe] probe failed, using polars path: {e}")
        return None
    finally:
        if con is not None:
            # Best-effort teardown: each step is attempted independently and
            # a failure in one must not prevent the other.
            for method_name, call_args in (("unregister", (ik_name,)), ("close", ())):
                try:
                    getattr(con, method_name)(*call_args)
                except Exception:
                    pass

    if matched is None or "filename" not in matched.columns:
        return None

    # Swap DuckDB's ``filename`` (the path passed to the scan) for the
    # original storage key so downstream tombstones carry keys.
    lookup = polars.DataFrame(
        {
            "filename": list(path_to_key.keys()),
            TOMBSTONE_FILE_COL: list(path_to_key.values()),
        }
    )
    resolved = matched.join(lookup, on="filename", how="left").drop("filename")

    unmapped = resolved.get_column(TOMBSTONE_FILE_COL).null_count()
    if unmapped > 0:
        # At least one scanned filename had no matching key: bail out rather
        # than hand back rows whose source file is unknown.
        logging.info("[write-probe] unmapped filename in probe result; using polars path")
        return None

    prof.add("probe_files", len(duck_paths))
    prof.add("probe_rows_matched", int(resolved.height))
    return resolved
1109
+
1110
+
1111
def _align_keys_to_incoming(
    matched: polars.DataFrame,
    incoming_df: polars.DataFrame,
    overwrite_columns: List[str],
    newer_than_col: Optional[str],
) -> polars.DataFrame:
    """Cast key / newer-than columns of *matched* to *incoming_df*'s dtypes.

    For each overwrite column, and then for *newer_than_col* when it is
    truthy, a cast is queued only if the column exists in BOTH frames and the
    two schemas disagree on its dtype. *matched* is returned untouched when
    nothing needs casting; otherwise all casts are applied in one
    ``with_columns`` call. Any error raised by a cast propagates to the
    caller.
    """
    # Keys first, then the optional newer-than column.
    candidates = list(overwrite_columns)
    if newer_than_col:
        candidates.append(newer_than_col)

    casts = [
        polars.col(name).cast(incoming_df.schema[name])
        for name in candidates
        if name in matched.columns
        and name in incoming_df.columns
        and matched.schema[name] != incoming_df.schema[name]
    ]

    if not casts:
        return matched
    return matched.with_columns(casts)
1133
+
1134
+
1135
def _derive_stale_and_deletes(
    incoming_df: polars.DataFrame,
    matched: polars.DataFrame,
    overwrite_columns: List[str],
    newer_than_col: Optional[str],
    profiler: Optional[Profiler] = None,
) -> Tuple[polars.DataFrame, List[Tuple[str, int]]]:
    """Compute the surviving incoming rows and the rows they replace.

    *matched* is first dtype-aligned to *incoming_df* via
    ``_align_keys_to_incoming``.

    Stale filtering runs only when *newer_than_col* is truthy and present in
    the aligned frame: the per-key maximum of that column is left-joined onto
    *incoming_df*, and an incoming row is kept when no maximum was found for
    its key or when its own value is strictly greater than that maximum.
    Otherwise *incoming_df* is used as-is.

    Delete pairs are produced only when the row-id column is present: the
    aligned rows are semi-joined (``nulls_equal=True``) against the distinct
    keys of the surviving incoming rows, rows with a null file or row id are
    dropped, and the remainder is returned as ``(file, int(rowid))`` tuples.

    Returns ``(filtered_incoming_df, delete_pairs)``.
    """
    prof = profiler or get_null_profiler()
    aligned = _align_keys_to_incoming(
        matched, incoming_df, overwrite_columns, newer_than_col
    )

    apply_stale_filter = bool(newer_than_col) and newer_than_col in aligned.columns
    if not apply_stale_filter:
        filtered = incoming_df
    else:
        with prof.span("newer_than.group_agg"):
            newest_existing = polars.col(newer_than_col).max().alias("__existing_max__")
            existing_max = aligned.group_by(overwrite_columns).agg(newest_existing)
        with prof.span("newer_than.join_filter"):
            joined = incoming_df.join(existing_max, on=overwrite_columns, how="left")
            no_prior_value = polars.col("__existing_max__").is_null()
            strictly_newer = polars.col(newer_than_col) > polars.col("__existing_max__")
            filtered = joined.filter(no_prior_value | strictly_newer).drop(
                "__existing_max__"
            )

    pairs: List[Tuple[str, int]] = []
    if ROWID_COL in aligned.columns:
        # Only keys that survived the stale filter may tombstone anything.
        surviving_keys = filtered.select(overwrite_columns).unique()
        with prof.span("delete.semi_join"):
            matched_surviving = aligned.join(
                surviving_keys, on=overwrite_columns, how="semi", nulls_equal=True
            )
        delete_vector = matched_surviving.select(
            [TOMBSTONE_FILE_COL, ROWID_COL]
        ).drop_nulls()
        pairs = [(path, int(rowid)) for path, rowid in delete_vector.iter_rows()]
    prof.add("delete_rows_matched", len(pairs))
    return filtered, pairs
1180
+
1181
+
1182
def resolve_overwrite_writes(
    incoming_df: polars.DataFrame,
    overlapping_files: Set[Tuple[str, bool, int]],
    overwrite_columns: List[str],
    newer_than_col: Optional[str] = None,
    profiler: Optional[Profiler] = None,
) -> Tuple[polars.DataFrame, List[Tuple[str, int]]]:
    """Resolve an overwrite in one step: stale filtering plus delete pairs.

    Returns ``(filtered_incoming_df, delete_pairs)``.

    Short-circuits to ``(incoming_df, [])`` when no entry of
    *overlapping_files* has its overlap flag set, when *overwrite_columns* is
    empty, or when *incoming_df* is missing any overwrite column.

    Otherwise the distinct incoming keys are sent to
    ``_duckdb_probe_overlap_matches``; a non-``None`` result is turned into
    the answer by ``_derive_stale_and_deletes``. If the probe returns ``None``
    or the derivation raises (logged as a warning), the function falls back to
    ``filter_stale_incoming_rows`` (only when *newer_than_col* is truthy)
    followed by ``identify_deleted_rowids``, sharing one ``file_cache`` dict
    between them and recording ``overwrite_resolve_fallback`` on the profiler.
    """
    prof = profiler or get_null_profiler()

    overlap_true = [
        (path, size) for path, flagged, size in overlapping_files if flagged
    ]
    if not (overlap_true and overwrite_columns):
        return incoming_df, []

    # Every overwrite column must exist on the incoming frame; otherwise
    # nothing is filtered and nothing is tombstoned.
    if any(c not in incoming_df.columns for c in overwrite_columns):
        return incoming_df, []

    incoming_keys = incoming_df.select(overwrite_columns).unique()
    matched = _duckdb_probe_overlap_matches(
        overlap_true, overwrite_columns, newer_than_col, incoming_keys, profiler=prof,
    )
    if matched is not None:
        try:
            return _derive_stale_and_deletes(
                incoming_df, matched, overwrite_columns, newer_than_col, profiler=prof,
            )
        except Exception as e:
            logging.warning(f"[write-probe] derive failed, using polars path: {e}")

    # ---- Fallback: full-read path via the two pre-existing functions ----
    prof.add("overwrite_resolve_fallback", 1)
    file_cache: Dict[str, polars.DataFrame] = {}

    filtered = incoming_df
    if newer_than_col:
        filtered = filter_stale_incoming_rows(
            incoming_df=incoming_df,
            overlapping_files=overlapping_files,
            overwrite_columns=overwrite_columns,
            newer_than_col=newer_than_col,
            file_cache=file_cache,
            profiler=prof,
        )

    pairs = identify_deleted_rowids(
        filtered, overlapping_files, overwrite_columns,
        file_cache=file_cache, profiler=prof,
    )
    return filtered, pairs
1243
+
1244
+
945
1245
  def build_tombstone_file(
946
1246
  tombstone_dir: str,
947
1247
  prev_tombstone_path: Optional[str],