supertable 2.3.1__tar.gz → 2.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. {supertable-2.3.1/supertable.egg-info → supertable-2.3.2}/PKG-INFO +1 -1
  2. {supertable-2.3.1 → supertable-2.3.2}/pyproject.toml +1 -1
  3. {supertable-2.3.1 → supertable-2.3.2}/setup.py +1 -1
  4. {supertable-2.3.1 → supertable-2.3.2}/supertable/__init__.py +1 -1
  5. {supertable-2.3.1 → supertable-2.3.2}/supertable/data_writer.py +57 -55
  6. {supertable-2.3.1 → supertable-2.3.2}/supertable/processing.py +273 -0
  7. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_data_writer.py +60 -67
  8. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_data_writer_comprehensive.py +34 -35
  9. supertable-2.3.2/supertable/tests/test_resolve_overwrite_writes.py +239 -0
  10. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_supertable_all.py +18 -18
  11. {supertable-2.3.1 → supertable-2.3.2/supertable.egg-info}/PKG-INFO +1 -1
  12. {supertable-2.3.1 → supertable-2.3.2}/supertable.egg-info/SOURCES.txt +1 -0
  13. {supertable-2.3.1 → supertable-2.3.2}/LICENSE +0 -0
  14. {supertable-2.3.1 → supertable-2.3.2}/README.md +0 -0
  15. {supertable-2.3.1 → supertable-2.3.2}/requirements.txt +0 -0
  16. {supertable-2.3.1 → supertable-2.3.2}/setup.cfg +0 -0
  17. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/__init__.py +0 -0
  18. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/admin.py +0 -0
  19. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/chain.py +0 -0
  20. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/consumers.py +0 -0
  21. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/crypto.py +0 -0
  22. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/events.py +0 -0
  23. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/export.py +0 -0
  24. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/logger.py +0 -0
  25. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/middleware.py +0 -0
  26. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/reader.py +0 -0
  27. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/retention.py +0 -0
  28. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/tests/__init__.py +0 -0
  29. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/tests/test_chain.py +0 -0
  30. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/tests/test_crypto.py +0 -0
  31. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/tests/test_emit.py +0 -0
  32. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/tests/test_events.py +0 -0
  33. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/tests/test_retention.py +0 -0
  34. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/writer_parquet.py +0 -0
  35. {supertable-2.3.1 → supertable-2.3.2}/supertable/audit/writer_redis.py +0 -0
  36. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/__init__.py +0 -0
  37. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/defaults.py +0 -0
  38. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/homedir.py +0 -0
  39. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/settings.py +0 -0
  40. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/tests/__init__.py +0 -0
  41. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/tests/test_defaults.py +0 -0
  42. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/tests/test_homedir.py +0 -0
  43. {supertable-2.3.1 → supertable-2.3.2}/supertable/config/tests/test_settings.py +0 -0
  44. {supertable-2.3.1 → supertable-2.3.2}/supertable/data_classes.py +0 -0
  45. {supertable-2.3.1 → supertable-2.3.2}/supertable/data_reader.py +0 -0
  46. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/__init__.py +0 -0
  47. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/__init__.py +0 -0
  48. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/__main__.py +0 -0
  49. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/check_filter_builder.py +0 -0
  50. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/controller.py +0 -0
  51. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
  52. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/defaults.py +0 -0
  53. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/dummy_data.py +0 -0
  54. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/read_parquet_header.py +0 -0
  55. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
  56. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
  57. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
  58. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
  59. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
  60. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
  61. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
  62. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
  63. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
  64. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
  65. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
  66. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
  67. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
  68. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
  69. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
  70. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
  71. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
  72. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
  73. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
  74. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
  75. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
  76. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
  77. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
  78. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
  79. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/webshop/__init__.py +0 -0
  80. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/webshop/core.py +0 -0
  81. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/webshop/defaults.py +0 -0
  82. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/webshop/generate.py +0 -0
  83. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/webshop/load.py +0 -0
  84. {supertable-2.3.1 → supertable-2.3.2}/supertable/demo/webshop/topup.py +0 -0
  85. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/__init__.py +0 -0
  86. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/data_estimator.py +0 -0
  87. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/duckdb_lite.py +0 -0
  88. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/duckdb_pro.py +0 -0
  89. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/engine_common.py +0 -0
  90. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/engine_config.py +0 -0
  91. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/engine_enum.py +0 -0
  92. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/executor.py +0 -0
  93. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/plan_stats.py +0 -0
  94. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/spark_thrift.py +0 -0
  95. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/tests/__init__.py +0 -0
  96. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/tests/conftest.py +0 -0
  97. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/tests/test_engine.py +0 -0
  98. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/tests/test_engine_config.py +0 -0
  99. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/tests/test_engine_routing.py +0 -0
  100. {supertable-2.3.1 → supertable-2.3.2}/supertable/engine/tests/test_engine_spill.py +0 -0
  101. {supertable-2.3.1 → supertable-2.3.2}/supertable/errors.py +0 -0
  102. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/__init__.py +0 -0
  103. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/benchmarks/__init__.py +0 -0
  104. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
  105. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
  106. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
  107. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/file_lock.py +0 -0
  108. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/redis_lock.py +0 -0
  109. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/tests/__init__.py +0 -0
  110. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/tests/test_file_lock.py +0 -0
  111. {supertable-2.3.1 → supertable-2.3.2}/supertable/locking/tests/test_redis_lock.py +0 -0
  112. {supertable-2.3.1 → supertable-2.3.2}/supertable/logging.py +0 -0
  113. {supertable-2.3.1 → supertable-2.3.2}/supertable/meta_reader.py +0 -0
  114. {supertable-2.3.1 → supertable-2.3.2}/supertable/mirroring/__init__.py +0 -0
  115. {supertable-2.3.1 → supertable-2.3.2}/supertable/mirroring/mirror_delta.py +0 -0
  116. {supertable-2.3.1 → supertable-2.3.2}/supertable/mirroring/mirror_formats.py +0 -0
  117. {supertable-2.3.1 → supertable-2.3.2}/supertable/mirroring/mirror_iceberg.py +0 -0
  118. {supertable-2.3.1 → supertable-2.3.2}/supertable/mirroring/mirror_parquet.py +0 -0
  119. {supertable-2.3.1 → supertable-2.3.2}/supertable/monitoring/__init__.py +0 -0
  120. {supertable-2.3.1 → supertable-2.3.2}/supertable/monitoring/partitions.py +0 -0
  121. {supertable-2.3.1 → supertable-2.3.2}/supertable/monitoring_writer.py +0 -0
  122. {supertable-2.3.1 → supertable-2.3.2}/supertable/plan_extender.py +0 -0
  123. {supertable-2.3.1 → supertable-2.3.2}/supertable/query_plan_manager.py +0 -0
  124. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/__init__.py +0 -0
  125. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/access_control.py +0 -0
  126. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/filter_builder.py +0 -0
  127. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/permissions.py +0 -0
  128. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/role_manager.py +0 -0
  129. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/row_column_security.py +0 -0
  130. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/tests/test_filter_builder.py +0 -0
  131. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/tests/test_rbac.py +0 -0
  132. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
  133. {supertable-2.3.1 → supertable-2.3.2}/supertable/rbac/user_manager.py +0 -0
  134. {supertable-2.3.1 → supertable-2.3.2}/supertable/redis_catalog.py +0 -0
  135. {supertable-2.3.1 → supertable-2.3.2}/supertable/redis_connector.py +0 -0
  136. {supertable-2.3.1 → supertable-2.3.2}/supertable/redis_infra.py +0 -0
  137. {supertable-2.3.1 → supertable-2.3.2}/supertable/redis_keys.py +0 -0
  138. {supertable-2.3.1 → supertable-2.3.2}/supertable/simple_table.py +0 -0
  139. {supertable-2.3.1 → supertable-2.3.2}/supertable/staging_area.py +0 -0
  140. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/__init__.py +0 -0
  141. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/azure_storage.py +0 -0
  142. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/gcp_storage.py +0 -0
  143. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/local_storage.py +0 -0
  144. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/minio_storage.py +0 -0
  145. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/s3_storage.py +0 -0
  146. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/storage_factory.py +0 -0
  147. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/storage_interface.py +0 -0
  148. {supertable-2.3.1 → supertable-2.3.2}/supertable/storage/tests/test_storage.py +0 -0
  149. {supertable-2.3.1 → supertable-2.3.2}/supertable/super_pipe.py +0 -0
  150. {supertable-2.3.1 → supertable-2.3.2}/supertable/super_table.py +0 -0
  151. {supertable-2.3.1 → supertable-2.3.2}/supertable/system_query.py +0 -0
  152. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/__init__.py +0 -0
  153. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_align_to_schema_fix.py +0 -0
  154. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_create_if_missing.py +0 -0
  155. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_data_reader.py +0 -0
  156. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_data_reader_preflight.py +0 -0
  157. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_data_writer_compact.py +0 -0
  158. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_errors.py +0 -0
  159. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_meta_reader.py +0 -0
  160. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_monitoring_partitions.py +0 -0
  161. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_monitoring_sink_guard.py +0 -0
  162. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_newer_than.py +0 -0
  163. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_parquet_statistics.py +0 -0
  164. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_processing.py +0 -0
  165. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_processing_compact_resources.py +0 -0
  166. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_processing_stats.py +0 -0
  167. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_query_sql.py +0 -0
  168. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_read_pruning_differential.py +0 -0
  169. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_read_pruning_integration.py +0 -0
  170. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_redis_key_prefix.py +0 -0
  171. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_simple_table.py +0 -0
  172. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_stats_cache.py +0 -0
  173. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_stats_pruning.py +0 -0
  174. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_stats_schema_snapshot.py +0 -0
  175. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_super_table.py +0 -0
  176. {supertable-2.3.1 → supertable-2.3.2}/supertable/tests/test_system_query.py +0 -0
  177. {supertable-2.3.1 → supertable-2.3.2}/supertable/utils/__init__.py +0 -0
  178. {supertable-2.3.1 → supertable-2.3.2}/supertable/utils/helper.py +0 -0
  179. {supertable-2.3.1 → supertable-2.3.2}/supertable/utils/profiler.py +0 -0
  180. {supertable-2.3.1 → supertable-2.3.2}/supertable/utils/sql_parser.py +0 -0
  181. {supertable-2.3.1 → supertable-2.3.2}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
  182. {supertable-2.3.1 → supertable-2.3.2}/supertable/utils/timer.py +0 -0
  183. {supertable-2.3.1 → supertable-2.3.2}/supertable.egg-info/dependency_links.txt +0 -0
  184. {supertable-2.3.1 → supertable-2.3.2}/supertable.egg-info/entry_points.txt +0 -0
  185. {supertable-2.3.1 → supertable-2.3.2}/supertable.egg-info/requires.txt +0 -0
  186. {supertable-2.3.1 → supertable-2.3.2}/supertable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.1
3
+ Version: 2.3.2
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "supertable"
7
- version = "2.3.1"
7
+ version = "2.3.2"
8
8
  description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
19
19
 
20
20
  setup(
21
21
  name="supertable",
22
- version="2.3.1",
22
+ version="2.3.2",
23
23
  description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
24
24
  long_description=long_description,
25
25
  long_description_content_type="text/markdown",
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
25
25
  project documentation for the full API surface.
26
26
  """
27
27
 
28
- __version__ = "2.3.1"
28
+ __version__ = "2.3.2"
29
29
 
30
30
  # Re-export the core public surface so users can do ``from supertable import …``
31
31
  # instead of remembering submodule paths.
@@ -23,8 +23,7 @@ from supertable.utils.timer import Timer
23
23
  from supertable.utils.profiler import Profiler
24
24
  from supertable.processing import (
25
25
  find_overlapping_files,
26
- filter_stale_incoming_rows,
27
- identify_deleted_rowids,
26
+ resolve_overwrite_writes,
28
27
  identify_all_rowids,
29
28
  build_tombstone_file,
30
29
  build_stats_file,
@@ -398,60 +397,67 @@ class DataWriter:
398
397
  logger.info(lp(f"stats pruning: skipped {pruned}/{before} candidate files"))
399
398
  mark("stats_prune")
400
399
 
401
- # File cache: populated by newer-than filtering, reused by process step
402
- # to avoid double-reading overlapping parquet files from storage.
400
+ # File cache: used only by delete_only's identify_all_rowids below.
403
401
  file_cache = {}
404
402
 
405
- # --- Newer-than filtering (skip stale/replayed rows) ---------------
406
- if newer_than and overwrite_columns:
403
+ # --- Overwrite resolution: stale-row filtering + delete-pair -------
404
+ # identification in one DuckDB-pushdown probe over the overlapping
405
+ # files (column projection, row-group skipping, ranged GETs, native
406
+ # null-safe SEMI JOIN) instead of full-file polars reads. Returns
407
+ # the stale-filtered incoming df plus the (file, __rowid__) delete
408
+ # pairs derived from the surviving keys; falls back to the polars
409
+ # oracle on any probe/derive failure. delete_only (no
410
+ # overwrite_columns) is handled separately in the deletion block.
411
+ resolved_delete_pairs = None
412
+ if overwrite_columns:
407
413
  pre_filter_count = dataframe.height
408
- dataframe = filter_stale_incoming_rows(
414
+ dataframe, resolved_delete_pairs = resolve_overwrite_writes(
409
415
  incoming_df=dataframe,
410
416
  overlapping_files=overlapping_files,
411
417
  overwrite_columns=overwrite_columns,
412
418
  newer_than_col=newer_than,
413
- file_cache=file_cache,
414
419
  profiler=profiler,
415
420
  )
416
- skipped = pre_filter_count - dataframe.height
417
- if skipped > 0:
418
- logger.info(lp(f"newer_than={newer_than}: skipped {skipped}/{pre_filter_count} stale rows"))
419
- if dataframe.height == 0:
420
- logger.info(lp("newer_than: all incoming rows are stale — skipping write"))
421
- mark("newer_than")
422
- total_columns = incoming_columns
423
- result_tuple = (total_columns, 0, 0, 0)
424
- stats_payload = {
425
- "query_id": qid,
426
- "recorded_at": datetime.now(timezone.utc).isoformat(),
427
- "organization": self.super_table.organization,
428
- "super_name": self.super_table.super_name,
429
- "role_name": role_name,
430
- "table_name": simple_name,
431
- "overwrite_columns": overwrite_columns,
432
- "compression_level": compression_level,
433
- "newer_than": newer_than,
434
- "delete_only": delete_only,
435
- "incoming_rows": incoming_rows,
436
- "incoming_columns": incoming_columns,
437
- "inserted": 0,
438
- "deleted": 0,
439
- "total_rows": 0,
440
- "total_columns": total_columns,
441
- "new_resources": 0,
442
- "sunset_files": 0,
443
- "skipped_stale": skipped,
444
- "lineage": _safe_json(lineage or {}),
445
- "duration": round(time.time() - t0, 6),
446
- "timings": profiler.emit_timings(),
447
- "counts": profiler.emit_counts(),
448
- }
449
- # Don't return here — fall through to finally (lock release)
450
- # and the post-finally monitoring block. Returning inside the
451
- # try block would either skip monitoring or run it while the
452
- # Redis data lock is still held.
453
- else:
454
- mark("newer_than")
421
+ if newer_than:
422
+ skipped = pre_filter_count - dataframe.height
423
+ if skipped > 0:
424
+ logger.info(lp(f"newer_than={newer_than}: skipped {skipped}/{pre_filter_count} stale rows"))
425
+ if dataframe.height == 0:
426
+ logger.info(lp("newer_than: all incoming rows are stale — skipping write"))
427
+ mark("newer_than")
428
+ total_columns = incoming_columns
429
+ result_tuple = (total_columns, 0, 0, 0)
430
+ stats_payload = {
431
+ "query_id": qid,
432
+ "recorded_at": datetime.now(timezone.utc).isoformat(),
433
+ "organization": self.super_table.organization,
434
+ "super_name": self.super_table.super_name,
435
+ "role_name": role_name,
436
+ "table_name": simple_name,
437
+ "overwrite_columns": overwrite_columns,
438
+ "compression_level": compression_level,
439
+ "newer_than": newer_than,
440
+ "delete_only": delete_only,
441
+ "incoming_rows": incoming_rows,
442
+ "incoming_columns": incoming_columns,
443
+ "inserted": 0,
444
+ "deleted": 0,
445
+ "total_rows": 0,
446
+ "total_columns": total_columns,
447
+ "new_resources": 0,
448
+ "sunset_files": 0,
449
+ "skipped_stale": skipped,
450
+ "lineage": _safe_json(lineage or {}),
451
+ "duration": round(time.time() - t0, 6),
452
+ "timings": profiler.emit_timings(),
453
+ "counts": profiler.emit_counts(),
454
+ }
455
+ # Don't return here — fall through to finally (lock release)
456
+ # and the post-finally monitoring block. Returning inside the
457
+ # try block would either skip monitoring or run it while the
458
+ # Redis data lock is still held.
459
+ else:
460
+ mark("newer_than")
455
461
 
456
462
  # --- Deletion-vector (tombstone) logic ----------------------------
457
463
  # Merge-on-read model: every write tombstones the __rowid__s of the
@@ -467,16 +473,12 @@ class DataWriter:
467
473
 
468
474
  # 1. Identify which existing rows this write deletes/replaces.
469
475
  # overwrite_columns drives the anti-join key (delete + upsert);
470
- # pure appends (no overwrite_columns) tombstone nothing.
476
+ # pure appends (no overwrite_columns) tombstone nothing. The
477
+ # pairs were already derived (from the surviving keys) by the
478
+ # resolve_overwrite_writes probe above.
471
479
  new_delete_pairs = []
472
480
  if overwrite_columns:
473
- new_delete_pairs = identify_deleted_rowids(
474
- dataframe,
475
- overlapping_files,
476
- overwrite_columns,
477
- file_cache=file_cache,
478
- profiler=profiler,
479
- )
481
+ new_delete_pairs = resolved_delete_pairs or []
480
482
  elif delete_only:
481
483
  # delete-all: no overwrite_columns → tombstone every row.
482
484
  new_delete_pairs = identify_all_rowids(
@@ -6,6 +6,7 @@ import os
6
6
  import io
7
7
  import time
8
8
  import threading
9
+ import uuid
9
10
  from collections import OrderedDict
10
11
  from datetime import datetime, date, timezone
11
12
  from typing import Dict, List, Set, Tuple, Optional
@@ -942,6 +943,278 @@ def identify_all_rowids(
942
943
  return pairs
943
944
 
944
945
 
946
+ # =========================
947
+ # Pushdown overwrite resolution (DuckDB probe, polars fallback)
948
+ # =========================
949
+ #
950
+ # The legacy path (``filter_stale_incoming_rows`` + ``identify_deleted_rowids``)
951
+ # reads EVERY overlapping data file FULLY (all columns, all rows) into polars,
952
+ # then group/join over the whole table — cost O(table size), independent of how
953
+ # few rows are actually written. ``resolve_overwrite_writes`` replaces both with
954
+ # ONE column-projected DuckDB ``parquet_scan`` that reads only the key /
955
+ # ``__rowid__`` / newer-than columns and only the rows whose key matches an
956
+ # incoming key (null-safe SEMI JOIN), then derives both results in-memory from
957
+ # that small matched set. The two legacy functions are retained as the exact
958
+ # semantic oracle and the fallback for any environment/schema the probe can't
959
+ # handle.
960
+
961
+
962
+ def _storage_duckdb_path(storage, key: str) -> str:
963
+ """Resolve a storage key to a path string DuckDB can read directly.
964
+
965
+ Object stores expose ``to_duckdb_path`` (→ ``s3://`` or ``http(s)://``);
966
+ local storage has none, so the on-disk path is already DuckDB-readable and
967
+ returned unchanged. Anything already a URL passes through untouched.
968
+ """
969
+ if not key or "://" in key:
970
+ return key
971
+ fn = getattr(storage, "to_duckdb_path", None)
972
+ if callable(fn):
973
+ try:
974
+ url = fn(key)
975
+ if isinstance(url, str) and url:
976
+ return url
977
+ except NotImplementedError:
978
+ pass
979
+ except Exception as e:
980
+ logging.debug(f"[write-probe] to_duckdb_path failed for {key}: {e}")
981
+ return key
982
+
983
+
984
+ def _duckdb_probe_overlap_matches(
985
+ overlap_true_files: List[Tuple[str, int]],
986
+ overwrite_columns: List[str],
987
+ newer_than_col: Optional[str],
988
+ incoming_keys: polars.DataFrame,
989
+ profiler: Optional[Profiler] = None,
990
+ ) -> Optional[polars.DataFrame]:
991
+ """Column-projected pushdown probe over the overlapping data files.
992
+
993
+ Runs one ``parquet_scan`` (union_by_name, ranged GETs, row-group skipping)
994
+ null-safe ``SEMI JOIN``-ed against the unique *incoming_keys*, projecting only
995
+ ``__rowid__`` + the overwrite columns (+ *newer_than_col* when given) plus the
996
+ source ``filename``. Returns a polars frame with columns ``__file__`` (the
997
+ original storage key), ``__rowid__``, the overwrite columns and the
998
+ newer-than column — i.e. every existing row whose key matches an incoming
999
+ key. Returns ``None`` on any failure or unsupported schema (e.g. a referenced
1000
+ column absent from EVERY candidate file → DuckDB binder error), signalling the
1001
+ caller to fall back to the polars full-read path.
1002
+ """
1003
+ p = profiler or get_null_profiler()
1004
+ if not overlap_true_files or not overwrite_columns:
1005
+ return None
1006
+
1007
+ try:
1008
+ import duckdb
1009
+ from supertable.engine.engine_common import (
1010
+ configure_httpfs_and_s3,
1011
+ escape_parquet_path,
1012
+ quote_if_needed,
1013
+ )
1014
+ except Exception as e:
1015
+ logging.info(f"[write-probe] duckdb unavailable, using polars path: {e}")
1016
+ return None
1017
+
1018
+ storage = _get_storage()
1019
+ duck_to_key: Dict[str, str] = {}
1020
+ duck_paths: List[str] = []
1021
+ for file_key, _sz in overlap_true_files:
1022
+ dp = _storage_duckdb_path(storage, file_key)
1023
+ duck_to_key[dp] = file_key
1024
+ duck_paths.append(dp)
1025
+
1026
+ select_cols = ["filename", quote_if_needed(ROWID_COL)]
1027
+ select_cols += [quote_if_needed(c) for c in overwrite_columns]
1028
+ if newer_than_col:
1029
+ select_cols.append(quote_if_needed(newer_than_col))
1030
+ join_cond = " AND ".join(
1031
+ f"src.{quote_if_needed(c)} IS NOT DISTINCT FROM k.{quote_if_needed(c)}"
1032
+ for c in overwrite_columns
1033
+ )
1034
+ files_sql = ", ".join(f"'{escape_parquet_path(dp)}'" for dp in duck_paths)
1035
+ ik_name = f"__st_ik_{uuid.uuid4().hex}"
1036
+
1037
+ con = None
1038
+ try:
1039
+ con = duckdb.connect()
1040
+ if any("://" in dp for dp in duck_paths):
1041
+ configure_httpfs_and_s3(con, duck_paths)
1042
+ con.register(ik_name, incoming_keys.to_arrow())
1043
+ sql = (
1044
+ f"SELECT {', '.join(select_cols)} "
1045
+ f"FROM parquet_scan([{files_sql}], union_by_name=TRUE, "
1046
+ f"filename=TRUE, hive_partitioning=FALSE) AS src "
1047
+ f"SEMI JOIN {ik_name} AS k ON {join_cond}"
1048
+ )
1049
+ with p.span("io.duckdb_probe"):
1050
+ matched = con.execute(sql).pl()
1051
+ except Exception as e:
1052
+ logging.info(f"[write-probe] probe failed, using polars path: {e}")
1053
+ return None
1054
+ finally:
1055
+ if con is not None:
1056
+ try:
1057
+ con.unregister(ik_name)
1058
+ except Exception:
1059
+ pass
1060
+ try:
1061
+ con.close()
1062
+ except Exception:
1063
+ pass
1064
+
1065
+ if matched is None or "filename" not in matched.columns:
1066
+ return None
1067
+ # Restore the original storage key (DuckDB's ``filename`` is the path we
1068
+ # passed in) as __file__ via a join so the tombstone stores keys, not URLs.
1069
+ map_df = polars.DataFrame(
1070
+ {"filename": list(duck_to_key.keys()),
1071
+ TOMBSTONE_FILE_COL: list(duck_to_key.values())}
1072
+ )
1073
+ matched = matched.join(map_df, on="filename", how="left").drop("filename")
1074
+ if matched.get_column(TOMBSTONE_FILE_COL).null_count() > 0:
1075
+ # A returned filename did not map back — refuse to emit ambiguous
1076
+ # tombstones; let the caller fall back to the polars path.
1077
+ logging.info("[write-probe] unmapped filename in probe result; using polars path")
1078
+ return None
1079
+ p.add("probe_files", len(duck_paths))
1080
+ p.add("probe_rows_matched", int(matched.height))
1081
+ return matched
1082
+
1083
+
1084
+ def _align_keys_to_incoming(
1085
+ matched: polars.DataFrame,
1086
+ incoming_df: polars.DataFrame,
1087
+ overwrite_columns: List[str],
1088
+ newer_than_col: Optional[str],
1089
+ ) -> polars.DataFrame:
1090
+ """Cast probe-result key / newer-than columns to the incoming df's dtypes.
1091
+
1092
+ DuckDB → Arrow → polars round-trips can yield a different (if compatible)
1093
+ dtype than the in-memory incoming frame; polars joins/comparisons want
1094
+ matching dtypes. Casts are best-effort; an unrepresentable cast raises and
1095
+ the caller falls back to the polars path.
1096
+ """
1097
+ casts = []
1098
+ for c in overwrite_columns:
1099
+ if c in matched.columns and c in incoming_df.columns:
1100
+ if matched.schema[c] != incoming_df.schema[c]:
1101
+ casts.append(polars.col(c).cast(incoming_df.schema[c]))
1102
+ if newer_than_col and newer_than_col in matched.columns and newer_than_col in incoming_df.columns:
1103
+ if matched.schema[newer_than_col] != incoming_df.schema[newer_than_col]:
1104
+ casts.append(polars.col(newer_than_col).cast(incoming_df.schema[newer_than_col]))
1105
+ return matched.with_columns(casts) if casts else matched
1106
+
1107
+
1108
def _derive_stale_and_deletes(
    incoming_df: polars.DataFrame,
    matched: polars.DataFrame,
    overwrite_columns: List[str],
    newer_than_col: Optional[str],
    profiler: Optional[Profiler] = None,
) -> Tuple[polars.DataFrame, List[Tuple[str, int]]]:
    """Turn the probe's matched rows into (surviving incoming rows, delete pairs).

    Two steps, in order:

    1. Stale filter: when *newer_than_col* is truthy and present in the
       probe result, an incoming row is kept only if its newer-than value is
       strictly greater than the maximum existing value for its key, or if
       no existing maximum was found for that key.  Otherwise the incoming
       frame passes through untouched.
    2. Delete pairs: ``(file, __rowid__)`` for every existing row whose key
       matches a SURVIVING incoming key (null keys compare equal).  Rows
       with a null file or row id are dropped, and no pairs are produced
       when the probe result has no row-id column.

    Args:
        incoming_df: The frame being written.
        matched: Existing rows returned by the overlap probe.
        overwrite_columns: Key columns identifying a logical row.
        newer_than_col: Optional conflict-resolution column.
        profiler: Optional profiler; a null profiler is used when omitted.

    Returns:
        ``(filtered_incoming_df, delete_pairs)``.
    """
    prof = profiler or get_null_profiler()
    existing = _align_keys_to_incoming(
        matched, incoming_df, overwrite_columns, newer_than_col
    )

    survivors = incoming_df
    if newer_than_col and newer_than_col in existing.columns:
        with prof.span("newer_than.group_agg"):
            per_key_max = existing.group_by(overwrite_columns).agg(
                polars.col(newer_than_col).max().alias("__existing_max__")
            )
        with prof.span("newer_than.join_filter"):
            with_max = incoming_df.join(per_key_max, on=overwrite_columns, how="left")
            # Keep rows with no existing max, or strictly newer than it.
            is_fresh = polars.col("__existing_max__").is_null() | (
                polars.col(newer_than_col) > polars.col("__existing_max__")
            )
            survivors = with_max.filter(is_fresh).drop("__existing_max__")

    delete_pairs: List[Tuple[str, int]] = []
    if ROWID_COL in existing.columns:
        live_keys = survivors.select(overwrite_columns).unique()
        with prof.span("delete.semi_join"):
            hits = existing.join(
                live_keys, on=overwrite_columns, how="semi", nulls_equal=True
            )
        locators = hits.select([TOMBSTONE_FILE_COL, ROWID_COL]).drop_nulls()
        delete_pairs = [(path, int(row_id)) for path, row_id in locators.iter_rows()]
    prof.add("delete_rows_matched", len(delete_pairs))
    return survivors, delete_pairs
1153
+
1154
+
1155
def resolve_overwrite_writes(
    incoming_df: polars.DataFrame,
    overlapping_files: Set[Tuple[str, bool, int]],
    overwrite_columns: List[str],
    newer_than_col: Optional[str] = None,
    profiler: Optional[Profiler] = None,
) -> Tuple[polars.DataFrame, List[Tuple[str, int]]]:
    """Resolve an overwrite in one pass: stale filtering plus delete pairs.

    The fast path runs a single probe over the files flagged as overlapping
    and derives both results from it.  If the probe returns ``None`` or the
    derivation raises, the function falls back to the original polars path
    (``filter_stale_incoming_rows`` followed by ``identify_deleted_rowids``),
    sharing one file cache between the two calls.

    Args:
        incoming_df: The frame being written.
        overlapping_files: ``(file, has_overlap, size)`` triples; only
            entries with a truthy ``has_overlap`` are probed.
        overwrite_columns: Key columns identifying a logical row.
        newer_than_col: Optional conflict-resolution column.  When falsy no
            stale filtering happens and the incoming frame is returned
            unchanged alongside the delete pairs.
        profiler: Optional profiler; a null profiler is used when omitted.

    Returns:
        ``(filtered_incoming_df, delete_pairs)``.  The result is
        ``(incoming_df, [])`` when nothing overlaps, no key columns are
        given, or the incoming frame lacks any key column.
    """
    prof = profiler or get_null_profiler()

    probe_targets = [
        (path, size) for path, overlaps, size in overlapping_files if overlaps
    ]
    if not (probe_targets and overwrite_columns):
        return incoming_df, []

    # A key column missing from the incoming frame means no existing row can
    # match, so there is nothing to filter and nothing to tombstone.
    if any(col not in incoming_df.columns for col in overwrite_columns):
        return incoming_df, []

    incoming_keys = incoming_df.select(overwrite_columns).unique()
    matched = _duckdb_probe_overlap_matches(
        probe_targets, overwrite_columns, newer_than_col, incoming_keys, profiler=prof,
    )
    if matched is not None:
        try:
            return _derive_stale_and_deletes(
                incoming_df, matched, overwrite_columns, newer_than_col, profiler=prof,
            )
        except Exception as e:
            # Deliberate best-effort: any derive failure drops to the
            # fallback below instead of failing the write.
            logging.warning(f"[write-probe] derive failed, using polars path: {e}")

    # ---- Fallback: original polars full-read path ----
    prof.add("overwrite_resolve_fallback", 1)
    shared_cache: Dict[str, polars.DataFrame] = {}
    survivors = incoming_df
    if newer_than_col:
        survivors = filter_stale_incoming_rows(
            incoming_df=incoming_df,
            overlapping_files=overlapping_files,
            overwrite_columns=overwrite_columns,
            newer_than_col=newer_than_col,
            file_cache=shared_cache,
            profiler=prof,
        )
    delete_pairs = identify_deleted_rowids(
        survivors, overlapping_files, overwrite_columns,
        file_cache=shared_cache, profiler=prof,
    )
    return survivors, delete_pairs
1216
+
1217
+
945
1218
  def build_tombstone_file(
946
1219
  tombstone_dir: str,
947
1220
  prev_tombstone_path: Optional[str],