supertable 2.3.4__tar.gz → 2.3.5__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. {supertable-2.3.4/supertable.egg-info → supertable-2.3.5}/PKG-INFO +1 -1
  2. {supertable-2.3.4 → supertable-2.3.5}/pyproject.toml +1 -1
  3. {supertable-2.3.4 → supertable-2.3.5}/setup.py +1 -1
  4. {supertable-2.3.4 → supertable-2.3.5}/supertable/__init__.py +1 -1
  5. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/defaults.py +6 -0
  6. supertable-2.3.5/supertable/config/homedir.py +96 -0
  7. {supertable-2.3.4 → supertable-2.3.5}/supertable/data_writer.py +49 -2
  8. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/defaults.py +3 -1
  9. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/engine_common.py +60 -59
  10. {supertable-2.3.4 → supertable-2.3.5}/supertable/processing.py +51 -13
  11. {supertable-2.3.4 → supertable-2.3.5}/supertable/simple_table.py +0 -9
  12. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/azure_storage.py +7 -2
  13. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/gcp_storage.py +7 -2
  14. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/local_storage.py +4 -4
  15. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/minio_storage.py +7 -2
  16. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/s3_storage.py +7 -2
  17. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/storage_interface.py +21 -2
  18. {supertable-2.3.4 → supertable-2.3.5}/supertable/super_table.py +0 -6
  19. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_writer_comprehensive.py +2 -1
  20. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_processing_stats.py +3 -1
  21. {supertable-2.3.4 → supertable-2.3.5/supertable.egg-info}/PKG-INFO +1 -1
  22. supertable-2.3.4/supertable/config/homedir.py +0 -62
  23. {supertable-2.3.4 → supertable-2.3.5}/LICENSE +0 -0
  24. {supertable-2.3.4 → supertable-2.3.5}/README.md +0 -0
  25. {supertable-2.3.4 → supertable-2.3.5}/requirements.txt +0 -0
  26. {supertable-2.3.4 → supertable-2.3.5}/setup.cfg +0 -0
  27. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/__init__.py +0 -0
  28. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/admin.py +0 -0
  29. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/chain.py +0 -0
  30. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/consumers.py +0 -0
  31. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/crypto.py +0 -0
  32. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/events.py +0 -0
  33. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/export.py +0 -0
  34. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/logger.py +0 -0
  35. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/middleware.py +0 -0
  36. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/reader.py +0 -0
  37. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/retention.py +0 -0
  38. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/__init__.py +0 -0
  39. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_chain.py +0 -0
  40. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_crypto.py +0 -0
  41. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_emit.py +0 -0
  42. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_events.py +0 -0
  43. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/tests/test_retention.py +0 -0
  44. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/writer_parquet.py +0 -0
  45. {supertable-2.3.4 → supertable-2.3.5}/supertable/audit/writer_redis.py +0 -0
  46. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/__init__.py +0 -0
  47. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/settings.py +0 -0
  48. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/__init__.py +0 -0
  49. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/test_defaults.py +0 -0
  50. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/test_homedir.py +0 -0
  51. {supertable-2.3.4 → supertable-2.3.5}/supertable/config/tests/test_settings.py +0 -0
  52. {supertable-2.3.4 → supertable-2.3.5}/supertable/data_classes.py +0 -0
  53. {supertable-2.3.4 → supertable-2.3.5}/supertable/data_reader.py +0 -0
  54. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/__init__.py +0 -0
  55. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/__init__.py +0 -0
  56. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/__main__.py +0 -0
  57. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/check_filter_builder.py +0 -0
  58. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/controller.py +0 -0
  59. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/data_writer_helpers.py +0 -0
  60. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/dummy_data.py +0 -0
  61. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/read_parquet_header.py +0 -0
  62. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_01_01_create_super_table.py +0 -0
  63. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_01_02_enable_mirroring_formats.py +0 -0
  64. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_02_create_roles.py +0 -0
  65. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s01_03_create_users.py +0 -0
  66. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_01_write_dummy_data.py +0 -0
  67. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_02_write_single_data.py +0 -0
  68. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_03_01_write_staging.py +0 -0
  69. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_03_02_create_pipe.py +0 -0
  70. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_04_01_write_monitoring_simple.py +0 -0
  71. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_04_02_write_monitoring_parallel.py +0 -0
  72. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s02_05_write_tombstone.py +0 -0
  73. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_01_read_data_error.py +0 -0
  74. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_02_01_read_super_data_ok.py +0 -0
  75. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_02_02_read_table_data_ok.py +0 -0
  76. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_03_read_meta.py +0 -0
  77. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_04_read_staging.py +0 -0
  78. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_06_01_read_roles.py +0 -0
  79. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_06_02_read_user.py +0 -0
  80. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_07_01_estimate_read.py +0 -0
  81. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_07_02_estimate_files.py +0 -0
  82. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s03_08_read_snapshot_history.py +0 -0
  83. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s04_01_03_delete_pipe.py +0 -0
  84. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s05_01_delete_table.py +0 -0
  85. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/quickstart/s05_02_delete_super_table.py +0 -0
  86. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/__init__.py +0 -0
  87. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/core.py +0 -0
  88. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/defaults.py +0 -0
  89. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/generate.py +0 -0
  90. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/load.py +0 -0
  91. {supertable-2.3.4 → supertable-2.3.5}/supertable/demo/webshop/topup.py +0 -0
  92. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/__init__.py +0 -0
  93. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/data_estimator.py +0 -0
  94. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/duckdb_lite.py +0 -0
  95. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/duckdb_pro.py +0 -0
  96. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/engine_config.py +0 -0
  97. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/engine_enum.py +0 -0
  98. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/executor.py +0 -0
  99. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/plan_stats.py +0 -0
  100. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/spark_thrift.py +0 -0
  101. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/__init__.py +0 -0
  102. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/conftest.py +0 -0
  103. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine.py +0 -0
  104. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine_config.py +0 -0
  105. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine_routing.py +0 -0
  106. {supertable-2.3.4 → supertable-2.3.5}/supertable/engine/tests/test_engine_spill.py +0 -0
  107. {supertable-2.3.4 → supertable-2.3.5}/supertable/errors.py +0 -0
  108. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/__init__.py +0 -0
  109. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/__init__.py +0 -0
  110. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/benchmark_locking.py +0 -0
  111. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/measure_lock_speed.py +0 -0
  112. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/benchmarks/measure_lock_time.py +0 -0
  113. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/file_lock.py +0 -0
  114. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/redis_lock.py +0 -0
  115. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/tests/__init__.py +0 -0
  116. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/tests/test_file_lock.py +0 -0
  117. {supertable-2.3.4 → supertable-2.3.5}/supertable/locking/tests/test_redis_lock.py +0 -0
  118. {supertable-2.3.4 → supertable-2.3.5}/supertable/logging.py +0 -0
  119. {supertable-2.3.4 → supertable-2.3.5}/supertable/meta_reader.py +0 -0
  120. {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/__init__.py +0 -0
  121. {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_delta.py +0 -0
  122. {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_formats.py +0 -0
  123. {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_iceberg.py +0 -0
  124. {supertable-2.3.4 → supertable-2.3.5}/supertable/mirroring/mirror_parquet.py +0 -0
  125. {supertable-2.3.4 → supertable-2.3.5}/supertable/monitoring/__init__.py +0 -0
  126. {supertable-2.3.4 → supertable-2.3.5}/supertable/monitoring/partitions.py +0 -0
  127. {supertable-2.3.4 → supertable-2.3.5}/supertable/monitoring_writer.py +0 -0
  128. {supertable-2.3.4 → supertable-2.3.5}/supertable/plan_extender.py +0 -0
  129. {supertable-2.3.4 → supertable-2.3.5}/supertable/query_plan_manager.py +0 -0
  130. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/__init__.py +0 -0
  131. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/access_control.py +0 -0
  132. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/filter_builder.py +0 -0
  133. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/permissions.py +0 -0
  134. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/role_manager.py +0 -0
  135. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/row_column_security.py +0 -0
  136. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/tests/test_filter_builder.py +0 -0
  137. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/tests/test_rbac.py +0 -0
  138. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/tests/test_rbac_per_table.py +0 -0
  139. {supertable-2.3.4 → supertable-2.3.5}/supertable/rbac/user_manager.py +0 -0
  140. {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_catalog.py +0 -0
  141. {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_connector.py +0 -0
  142. {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_infra.py +0 -0
  143. {supertable-2.3.4 → supertable-2.3.5}/supertable/redis_keys.py +0 -0
  144. {supertable-2.3.4 → supertable-2.3.5}/supertable/staging_area.py +0 -0
  145. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/__init__.py +0 -0
  146. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/storage_factory.py +0 -0
  147. {supertable-2.3.4 → supertable-2.3.5}/supertable/storage/tests/test_storage.py +0 -0
  148. {supertable-2.3.4 → supertable-2.3.5}/supertable/super_pipe.py +0 -0
  149. {supertable-2.3.4 → supertable-2.3.5}/supertable/system_query.py +0 -0
  150. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/__init__.py +0 -0
  151. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_align_to_schema_fix.py +0 -0
  152. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_create_if_missing.py +0 -0
  153. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_reader.py +0 -0
  154. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_reader_preflight.py +0 -0
  155. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_writer.py +0 -0
  156. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_data_writer_compact.py +0 -0
  157. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_errors.py +0 -0
  158. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_meta_reader.py +0 -0
  159. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_monitoring_partitions.py +0 -0
  160. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_monitoring_sink_guard.py +0 -0
  161. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_newer_than.py +0 -0
  162. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_parquet_statistics.py +0 -0
  163. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_processing.py +0 -0
  164. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_processing_compact_resources.py +0 -0
  165. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_query_sql.py +0 -0
  166. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_read_pruning_differential.py +0 -0
  167. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_read_pruning_integration.py +0 -0
  168. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_redis_key_prefix.py +0 -0
  169. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_resolve_overwrite_writes.py +0 -0
  170. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_simple_table.py +0 -0
  171. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_stats_cache.py +0 -0
  172. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_stats_pruning.py +0 -0
  173. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_stats_schema_snapshot.py +0 -0
  174. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_super_table.py +0 -0
  175. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_supertable_all.py +0 -0
  176. {supertable-2.3.4 → supertable-2.3.5}/supertable/tests/test_system_query.py +0 -0
  177. {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/__init__.py +0 -0
  178. {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/helper.py +0 -0
  179. {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/profiler.py +0 -0
  180. {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/sql_parser.py +0 -0
  181. {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/tests/test_sql_parser_columns.py +0 -0
  182. {supertable-2.3.4 → supertable-2.3.5}/supertable/utils/timer.py +0 -0
  183. {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/SOURCES.txt +0 -0
  184. {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/dependency_links.txt +0 -0
  185. {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/entry_points.txt +0 -0
  186. {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/requires.txt +0 -0
  187. {supertable-2.3.4 → supertable-2.3.5}/supertable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supertable
3
- Version: 2.3.4
3
+ Version: 2.3.5
4
4
  Summary: SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.
5
5
  Author: Levente Kupas
6
6
  Author-email: Levente Kupas <lkupas@kladnasoft.com>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "supertable"
7
- version = "2.3.4"
7
+ version = "2.3.5"
8
8
  description = "SuperTable — versioned data lake library for SQL analytics on Parquet + Redis."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -19,7 +19,7 @@ long_description = readme.read_text(encoding="utf-8") if readme.exists() else ""
19
19
 
20
20
  setup(
21
21
  name="supertable",
22
- version="2.3.4",
22
+ version="2.3.5",
23
23
  description="SuperTable — versioned data lake library for SQL analytics on Parquet + Redis.",
24
24
  long_description=long_description,
25
25
  long_description_content_type="text/markdown",
@@ -25,7 +25,7 @@ See the ``supertable.demo`` package for runnable end-to-end demos and the
25
25
  project documentation for the full API surface.
26
26
  """
27
27
 
28
- __version__ = "2.3.4"
28
+ __version__ = "2.3.5"
29
29
 
30
30
  # Re-export the core public surface so users can do ``from supertable import …``
31
31
  # instead of remembering submodule paths.
@@ -17,6 +17,12 @@ handler.setFormatter(colorlog.ColoredFormatter(
17
17
  logging.basicConfig(level=logging.INFO, handlers=[handler])
18
18
  logger = logging.getLogger(__name__)
19
19
 
20
+ # Quiet noisy third-party HTTP client loggers. At DEBUG these emit one line
21
+ # per request (connection setup + every HEAD/GET/PUT), which drowns out
22
+ # SuperTable's own logs. WARNING keeps genuine connection problems visible.
23
+ for _noisy_logger in ("urllib3", "botocore", "boto3", "s3transfer", "boto"):
24
+ logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
25
+
20
26
  _VALID_LOG_LEVELS = frozenset({"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"})
21
27
 
22
28
  @dataclass(slots=True)
@@ -0,0 +1,96 @@
1
+ import os
2
+ import sys
3
+ import tempfile
4
+
5
+ from supertable.config.settings import settings
6
+ from supertable.config.defaults import logger
7
+
8
+ # If this file is located in a subdirectory, adjust the path logic as needed.
9
+ # Currently appending ".." from __file__ to add the project root directory
10
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
11
+
12
+ # ---------- lazy home directory resolution ----------
13
+ _resolved_home: str | None = None
14
+
15
+ def _is_writable_dir(path: str) -> bool:
16
+ """Create *path* if needed and verify we can actually write a file in it.
17
+
18
+ ``os.access(..., W_OK)`` is unreliable under containers, ACLs and
19
+ root-squashed mounts, so probe with a real create+unlink: this is the
20
+ difference between a home that merely *resolves* and one DuckDB can root
21
+ its temp/spill, cache and extension dirs under.
22
+ """
23
+ try:
24
+ os.makedirs(path, exist_ok=True)
25
+ with tempfile.NamedTemporaryFile(dir=path):
26
+ pass
27
+ return True
28
+ except OSError:
29
+ return False
30
+
31
+ def _resolve_app_home() -> str:
32
+ """
33
+ Resolve, expand, and normalise the application home directory once.
34
+
35
+ The home must be *writable*, not just resolvable: DuckDB roots its
36
+ temp/spill, external-cache and extension directories here, so a
37
+ non-writable home silently breaks every write (the probe fails with
38
+ ``errno 13`` and falls back to the slow full-read path). We therefore
39
+ verify writability and, when the configured home is not usable, fall back
40
+ to ``<tempdir>/supertable`` with a loud warning rather than returning a
41
+ path that only looks valid.
42
+ """
43
+ global _resolved_home
44
+ if _resolved_home is not None:
45
+ return _resolved_home
46
+
47
+ raw = settings.SUPERTABLE_HOME
48
+ expanded = os.path.abspath(os.path.expanduser(raw))
49
+
50
+ if _is_writable_dir(expanded):
51
+ logger.debug(f"Ensured app home directory exists: {expanded}")
52
+ _resolved_home = expanded
53
+ return _resolved_home
54
+
55
+ fallback = os.path.join(tempfile.gettempdir(), "supertable")
56
+ if _is_writable_dir(fallback):
57
+ logger.warning(
58
+ f"SUPERTABLE_HOME={expanded!r} is not writable; falling back to "
59
+ f"{fallback!r}. Set SUPERTABLE_HOME to a writable directory to "
60
+ f"silence this — DuckDB temp/spill, cache and extensions live under it."
61
+ )
62
+ _resolved_home = fallback
63
+ return _resolved_home
64
+
65
+ raise RuntimeError(
66
+ f"No writable application home: tried SUPERTABLE_HOME={expanded!r} and "
67
+ f"fallback {fallback!r}. Set SUPERTABLE_HOME to a writable directory."
68
+ )
69
+
70
+ def change_to_app_home(home_dir: str | None = None) -> None:
71
+ """
72
+ Attempts to change the current working directory to `home_dir`.
73
+ If home_dir is not provided, uses the resolved app home.
74
+ Logs the outcome.
75
+ """
76
+ target = home_dir if home_dir else _resolve_app_home()
77
+ expanded_dir = os.path.expanduser(target)
78
+ try:
79
+ os.chdir(expanded_dir)
80
+ logger.debug(f"Changed working directory to {expanded_dir}")
81
+ except Exception as e:
82
+ logger.error(f"Failed to change working directory to {expanded_dir}: {e}")
83
+
84
+ # ---------- eager init (preserves original import-time behaviour) ----------
85
+ _app_home = _resolve_app_home()
86
+ change_to_app_home(_app_home)
87
+ logger.debug(f"Current working directory: {os.getcwd()}")
88
+
89
+ # ---------- public API ----------
90
+
91
+ # Kept for backward compatibility; prefer get_app_home() for the expanded path.
92
+ app_home = _app_home
93
+
94
+ def get_app_home() -> str:
95
+ """Return the fully expanded, absolute application home directory."""
96
+ return _resolve_app_home()
@@ -371,6 +371,14 @@ class DataWriter:
371
371
  profiler=profiler,
372
372
  )
373
373
  mark("overlap")
374
+ if overwrite_columns:
375
+ _snap_files = len(last_simple_table.get("resources") or [])
376
+ _cand = sum(1 for _, ov, _ in overlapping_files if ov)
377
+ logger.debug(lp(
378
+ f"step[overlap]: {_cand}/{_snap_files} existing file(s) are overwrite "
379
+ f"candidates on {overwrite_columns} "
380
+ f"(snapshot has no per-file key stats → every file is suspect)"
381
+ ))
374
382
 
375
383
  # --- Stats-driven file pruning (consumer 5a) ----------------------
376
384
  # Narrow the overwrite/delete candidate set using the external stats
@@ -387,6 +395,10 @@ class DataWriter:
387
395
  stored_stats_df = load_stats(stats_file, allow_cache=True, profiler=profiler)
388
396
  if stored_stats_df is not None and stored_stats_df.height > 0:
389
397
  probe = probe_ranges_from_df(dataframe, overwrite_columns)
398
+ _probe_desc = {
399
+ c: (f"{v[0]}[{v[1]}..{v[2]}]" if v else "unconstrained(null/unsupported)")
400
+ for c, v in probe.items()
401
+ }
390
402
  before = len(overlapping_files)
391
403
  overlapping_files = prune_overlapping_files_by_stats(
392
404
  overlapping_files,
@@ -395,8 +407,21 @@ class DataWriter:
395
407
  profiler=profiler,
396
408
  )
397
409
  pruned = before - len(overlapping_files)
410
+ logger.debug(lp(
411
+ f"step[stats-prune]: df-probe {_probe_desc} vs {stored_stats_df.height} "
412
+ f"stored stat row(s) → kept {len(overlapping_files)}/{before}, "
413
+ f"pruned {pruned} (no data file opened)"
414
+ ))
398
415
  if pruned > 0:
399
416
  logger.info(lp(f"stats pruning: skipped {pruned}/{before} candidate files"))
417
+ else:
418
+ logger.debug(lp(
419
+ "step[stats-prune]: stats artifact empty → no pruning, all candidates retained"
420
+ ))
421
+ else:
422
+ logger.debug(lp(
423
+ "step[stats-prune]: snapshot has no stats_file → no pruning, all candidates retained"
424
+ ))
400
425
  mark("stats_prune")
401
426
 
402
427
  # File cache: used only by delete_only's identify_all_rowids below.
@@ -420,6 +445,16 @@ class DataWriter:
420
445
  newer_than_col=newer_than,
421
446
  profiler=profiler,
422
447
  )
448
+ mark("resolve_overwrite")
449
+ _counts = profiler.counts
450
+ _fallback = bool(_counts.get("overwrite_resolve_fallback"))
451
+ logger.debug(lp(
452
+ f"step[probe-resolve] via {'polars-fallback' if _fallback else 'duckdb-pushdown'}: "
453
+ f"matched {_counts.get('probe_rows_matched', _counts.get('delete_rows_matched', 0))} "
454
+ f"existing row(s) on {overwrite_columns} → "
455
+ f"{len(resolved_delete_pairs or [])} (file,__rowid__) delete pair(s); "
456
+ f"{dataframe.height}/{pre_filter_count} incoming row(s) survive"
457
+ ))
423
458
  if newer_than:
424
459
  skipped = pre_filter_count - dataframe.height
425
460
  if skipped > 0:
@@ -515,6 +550,10 @@ class DataWriter:
515
550
  ]
516
551
  deleted = len(new_delete_pairs)
517
552
  mark("identify_deletes")
553
+ logger.debug(lp(
554
+ f"step[deletes]: tombstoning {deleted} live row(s) this write "
555
+ f"(excluded {len(prev_dv_rowids)} row(s) already in the deletion-vector)"
556
+ ))
518
557
 
519
558
  # 2. Write the incoming rows as a new file (insert/upsert side).
520
559
  # delete_only carries only predicate columns — nothing to insert.
@@ -531,6 +570,10 @@ class DataWriter:
531
570
  else:
532
571
  inserted = 0
533
572
  mark("write_parquet")
573
+ logger.debug(lp(
574
+ f"step[write]: appended {inserted} incoming row(s) as {len(new_resources)} "
575
+ f"new immutable file(s) (no existing data file rewritten)"
576
+ ))
534
577
 
535
578
  # 3. Carry forward + extend the deletion-vector tombstone file.
536
579
  # No new deletes → reuse the previous file (combined_df=None).
@@ -554,6 +597,10 @@ class DataWriter:
554
597
  else int(last_simple_table.get("tombstone_rows", 0) or 0)
555
598
  )
556
599
  mark("build_tombstone")
600
+ logger.debug(lp(
601
+ f"step[tombstone]: deletion-vector now {tombstone_rows} row(s) "
602
+ f"({'rewritten' if combined_tombstone_df is not None else 'carried forward unchanged'})"
603
+ ))
557
604
 
558
605
  # 3b. Eager reclamation of fully-dead files. Any existing data
559
606
  # file whose every physical row is now tombstoned is 100%
@@ -812,7 +859,7 @@ class DataWriter:
812
859
  schema_json = "{}"
813
860
  _org, _sup = self.super_table.organization, self.super_table.super_name
814
861
  self.catalog.r.set(RK.schema(_org, _sup, simple_name), schema_json)
815
- self.catalog.r.sadd(RK.table_names(_org, _sup), simple_name)
862
+ self.catalog.r.sadd(RK.meta_table_names(_org, _sup), simple_name)
816
863
  except Exception as e:
817
864
  logger.debug(f"[data-writer] schema/table_names Redis write failed: {e}")
818
865
 
@@ -862,7 +909,7 @@ class DataWriter:
862
909
  f"total={total_duration:.3f} | "
863
910
  f"convert={timings.get('convert', 0):.3f} | dedup_ts={timings.get('dedup_ts', 0):.3f} | validate={timings.get('validate', 0):.3f} | "
864
911
  f"lock={timings.get('lock', 0):.3f} | snapshot={timings.get('snapshot', 0):.3f} | "
865
- f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
912
+ f"overlap={timings.get('overlap', 0):.3f} | stats_prune={timings.get('stats_prune', 0):.3f} | resolve_overwrite={timings.get('resolve_overwrite', 0):.3f} | newer_than={timings.get('newer_than', 0):.3f} | "
866
913
  f"identify_deletes={timings.get('identify_deletes', 0):.3f} | write_parquet={timings.get('write_parquet', 0):.3f} | "
867
914
  f"build_tombstone={timings.get('build_tombstone', 0):.3f} | compact_tombstones={timings.get('compact_tombstones', 0):.3f} | compact_small={timings.get('compact_small', 0):.3f} | build_stats={timings.get('build_stats', 0):.3f} | "
868
915
  f"update_simple={timings.get('update_simple', 0):.3f} | bump_root={timings.get('bump_root', 0):.3f} | "
@@ -14,7 +14,9 @@ from enum import Enum
14
14
 
15
15
  from supertable.config import defaults
16
16
 
17
- logging.getLogger("supertable").setLevel(logging.INFO)
17
+ # Follow the configured SUPERTABLE_LOG_LEVEL (resolved in supertable.config.defaults)
18
+ # instead of hard-pinning INFO, so DEBUG surfaces the detailed write step[...] logs.
19
+ logging.getLogger("supertable").setLevel(defaults.default.LOG_LEVEL)
18
20
 
19
21
  defaults.default.IS_SHOW_TIMING = True
20
22
 
@@ -285,39 +285,35 @@ def configure_httpfs_and_s3(
285
285
  "true" if meta_cache_on else "false",
286
286
  )
287
287
 
288
- # External file cache — an in-memory cache of remote data blocks so
289
- # repeated queries do not re-download the same row groups. It is only
290
- # enabled when a size cap can be enforced: DuckDB builds without
291
- # external_file_cache_max_size (e.g. 1.5.x) cannot bound it, and an
292
- # uncapped cache grows to memory_limit on the persistent connection.
293
- # Without an enforceable cap we keep it OFF to protect memory.
288
+ # External file cache — an in-memory cache of external (e.g. remote
289
+ # Parquet) data blocks so repeated queries do not re-download the same row
290
+ # groups. Enabled whenever a cache size is configured. The cache is
291
+ # bounded by the global memory_limit (DuckDB enforces that bound), which is
292
+ # the effective cap; a dedicated per-cache cap is applied only on builds
293
+ # that expose external_file_cache_max_size (no released DuckDB through
294
+ # 1.5.x does), so in practice memory_limit is the bound.
294
295
  cache_size = settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_SIZE
295
296
  can_cap = "external_file_cache_max_size" in supported
296
- if cache_size and can_cap:
297
+ if cache_size:
297
298
  set_if_supported("enable_external_file_cache", "true")
298
- set_if_supported("external_file_cache_max_size", f"'{cache_size}'")
299
- cache_dir_raw = settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_DIR
300
- if not cache_dir_raw:
301
- # Derive from SUPERTABLE_HOME — single env var controls all paths.
302
- cache_dir_raw = os.path.join(get_app_home(), "duckdb_cache")
303
- # Expand ~ so DuckDB receives an absolute path.
304
- cache_dir = os.path.expanduser(cache_dir_raw)
305
- os.makedirs(cache_dir, exist_ok=True)
306
- set_if_supported("external_file_cache_directory", f"'{cache_dir}'")
307
- logger.info(
299
+ if can_cap:
300
+ set_if_supported("external_file_cache_max_size", f"'{cache_size}'")
301
+ cache_dir_raw = settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_DIR
302
+ if not cache_dir_raw:
303
+ # Derive from SUPERTABLE_HOME — single env var controls all paths.
304
+ cache_dir_raw = os.path.join(get_app_home(), "duckdb_cache")
305
+ # Expand ~ so DuckDB receives an absolute path.
306
+ cache_dir = os.path.expanduser(cache_dir_raw)
307
+ os.makedirs(cache_dir, exist_ok=True)
308
+ set_if_supported("external_file_cache_directory", f"'{cache_dir}'")
309
+ logger.debug(
308
310
  "[duckdb.cache] external file cache enabled"
309
- + (f" size={cache_size}" if cache_size else "")
310
- + f", dir={cache_dir}"
311
+ + (f", capped at {cache_size}" if can_cap
312
+ else f", bounded by memory_limit (size={cache_size}; "
313
+ "this DuckDB build has no dedicated cap)")
311
314
  )
312
315
  else:
313
- # Uncappable (or disabled via empty size) — turn it off explicitly so
314
- # the DuckDB 1.5.x default-on cache cannot accumulate in memory.
315
316
  set_if_supported("enable_external_file_cache", "false")
316
- if cache_size and not can_cap:
317
- logger.info(
318
- "[duckdb.cache] external file cache disabled: this DuckDB build "
319
- "cannot cap it (no external_file_cache_max_size)"
320
- )
321
317
 
322
318
 
323
319
  # =========================================================
@@ -589,14 +585,13 @@ def rewrite_query_with_hashed_tables(
589
585
  # =========================================================
590
586
 
591
587
  def _external_file_cache_cappable(con: duckdb.DuckDBPyConnection) -> bool:
592
- """True when this DuckDB build can bound the external file cache size.
593
-
594
- DuckDB 1.5.x enables ``enable_external_file_cache`` by default but does
595
- not expose ``external_file_cache_max_size``. An enabled-but-uncapped
596
- cache is held in memory (not on disk) and grows to ``memory_limit`` on
597
- the long-lived persistent connection — a sustained-memory / OOM hazard.
598
- When this returns False the cache is disabled outright rather than left
599
- running unbounded.
588
+ """True when this DuckDB build exposes a dedicated external-file-cache cap.
589
+
590
+ No released DuckDB (through 1.5.x) exposes ``external_file_cache_max_size``;
591
+ the cache is in-memory and bounded by ``memory_limit``, which DuckDB
592
+ enforces. This predicate gates only the *dedicated* per-cache size cap:
593
+ when it returns False the cache still runs, bounded by ``memory_limit``
594
+ rather than a separate cap.
600
595
  """
601
596
  try:
602
597
  return bool(con.execute(
@@ -679,16 +674,19 @@ def init_connection(
679
674
  except Exception:
680
675
  pass # older DuckDB builds may not support this setting
681
676
 
682
- # External file cache baseline. DuckDB 1.5.x turns the cache ON by
683
- # default but cannot cap it, so an uncapped in-memory cache accumulates
684
- # remote data up to memory_limit on the persistent connection. Disable
685
- # it here when uncappable; configure_httpfs_and_s3 / apply_runtime_pragmas
686
- # re-enable it (capped) only on builds that support a size cap.
687
- if not _external_file_cache_cappable(con):
688
- try:
689
- con.execute("SET enable_external_file_cache=false;")
690
- except Exception:
691
- pass
677
+ # External file cache baseline. DuckDB (>=1.3.0) ships an in-memory cache
678
+ # of external files, bounded by memory_limit (DuckDB enforces that bound).
679
+ # Honour the configured default here: enable when a cache size is set,
680
+ # otherwise off. configure_httpfs_and_s3 / apply_runtime_pragmas refine
681
+ # this (and apply a dedicated cap on builds that expose one).
682
+ try:
683
+ con.execute(
684
+ "SET enable_external_file_cache="
685
+ + ("true" if settings.SUPERTABLE_DUCKDB_EXTERNAL_CACHE_SIZE else "false")
686
+ + ";"
687
+ )
688
+ except Exception:
689
+ pass
692
690
 
693
691
  # Thread count.
694
692
  # If SUPERTABLE_DUCKDB_THREADS is set explicitly, honour it exactly.
@@ -783,17 +781,19 @@ def apply_runtime_pragmas(con: duckdb.DuckDBPyConnection, cfg) -> None:
783
781
  except Exception:
784
782
  pass
785
783
 
786
- # External file cache: only run it when the size cap is enforceable.
787
- # On DuckDB builds without external_file_cache_max_size (e.g. 1.5.x) an
788
- # enabled cache is in-memory and unbounded — it grows to memory_limit on
789
- # the persistent connection — so we disable it instead of running uncapped.
784
+ # External file cache: enable whenever the org configures a cache size.
785
+ # The cache is in-memory and bounded by memory_limit (DuckDB enforces that
786
+ # bound), which is the effective cap. A dedicated per-cache cap is applied
787
+ # only on builds that expose external_file_cache_max_size (no released
788
+ # DuckDB through 1.5.x does); otherwise memory_limit is the bound.
790
789
  cache_size = normalize_memory_size(cfg.duckdb_external_cache_size, default="")
791
- if cache_size and _external_file_cache_cappable(con):
790
+ if cache_size:
792
791
  try:
793
792
  con.execute("SET enable_external_file_cache=true;")
794
- con.execute(
795
- f"SET external_file_cache_max_size='{sanitize_sql_string(cache_size)}';"
796
- )
793
+ if _external_file_cache_cappable(con):
794
+ con.execute(
795
+ f"SET external_file_cache_max_size='{sanitize_sql_string(cache_size)}';"
796
+ )
797
797
  except Exception as e:
798
798
  logger.warning(f"[duckdb.pragma] external file cache config failed: {e}")
799
799
  else:
@@ -892,7 +892,7 @@ def run_engine_diagnostics(cfg=None, engine: str = "lite") -> Dict[str, Any]:
892
892
  ("external_file_cache_max_size supported — the file cache can be capped"
893
893
  if cappable else
894
894
  "this build has no external_file_cache_max_size — the file cache "
895
- "cannot be capped, so it is disabled to stay memory-safe"),
895
+ "runs bounded by memory_limit rather than a dedicated cap"),
896
896
  version)
897
897
  except Exception as e:
898
898
  add("version", "DuckDB version", "warn", f"version() failed: {e}")
@@ -1039,18 +1039,19 @@ def run_engine_diagnostics(cfg=None, engine: str = "lite") -> Dict[str, Any]:
1039
1039
  getattr(cfg, "duckdb_external_cache_size", ""), default=""
1040
1040
  )
1041
1041
  if efc_on and not cappable:
1042
- add("cache", "External file cache", "fail",
1043
- "Cache is ON but this build cannot cap it — it grows to the memory "
1044
- "limit and causes OOM", "on · uncapped")
1042
+ add("cache", "External file cache", "ok",
1043
+ "Cache is ON, bounded by memory_limit — this build has no "
1044
+ "dedicated cap (external_file_cache_max_size), so memory_limit "
1045
+ "is the bound", "on · memory_limit")
1045
1046
  elif efc_on and cappable:
1046
1047
  add("cache", "External file cache", "ok",
1047
1048
  f"Cache is ON and capped at {cache_cfg or 'the configured size'}",
1048
1049
  "on · capped")
1049
1050
  else:
1050
1051
  add("cache", "External file cache", "ok",
1051
- "Cache is OFF — memory-safe; remote files are re-fetched per query "
1052
- "(set a Disk cache size on a cap-capable build to speed up repeats)",
1053
- "off")
1052
+ "Cache is OFF — remote files are re-fetched per query (set "
1053
+ "SUPERTABLE_DUCKDB_EXTERNAL_CACHE_SIZE to enable, bounded by "
1054
+ "memory_limit)", "off")
1054
1055
  except Exception as e:
1055
1056
  add("cache", "External file cache", "warn", f"Could not read cache state: {e}")
1056
1057
 
@@ -225,6 +225,7 @@ def _read_parquet_safe(
225
225
  path: str,
226
226
  profiler: Optional[Profiler] = None,
227
227
  file_size: int = 0,
228
+ columns: Optional[List[str]] = None,
228
229
  ) -> Optional[polars.DataFrame]:
229
230
  p = profiler or get_null_profiler()
230
231
  if not _safe_exists(path, profiler=p):
@@ -232,7 +233,13 @@ def _read_parquet_safe(
232
233
  return None
233
234
  try:
234
235
  with p.span("io.read_parquet"):
235
- tbl = _get_storage().read_parquet(path) # -> pyarrow.Table
236
+ # Project to *columns* when given so only those column chunks are
237
+ # read (memory-bound fallback); gated so storages/test doubles that
238
+ # only accept ``path`` keep working on the unprojected paths.
239
+ tbl = (
240
+ _get_storage().read_parquet(path, columns=columns)
241
+ if columns else _get_storage().read_parquet(path)
242
+ ) # -> pyarrow.Table
236
243
  with p.span("io.arrow_to_polars"):
237
244
  df = polars.from_arrow(tbl)
238
245
  p.add("files_read", 1)
@@ -627,11 +634,12 @@ def _write_single_parquet_file(
627
634
  rows = write_df.shape[0]
628
635
  columns = write_df.shape[1]
629
636
 
630
- # Ensure target directory exists (no-op on object storage)
637
+ # Ensure the target directory exists. makedirs is idempotent on local
638
+ # storage and a no-op on object storage; calling it directly avoids a
639
+ # pointless prefix HEAD (which always 404s) on object stores.
631
640
  with p.span("write.ensure_dir"):
632
641
  try:
633
- if not _get_storage().exists(target_dir):
634
- _get_storage().makedirs(target_dir)
642
+ _get_storage().makedirs(target_dir)
635
643
  except Exception:
636
644
  pass
637
645
 
@@ -728,6 +736,7 @@ def filter_stale_incoming_rows(
728
736
  newer_than_col: str,
729
737
  file_cache: Optional[Dict[str, polars.DataFrame]] = None,
730
738
  profiler: Optional[Profiler] = None,
739
+ read_columns: Optional[List[str]] = None,
731
740
  ) -> polars.DataFrame:
732
741
  """
733
742
  Remove rows from *incoming_df* that are stale or already present in existing data.
@@ -762,7 +771,7 @@ def filter_stale_incoming_rows(
762
771
  # Read and collect relevant rows from overlapping files
763
772
  existing_parts: List[polars.DataFrame] = []
764
773
  for file_path, file_size in overlap_true_files:
765
- part = _read_parquet_safe(file_path, profiler=p, file_size=file_size)
774
+ part = _read_parquet_safe(file_path, profiler=p, file_size=file_size, columns=read_columns)
766
775
  if part is None:
767
776
  continue
768
777
  # Cache the full DataFrame for downstream reuse (avoids double-read)
@@ -880,6 +889,7 @@ def identify_deleted_rowids(
880
889
  overwrite_columns: List[str],
881
890
  file_cache: Optional[Dict[str, polars.DataFrame]] = None,
882
891
  profiler: Optional[Profiler] = None,
892
+ read_columns: Optional[List[str]] = None,
883
893
  ) -> List[Tuple[str, int]]:
884
894
  """Find the ``(file, __rowid__)`` pairs of existing rows matching a delete predicate.
885
895
 
@@ -911,7 +921,7 @@ def identify_deleted_rowids(
911
921
  if file_cache is not None and file in file_cache:
912
922
  existing_df = file_cache.get(file)
913
923
  else:
914
- existing_df = _read_parquet_safe(file, profiler=p, file_size=file_size)
924
+ existing_df = _read_parquet_safe(file, profiler=p, file_size=file_size, columns=read_columns)
915
925
  if existing_df is None:
916
926
  continue
917
927
  if ROWID_COL not in existing_df.columns:
@@ -1110,6 +1120,10 @@ def _duckdb_probe_overlap_matches(
1110
1120
  f"filename=TRUE, hive_partitioning=FALSE) AS src "
1111
1121
  f"SEMI JOIN {ik_name} AS k ON {join_cond}"
1112
1122
  )
1123
+ logging.debug(
1124
+ f"[write-probe] duckdb scan: {len(paths)} file(s), "
1125
+ f"project={select_cols}, semi-join on {incoming_keys.height} key(s)"
1126
+ )
1113
1127
  with p.span("io.duckdb_probe"):
1114
1128
  return con.execute(sql).pl()
1115
1129
 
@@ -1168,6 +1182,11 @@ def _duckdb_probe_overlap_matches(
1168
1182
  return None
1169
1183
  p.add("probe_files", len(duck_paths))
1170
1184
  p.add("probe_rows_matched", int(matched.height))
1185
+ logging.debug(
1186
+ f"[write-probe] duckdb scan matched {matched.height} existing row(s) "
1187
+ f"across {len(duck_paths)} file(s) (only key/__rowid__ columns read, "
1188
+ f"row groups skipped by footer min/max)"
1189
+ )
1171
1190
  return matched
1172
1191
 
1173
1192
 
@@ -1273,6 +1292,11 @@ def resolve_overwrite_writes(
1273
1292
  return incoming_df, []
1274
1293
 
1275
1294
  incoming_keys = incoming_df.select(overwrite_columns).unique()
1295
+ logging.debug(
1296
+ f"[write-probe] resolve: {len(overlap_true)} overlapping file(s), "
1297
+ f"{incoming_keys.height} unique incoming key(s) on {overwrite_columns}, "
1298
+ f"newer_than={newer_than_col}"
1299
+ )
1276
1300
  matched = _duckdb_probe_overlap_matches(
1277
1301
  overlap_true, overwrite_columns, newer_than_col, incoming_keys, profiler=p,
1278
1302
  )
@@ -1286,6 +1310,19 @@ def resolve_overwrite_writes(
1286
1310
 
1287
1311
  # ---- Fallback: original polars full-read path (semantics oracle) ----
1288
1312
  p.add("overwrite_resolve_fallback", 1)
1313
+ # Project reads to only the columns the fallback consumes — overwrite keys
1314
+ # (+ newer-than for stale filtering) + __rowid__ (for the delete vector) —
1315
+ # so wide tables are not fully materialised into memory. The shared
1316
+ # file_cache holds this projected union; each consumer selects its subset.
1317
+ read_columns = list(dict.fromkeys(
1318
+ list(overwrite_columns)
1319
+ + ([newer_than_col] if newer_than_col else [])
1320
+ + [ROWID_COL]
1321
+ ))
1322
+ logging.debug(
1323
+ f"[write-probe] polars full-read fallback over {len(overlap_true)} file(s), "
1324
+ f"reading only {read_columns}"
1325
+ )
1289
1326
  file_cache: Dict[str, polars.DataFrame] = {}
1290
1327
  if newer_than_col:
1291
1328
  filtered = filter_stale_incoming_rows(
@@ -1295,12 +1332,13 @@ def resolve_overwrite_writes(
1295
1332
  newer_than_col=newer_than_col,
1296
1333
  file_cache=file_cache,
1297
1334
  profiler=p,
1335
+ read_columns=read_columns,
1298
1336
  )
1299
1337
  else:
1300
1338
  filtered = incoming_df
1301
1339
  pairs = identify_deleted_rowids(
1302
1340
  filtered, overlapping_files, overwrite_columns,
1303
- file_cache=file_cache, profiler=p,
1341
+ file_cache=file_cache, profiler=p, read_columns=read_columns,
1304
1342
  )
1305
1343
  return filtered, pairs
1306
1344
 
@@ -1354,8 +1392,8 @@ def build_tombstone_file(
1354
1392
  combined = combined.unique(subset=[ROWID_COL], keep="first")
1355
1393
 
1356
1394
  try:
1357
- if not _get_storage().exists(tombstone_dir):
1358
- _get_storage().makedirs(tombstone_dir)
1395
+ # Direct makedirs (idempotent local, no-op object) — avoids a 404 prefix HEAD.
1396
+ _get_storage().makedirs(tombstone_dir)
1359
1397
  except Exception:
1360
1398
  pass
1361
1399
 
@@ -1418,8 +1456,8 @@ def reclaim_fully_dead_files(
1418
1456
  return fully_dead, None, None
1419
1457
 
1420
1458
  try:
1421
- if not _get_storage().exists(tombstone_dir):
1422
- _get_storage().makedirs(tombstone_dir)
1459
+ # Direct makedirs (idempotent local, no-op object) — avoids a 404 prefix HEAD.
1460
+ _get_storage().makedirs(tombstone_dir)
1423
1461
  except Exception:
1424
1462
  pass
1425
1463
 
@@ -1688,8 +1726,8 @@ def build_stats_file(
1688
1726
  combined = new_df
1689
1727
 
1690
1728
  try:
1691
- if not _get_storage().exists(stats_dir):
1692
- _get_storage().makedirs(stats_dir)
1729
+ # Direct makedirs (idempotent local, no-op object) — avoids a 404 prefix HEAD.
1730
+ _get_storage().makedirs(stats_dir)
1693
1731
  except Exception:
1694
1732
  pass
1695
1733
 
@@ -128,19 +128,10 @@ class SimpleTable:
128
128
  self.data_dir = os.path.join(self.simple_dir, "data")
129
129
  self.snapshot_dir = os.path.join(self.simple_dir, "snapshots")
130
130
 
131
- logger.debug(f"simple_dir: {self.simple_dir}")
132
- logger.debug(f"data_dir: {self.data_dir}")
133
- logger.debug(f"snapshot_dir: {self.snapshot_dir}")
134
-
135
131
  # Fast path: if meta:leaf exists, don't touch storage
136
132
  if self.catalog.leaf_exists(
137
133
  self.super_table.organization, self.super_table.super_name, self.simple_name
138
134
  ):
139
- logger.debug(
140
- f"[SimpleTable] Leaf exists in Redis for "
141
- f"{self.super_table.organization}/{self.super_table.super_name}/{self.simple_name}; "
142
- f"skipping storage mkdirs and bootstrap."
143
- )
144
135
  return
145
136
 
146
137
  # Read-only opt-out: refuse to bootstrap as a side effect. The