deltacat 1.1.33__tar.gz → 1.1.35__tar.gz

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (225)
  1. {deltacat-1.1.33/deltacat.egg-info → deltacat-1.1.35}/PKG-INFO +1 -1
  2. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/__init__.py +1 -1
  3. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/constants.py +16 -1
  4. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/steps/merge.py +47 -1
  5. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/content_type_params.py +17 -0
  6. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/io.py +1 -1
  7. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/resource_estimation/delta.py +19 -1
  8. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +317 -0
  9. deltacat-1.1.35/deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +253 -0
  10. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_compact_partition_incremental.py +15 -0
  11. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +15 -0
  12. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_compact_partition_rebase.py +15 -0
  13. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +15 -0
  14. {deltacat-1.1.33 → deltacat-1.1.35/deltacat.egg-info}/PKG-INFO +1 -1
  15. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat.egg-info/SOURCES.txt +1 -0
  16. {deltacat-1.1.33 → deltacat-1.1.35}/LICENSE +0 -0
  17. {deltacat-1.1.33 → deltacat-1.1.35}/MANIFEST.in +0 -0
  18. {deltacat-1.1.33 → deltacat-1.1.35}/README.md +0 -0
  19. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/__init__.py +0 -0
  20. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/clients.py +0 -0
  21. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/constants.py +0 -0
  22. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/redshift/__init__.py +0 -0
  23. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/redshift/model/__init__.py +0 -0
  24. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/redshift/model/manifest.py +0 -0
  25. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/aws/s3u.py +0 -0
  26. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/benchmarking/__init__.py +0 -0
  27. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  28. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/benchmarking/conftest.py +0 -0
  29. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/__init__.py +0 -0
  30. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/default_catalog_impl/__init__.py +0 -0
  31. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/delegate.py +0 -0
  32. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/interface.py +0 -0
  33. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/model/__init__.py +0 -0
  34. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/model/catalog.py +0 -0
  35. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/catalog/model/table_definition.py +0 -0
  36. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/__init__.py +0 -0
  37. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/__init__.py +0 -0
  38. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/compaction_session.py +0 -0
  39. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/__init__.py +0 -0
  40. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  41. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  42. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  43. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  44. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  45. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  46. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  47. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  48. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  49. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  50. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  51. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  52. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  53. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/model/table_object_store.py +0 -0
  54. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/repartition_session.py +0 -0
  55. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/steps/__init__.py +0 -0
  56. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/steps/dedupe.py +0 -0
  57. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
  58. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/steps/materialize.py +0 -0
  59. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/steps/repartition.py +0 -0
  60. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/utils/__init__.py +0 -0
  61. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/utils/io.py +0 -0
  62. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  63. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  64. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/utils/sort_key.py +0 -0
  65. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  66. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/__init__.py +0 -0
  67. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
  68. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  69. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
  70. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
  71. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
  72. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
  73. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
  74. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
  75. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
  76. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  77. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  78. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
  79. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
  80. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  81. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
  82. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/private/compaction_utils.py +0 -0
  83. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  84. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
  85. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  86. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
  87. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
  88. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
  89. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
  90. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
  91. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/merge_on_read/__init__.py +0 -0
  92. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/merge_on_read/daft.py +0 -0
  93. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/merge_on_read/model/__init__.py +0 -0
  94. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
  95. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  96. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
  97. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/resource_estimation/__init__.py +0 -0
  98. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/resource_estimation/manifest.py +0 -0
  99. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/resource_estimation/model.py +0 -0
  100. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/resource_estimation/parquet.py +0 -0
  101. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/__init__.py +0 -0
  102. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/models/__init__.py +0 -0
  103. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  104. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/models/delta_stats.py +0 -0
  105. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  106. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  107. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/models/stats_result.py +0 -0
  108. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/compute/stats/types.py +0 -0
  109. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/constants.py +0 -0
  110. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/exceptions.py +0 -0
  111. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/__init__.py +0 -0
  112. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/aws/__init__.py +0 -0
  113. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/aws/redshift/__init__.py +0 -0
  114. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/dataset.py +0 -0
  115. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/file_object_store.py +0 -0
  116. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/memcached_object_store.py +0 -0
  117. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/object_store.py +0 -0
  118. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/ray_plasma_object_store.py +0 -0
  119. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/read_api.py +0 -0
  120. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/redis_object_store.py +0 -0
  121. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/io/s3_object_store.py +0 -0
  122. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/logs.py +0 -0
  123. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/__init__.py +0 -0
  124. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/interface.py +0 -0
  125. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/__init__.py +0 -0
  126. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/delete_parameters.py +0 -0
  127. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/delta.py +0 -0
  128. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/list_result.py +0 -0
  129. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/locator.py +0 -0
  130. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/namespace.py +0 -0
  131. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/partition.py +0 -0
  132. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/partition_spec.py +0 -0
  133. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/sort_key.py +0 -0
  134. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/stream.py +0 -0
  135. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/table.py +0 -0
  136. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/table_version.py +0 -0
  137. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/transform.py +0 -0
  138. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/storage/model/types.py +0 -0
  139. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/__init__.py +0 -0
  140. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/aws/__init__.py +0 -0
  141. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/aws/test_clients.py +0 -0
  142. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/aws/test_s3u.py +0 -0
  143. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/catalog/__init__.py +0 -0
  144. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/catalog/test_default_catalog_impl.py +0 -0
  145. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/__init__.py +0 -0
  146. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
  147. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
  148. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
  149. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
  150. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor/__init__.py +0 -0
  151. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  152. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
  153. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  154. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
  155. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
  156. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  157. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  158. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
  159. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -0
  160. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
  161. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
  162. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
  163. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
  164. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
  165. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
  166. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_util_common.py +0 -0
  167. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_util_constant.py +0 -0
  168. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
  169. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/__init__.py +0 -0
  170. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
  171. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/test_file_object_store.py +0 -0
  172. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/test_memcached_object_store.py +0 -0
  173. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
  174. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/test_redis_object_store.py +0 -0
  175. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/io/test_s3_object_store.py +0 -0
  176. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/local_deltacat_storage/__init__.py +0 -0
  177. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
  178. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_exceptions.py +0 -0
  179. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_logs.py +0 -0
  180. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_utils/__init__.py +0 -0
  181. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_utils/constants.py +0 -0
  182. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_utils/pyarrow.py +0 -0
  183. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_utils/storage.py +0 -0
  184. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/test_utils/utils.py +0 -0
  185. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/__init__.py +0 -0
  186. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/data/__init__.py +0 -0
  187. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
  188. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
  189. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
  190. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  191. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_daft.py +0 -0
  192. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_metrics.py +0 -0
  193. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_placement.py +0 -0
  194. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_pyarrow.py +0 -0
  195. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  196. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/tests/utils/test_resources.py +0 -0
  197. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/types/__init__.py +0 -0
  198. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/types/media.py +0 -0
  199. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/types/partial_download.py +0 -0
  200. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/types/tables.py +0 -0
  201. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/__init__.py +0 -0
  202. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/arguments.py +0 -0
  203. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/cloudpickle.py +0 -0
  204. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/common.py +0 -0
  205. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/daft.py +0 -0
  206. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/metrics.py +0 -0
  207. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/numpy.py +0 -0
  208. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/pandas.py +0 -0
  209. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/performance.py +0 -0
  210. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/placement.py +0 -0
  211. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/pyarrow.py +0 -0
  212. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/ray_utils/__init__.py +0 -0
  213. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/ray_utils/collections.py +0 -0
  214. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/ray_utils/concurrency.py +0 -0
  215. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/ray_utils/dataset.py +0 -0
  216. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/ray_utils/performance.py +0 -0
  217. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/ray_utils/runtime.py +0 -0
  218. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/resources.py +0 -0
  219. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/s3fs.py +0 -0
  220. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat/utils/schema.py +0 -0
  221. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat.egg-info/dependency_links.txt +0 -0
  222. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat.egg-info/requires.txt +0 -0
  223. {deltacat-1.1.33 → deltacat-1.1.35}/deltacat.egg-info/top_level.txt +0 -0
  224. {deltacat-1.1.33 → deltacat-1.1.35}/setup.cfg +0 -0
  225. {deltacat-1.1.33 → deltacat-1.1.35}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.33
3
+ Version: 1.1.35
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.33"
47
+ __version__ = "1.1.35"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -1,4 +1,4 @@
1
- from deltacat.utils.common import env_bool, env_integer
1
+ from deltacat.utils.common import env_bool, env_integer, env_string
2
2
 
3
3
  TOTAL_BYTES_IN_SHA1_HASH = 20
4
4
 
@@ -92,3 +92,18 @@ DEFAULT_NUM_ROUNDS = 1
92
92
  SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
93
93
  "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
94
94
  )
95
+
96
+ # This env variable specifies whether to check bucketing spec
97
+ # compliance of the existing compacted table.
98
+ # PRINT_LOG: Enable logging if any partition is found
99
+ # to be non-compliant with the bucketing spec.
100
+ # ASSERT: Fail the job with ValidationError if the
101
+ # current compacted partition is found to be non-compliant
102
+ # with bucketing spec. Note, logging is implicitly enabled
103
+ # in this case.
104
+ BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
105
+ "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
106
+ )
107
+
108
+ BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
109
+ BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
@@ -32,6 +32,7 @@ from deltacat.utils.resources import (
32
32
  )
33
33
  from deltacat.compute.compactor_v2.utils.primary_key_index import (
34
34
  generate_pk_hash_column,
35
+ pk_digest_to_hash_bucket_index,
35
36
  )
36
37
  from deltacat.storage import (
37
38
  Delta,
@@ -47,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
47
48
  MERGE_TIME_IN_SECONDS,
48
49
  MERGE_SUCCESS_COUNT,
49
50
  MERGE_FAILURE_COUNT,
51
+ BUCKETING_SPEC_COMPLIANCE_PROFILE,
52
+ BUCKETING_SPEC_COMPLIANCE_ASSERT,
53
+ BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
50
54
  )
51
55
  from deltacat.exceptions import (
52
56
  categorize_errors,
@@ -188,9 +192,34 @@ def _merge_tables(
188
192
  return final_table
189
193
 
190
194
 
195
+ def _validate_bucketing_spec_compliance(
196
+ table: pa.Table, rcf: RoundCompletionInfo, hb_index: int, primary_keys: List[str]
197
+ ) -> None:
198
+ pki_table = generate_pk_hash_column(
199
+ [table], primary_keys=primary_keys, requires_hash=True
200
+ )[0]
201
+ for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
202
+ hash_bucket = pk_digest_to_hash_bucket_index(hash_value, rcf.hash_bucket_count)
203
+ if hash_bucket != hb_index:
204
+ logger.info(
205
+ f"{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}"
206
+ f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
207
+ f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
208
+ f"Expected hash bucket is {hb_index} but found {hash_bucket}."
209
+ )
210
+ if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
211
+ raise AssertionError(
212
+ "Hash bucket drift detected. Expected hash bucket index"
213
+ f" to be {hb_index} but found {hash_bucket}"
214
+ )
215
+ # No further checks necessary
216
+ break
217
+
218
+
191
219
  def _download_compacted_table(
192
220
  hb_index: int,
193
221
  rcf: RoundCompletionInfo,
222
+ primary_keys: List[str],
194
223
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
195
224
  deltacat_storage=unimplemented_deltacat_storage,
196
225
  deltacat_storage_kwargs: Optional[dict] = None,
@@ -214,7 +243,23 @@ def _download_compacted_table(
214
243
 
215
244
  tables.append(table)
216
245
 
217
- return pa.concat_tables(tables)
246
+ compacted_table = pa.concat_tables(tables)
247
+ check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
248
+ BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
249
+ BUCKETING_SPEC_COMPLIANCE_ASSERT,
250
+ ]
251
+
252
+ logger.debug(
253
+ f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
254
+ f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
255
+ )
256
+
257
+ # Bucketing spec compliance isn't required without primary keys
258
+ if primary_keys and check_bucketing_spec:
259
+ _validate_bucketing_spec_compliance(
260
+ compacted_table, rcf, hb_index, primary_keys
261
+ )
262
+ return compacted_table
218
263
 
219
264
 
220
265
  def _copy_all_manifest_files_from_old_hash_buckets(
@@ -543,6 +588,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
543
588
  compacted_table = _download_compacted_table(
544
589
  hb_index=merge_file_group.hb_index,
545
590
  rcf=input.round_completion_info,
591
+ primary_keys=input.primary_keys,
546
592
  read_kwargs_provider=input.read_kwargs_provider,
547
593
  deltacat_storage=input.deltacat_storage,
548
594
  deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -5,6 +5,7 @@ from deltacat.compute.compactor_v2.constants import (
5
5
  TASK_MAX_PARALLELISM,
6
6
  MAX_PARQUET_METADATA_SIZE,
7
7
  )
8
+ from deltacat.utils.common import ReadKwargsProvider
8
9
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
9
10
  from deltacat import logs
10
11
  from deltacat.storage import (
@@ -75,11 +76,21 @@ def _download_parquet_metadata_for_manifest_entry(
75
76
  entry_index: int,
76
77
  deltacat_storage: unimplemented_deltacat_storage,
77
78
  deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
79
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
78
80
  ) -> Dict[str, Any]:
81
+ logger.info(
82
+ f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
83
+ )
84
+ if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
85
+ logger.info(
86
+ "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
87
+ )
88
+ deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
79
89
  pq_file = deltacat_storage.download_delta_manifest_entry(
80
90
  delta,
81
91
  entry_index=entry_index,
82
92
  table_type=TableType.PYARROW_PARQUET,
93
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
83
94
  **deltacat_storage_kwargs,
84
95
  )
85
96
 
@@ -97,11 +108,15 @@ def append_content_type_params(
97
108
  max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
98
109
  deltacat_storage=unimplemented_deltacat_storage,
99
110
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
111
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
100
112
  ) -> bool:
101
113
  """
102
114
  This operation appends content type params into the delta entry. Note
103
115
  that this operation can be time consuming, hence we cache it in a Ray actor.
104
116
  """
117
+ logger.info(
118
+ f"Appending the content type params for Delta with locator {delta.locator}..."
119
+ )
105
120
 
106
121
  if not delta.meta:
107
122
  logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -159,6 +174,7 @@ def append_content_type_params(
159
174
 
160
175
  def input_provider(index, item) -> Dict:
161
176
  return {
177
+ "file_reader_kwargs_provider": file_reader_kwargs_provider,
162
178
  "deltacat_storage_kwargs": deltacat_storage_kwargs,
163
179
  "deltacat_storage": deltacat_storage,
164
180
  "delta": delta,
@@ -168,6 +184,7 @@ def append_content_type_params(
168
184
  logger.info(
169
185
  f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
170
186
  )
187
+
171
188
  pq_files_promise = invoke_parallel(
172
189
  entry_indices_to_download,
173
190
  ray_task=_download_parquet_metadata_for_manifest_entry,
@@ -101,7 +101,6 @@ def create_uniform_input_deltas(
101
101
  delta_manifest_entries_count = 0
102
102
  estimated_da_bytes = 0
103
103
  input_da_list = []
104
-
105
104
  for delta in input_deltas:
106
105
  if (
107
106
  compact_partition_params.enable_input_split
@@ -118,6 +117,7 @@ def create_uniform_input_deltas(
118
117
  deltacat_storage_kwargs=deltacat_storage_kwargs,
119
118
  task_max_parallelism=compact_partition_params.task_max_parallelism,
120
119
  max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
120
+ file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
121
121
  )
122
122
 
123
123
  manifest_entries = delta.manifest.entries
@@ -93,11 +93,29 @@ def _estimate_resources_required_to_process_delta_using_type_params(
93
93
  on_disk_size_bytes=delta.meta.content_length,
94
94
  ),
95
95
  )
96
-
96
+ file_reader_kwargs_provider = kwargs.get(
97
+ "file_reader_kwargs_provider"
98
+ ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
99
+
100
+ """
101
+ NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
102
+ 1. Nested within deltacat_storage_kwargs during resource estimation
103
+ 2. As a top-level attribute of CompactPartitionsParams during compaction
104
+
105
+ This creates an inconsistent parameter path between resource estimation and compaction flows.
106
+ As a long-term solution, this should be unified to use a single consistent path (either always
107
+ nested in deltacat_storage_kwargs or always as a top-level parameter).
108
+
109
+ For now, this implementation handles the resource estimation case by:
110
+ 1. First checking for file_reader_kwargs_provider as a direct kwarg
111
+ 2. Falling back to deltacat_storage_kwargs if not found
112
+ This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
113
+ """
97
114
  appended = append_content_type_params(
98
115
  delta=delta,
99
116
  deltacat_storage=deltacat_storage,
100
117
  deltacat_storage_kwargs=deltacat_storage_kwargs,
118
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
101
119
  )
102
120
 
103
121
  if not appended:
@@ -4,9 +4,11 @@ import os
4
4
  import pyarrow as pa
5
5
  import pytest
6
6
  import boto3
7
+ import json
7
8
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
8
9
  CompactionSessionAuditInfo,
9
10
  )
11
+ from deltacat.exceptions import ValidationError
10
12
  from boto3.resources.base import ServiceResource
11
13
  import deltacat.tests.local_deltacat_storage as ds
12
14
  from deltacat.types.media import ContentType
@@ -88,6 +90,17 @@ def disable_sha1(monkeypatch):
88
90
  )
89
91
 
90
92
 
93
+ @pytest.fixture(scope="function")
94
+ def enable_bucketing_spec_validation(monkeypatch):
95
+ import deltacat.compute.compactor_v2.steps.merge
96
+
97
+ monkeypatch.setattr(
98
+ deltacat.compute.compactor_v2.steps.merge,
99
+ "BUCKETING_SPEC_COMPLIANCE_PROFILE",
100
+ "ASSERT",
101
+ )
102
+
103
+
91
104
  class TestCompactionSession:
92
105
  """
93
106
  This class adds specific tests that aren't part of the parametrized test suite.
@@ -689,3 +702,307 @@ class TestCompactionSession:
689
702
  incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
690
703
  )
691
704
  assert incremental_rcf.compacted_pyarrow_write_result.records == 4
705
+
706
def test_compact_partition_when_bucket_spec_validation_fails(
    self,
    s3_resource,
    local_deltacat_storage_kwargs,
    enable_bucketing_spec_validation,
):
    """
    Assert that compaction raises a ValidationError when bucketing spec
    validation is enabled and the previous round completion file (RCF) points
    records at a hash bucket they do not belong to.

    Flow:
        1. Run a backfill (rebase) compaction to produce an RCF.
        2. Overwrite the RCF's "hbIndexToEntryRange" so every entry is claimed
           by hash bucket 1.
        3. Run an incremental compaction and expect a "Hash bucket drift
           detected" ValidationError.

    Args:
        s3_resource: fixture used to read the RCF and to write the tampered
            copy back via Bucket(...).put_object.
        local_deltacat_storage_kwargs: kwargs forwarded to every local
            deltacat storage call and into the compaction params.
        enable_bucketing_spec_validation: fixture that sets the merge step's
            BUCKETING_SPEC_COMPLIANCE_PROFILE to "ASSERT".
    """

    def build_params(
        destination_locator, delta_to_compact, records_per_file, rebase_delta=None
    ):
        # Both compaction runs share every parameter except the destination,
        # the delta being compacted, the file size and the rebase source.
        return CompactPartitionParams.of(
            {
                "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                "compacted_file_content_type": ContentType.PARQUET,
                "dd_max_parallelism_ratio": 1.0,
                "deltacat_storage": ds,
                "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                "destination_partition_locator": destination_locator,
                "drop_duplicates": True,
                "hash_bucket_count": 4,
                "last_stream_position_to_compact": delta_to_compact.stream_position,
                "list_deltas_kwargs": {
                    **local_deltacat_storage_kwargs,
                    "equivalent_table_types": [],
                },
                "primary_keys": ["pk"],
                "rebase_source_partition_locator": (
                    rebase_delta.partition_locator
                    if rebase_delta is not None
                    else None
                ),
                "rebase_source_partition_high_watermark": (
                    rebase_delta.stream_position
                    if rebase_delta is not None
                    else None
                ),
                "records_per_compacted_file": records_per_file,
                "s3_client_kwargs": {},
                "source_partition_locator": delta_to_compact.partition_locator,
            }
        )

    # setup
    staged_source = stage_partition_from_file_paths(
        self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
    )

    source_delta = commit_delta_to_staged_partition(
        staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
    )

    staged_dest = stage_partition_from_file_paths(
        self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
    )
    dest_partition = ds.commit_partition(
        staged_dest, **local_deltacat_storage_kwargs
    )

    # action
    rcf_url = compact_partition(
        build_params(
            dest_partition.locator,
            source_delta,
            records_per_file=1,
            rebase_delta=source_delta,
        )
    )

    backfill_rcf = get_rcf(s3_resource, rcf_url)
    # str.strip("s3://") would remove any of the characters 's', '3', ':' and
    # '/' from BOTH ends instead of the literal scheme prefix, which can eat
    # into the bucket name or the key. Slice the prefix off explicitly.
    s3_scheme = "s3://"
    rcf_path = rcf_url[len(s3_scheme):] if rcf_url.startswith(s3_scheme) else rcf_url
    bucket, backfill_key1, backfill_key2 = rcf_path.split("/")
    # Move the records to different hash buckets to simulate a validation failure.
    backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
    s3_resource.Bucket(bucket).put_object(
        Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
    )

    # Now run an incremental compaction on top of the tampered RCF.
    new_source_delta = commit_delta_to_partition(
        source_delta.partition_locator,
        [self.INCREMENTAL_FILE_PATH],
        **local_deltacat_storage_kwargs,
    )

    new_destination_partition = ds.get_partition(
        dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
    )

    with pytest.raises(ValidationError) as excinfo:
        compact_partition(
            build_params(
                new_destination_partition.locator,
                new_source_delta,
                records_per_file=4000,
            )
        )

    assert (
        "Hash bucket drift detected. Expected hash bucket index to be 1 but found 0"
        in str(excinfo.value)
    )
810
+
811
def test_compact_partition_when_bucket_spec_validation_fails_but_env_variable_disabled(
    self,
    s3_resource,
    local_deltacat_storage_kwargs,
):
    """
    Assert that a tampered round completion file (RCF) does not make
    compaction fail when bucketing spec validation has not been switched on.

    This test performs the same backfill, RCF tampering and incremental
    compaction as the failing-validation scenario, but deliberately does NOT
    request the enable_bucketing_spec_validation fixture, so the merge step's
    compliance profile keeps its module default.
    NOTE(review): presumably that default is a non-asserting profile; confirm
    against the compactor_v2 constants.

    Args:
        s3_resource: fixture used to read the RCF and to write the tampered
            copy back via Bucket(...).put_object.
        local_deltacat_storage_kwargs: kwargs forwarded to every local
            deltacat storage call and into the compaction params.
    """

    def build_params(
        destination_locator, delta_to_compact, records_per_file, rebase_delta=None
    ):
        # Both compaction runs share every parameter except the destination,
        # the delta being compacted, the file size and the rebase source.
        return CompactPartitionParams.of(
            {
                "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                "compacted_file_content_type": ContentType.PARQUET,
                "dd_max_parallelism_ratio": 1.0,
                "deltacat_storage": ds,
                "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                "destination_partition_locator": destination_locator,
                "drop_duplicates": True,
                "hash_bucket_count": 4,
                "last_stream_position_to_compact": delta_to_compact.stream_position,
                "list_deltas_kwargs": {
                    **local_deltacat_storage_kwargs,
                    "equivalent_table_types": [],
                },
                "primary_keys": ["pk"],
                "rebase_source_partition_locator": (
                    rebase_delta.partition_locator
                    if rebase_delta is not None
                    else None
                ),
                "rebase_source_partition_high_watermark": (
                    rebase_delta.stream_position
                    if rebase_delta is not None
                    else None
                ),
                "records_per_compacted_file": records_per_file,
                "s3_client_kwargs": {},
                "source_partition_locator": delta_to_compact.partition_locator,
            }
        )

    # setup
    staged_source = stage_partition_from_file_paths(
        self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
    )

    source_delta = commit_delta_to_staged_partition(
        staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
    )

    staged_dest = stage_partition_from_file_paths(
        self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
    )
    dest_partition = ds.commit_partition(
        staged_dest, **local_deltacat_storage_kwargs
    )

    # action
    rcf_url = compact_partition(
        build_params(
            dest_partition.locator,
            source_delta,
            records_per_file=1,
            rebase_delta=source_delta,
        )
    )

    backfill_rcf = get_rcf(s3_resource, rcf_url)
    # str.strip("s3://") would remove any of the characters 's', '3', ':' and
    # '/' from BOTH ends instead of the literal scheme prefix, which can eat
    # into the bucket name or the key. Slice the prefix off explicitly.
    s3_scheme = "s3://"
    rcf_path = rcf_url[len(s3_scheme):] if rcf_url.startswith(s3_scheme) else rcf_url
    bucket, backfill_key1, backfill_key2 = rcf_path.split("/")
    # Move the records to different hash buckets to simulate a validation failure.
    backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
    s3_resource.Bucket(bucket).put_object(
        Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
    )

    # Now run an incremental compaction on top of the tampered RCF; it is
    # expected to complete without raising.
    new_source_delta = commit_delta_to_partition(
        source_delta.partition_locator,
        [self.INCREMENTAL_FILE_PATH],
        **local_deltacat_storage_kwargs,
    )

    new_destination_partition = ds.get_partition(
        dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
    )

    new_rcf = compact_partition(
        build_params(
            new_destination_partition.locator,
            new_source_delta,
            records_per_file=4000,
        )
    )

    incremental_rcf = get_rcf(s3_resource, new_rcf)
    assert incremental_rcf.hash_bucket_count == 4
    assert len(incremental_rcf.hb_index_to_entry_range) == 2
912
+
913
def test_compact_partition_when_bucket_spec_validation_succeeds(
    self,
    s3_resource,
    local_deltacat_storage_kwargs,
    enable_bucketing_spec_validation,
):
    """
    A test case which asserts the bucketing spec validation does not throw
    an error when the validation succeeds.

    A backfill (rebase) compaction is followed by an incremental compaction
    with validation enabled and an untouched round completion file (RCF);
    both runs must finish and report a hash bucket count of 4.
    """

    def make_params(target_locator, delta, file_record_limit, rebase_from=None):
        # The two compaction runs differ only in destination, compacted delta,
        # records per file and whether a rebase source is supplied.
        if rebase_from is None:
            rebase_locator = None
            rebase_high_watermark = None
        else:
            rebase_locator = rebase_from.partition_locator
            rebase_high_watermark = rebase_from.stream_position

        list_deltas_kwargs = {
            **local_deltacat_storage_kwargs,
            "equivalent_table_types": [],
        }
        return CompactPartitionParams.of(
            {
                "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
                "compacted_file_content_type": ContentType.PARQUET,
                "dd_max_parallelism_ratio": 1.0,
                "deltacat_storage": ds,
                "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
                "destination_partition_locator": target_locator,
                "drop_duplicates": True,
                "hash_bucket_count": 4,
                "last_stream_position_to_compact": delta.stream_position,
                "list_deltas_kwargs": list_deltas_kwargs,
                "primary_keys": ["pk"],
                "rebase_source_partition_locator": rebase_locator,
                "rebase_source_partition_high_watermark": rebase_high_watermark,
                "records_per_compacted_file": file_record_limit,
                "s3_client_kwargs": {},
                "source_partition_locator": delta.partition_locator,
            }
        )

    # setup
    source_stage = stage_partition_from_file_paths(
        self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
    )
    backfill_delta = commit_delta_to_staged_partition(
        source_stage, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
    )

    destination_stage = stage_partition_from_file_paths(
        self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
    )
    committed_destination = ds.commit_partition(
        destination_stage, **local_deltacat_storage_kwargs
    )

    # action: backfill compaction
    backfill_rcf_url = compact_partition(
        make_params(
            committed_destination.locator,
            backfill_delta,
            file_record_limit=1,
            rebase_from=backfill_delta,
        )
    )

    backfill_rcf = get_rcf(s3_resource, backfill_rcf_url)
    assert backfill_rcf.hash_bucket_count == 4

    # Now run an incremental compaction and verify if the previous RCF was read properly.
    incremental_delta = commit_delta_to_partition(
        backfill_delta.partition_locator,
        [self.INCREMENTAL_FILE_PATH],
        **local_deltacat_storage_kwargs,
    )

    refreshed_destination = ds.get_partition(
        committed_destination.stream_locator, [], **local_deltacat_storage_kwargs
    )

    incremental_rcf_url = compact_partition(
        make_params(
            refreshed_destination.locator,
            incremental_delta,
            file_record_limit=4000,
        )
    )

    incremental_rcf = get_rcf(s3_resource, incremental_rcf_url)
    assert incremental_rcf.hash_bucket_count == 4