deltacat 1.1.30__tar.gz → 1.1.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225) hide show
  1. {deltacat-1.1.30/deltacat.egg-info → deltacat-1.1.32}/PKG-INFO +1 -1
  2. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/__init__.py +1 -1
  3. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/task_options.py +43 -23
  4. deltacat-1.1.32/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +305 -0
  5. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_pyarrow.py +106 -4
  6. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/pyarrow.py +11 -5
  7. {deltacat-1.1.30 → deltacat-1.1.32/deltacat.egg-info}/PKG-INFO +1 -1
  8. deltacat-1.1.30/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -33
  9. {deltacat-1.1.30 → deltacat-1.1.32}/LICENSE +0 -0
  10. {deltacat-1.1.30 → deltacat-1.1.32}/MANIFEST.in +0 -0
  11. {deltacat-1.1.30 → deltacat-1.1.32}/README.md +0 -0
  12. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/__init__.py +0 -0
  13. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/clients.py +0 -0
  14. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/constants.py +0 -0
  15. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/redshift/__init__.py +0 -0
  16. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/redshift/model/__init__.py +0 -0
  17. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/redshift/model/manifest.py +0 -0
  18. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/aws/s3u.py +0 -0
  19. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/benchmarking/__init__.py +0 -0
  20. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  21. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/benchmarking/conftest.py +0 -0
  22. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/__init__.py +0 -0
  23. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/default_catalog_impl/__init__.py +0 -0
  24. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/delegate.py +0 -0
  25. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/interface.py +0 -0
  26. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/model/__init__.py +0 -0
  27. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/model/catalog.py +0 -0
  28. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/catalog/model/table_definition.py +0 -0
  29. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/__init__.py +0 -0
  30. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/__init__.py +0 -0
  31. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/compaction_session.py +0 -0
  32. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/__init__.py +0 -0
  33. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  34. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  35. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  36. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  37. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  38. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  39. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  40. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  41. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  42. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  43. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  44. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  45. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  46. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/model/table_object_store.py +0 -0
  47. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/repartition_session.py +0 -0
  48. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/steps/__init__.py +0 -0
  49. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/steps/dedupe.py +0 -0
  50. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
  51. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/steps/materialize.py +0 -0
  52. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/steps/repartition.py +0 -0
  53. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/utils/__init__.py +0 -0
  54. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/utils/io.py +0 -0
  55. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  56. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  57. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/utils/sort_key.py +0 -0
  58. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  59. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/__init__.py +0 -0
  60. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
  61. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/constants.py +0 -0
  62. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  63. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
  64. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
  65. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
  66. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
  67. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
  68. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
  69. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
  70. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  71. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  72. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
  73. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
  74. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  75. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
  76. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/private/compaction_utils.py +0 -0
  77. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  78. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
  79. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/steps/merge.py +0 -0
  80. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  81. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
  82. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
  83. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
  84. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/io.py +0 -0
  85. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
  86. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
  87. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/merge_on_read/__init__.py +0 -0
  88. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/merge_on_read/daft.py +0 -0
  89. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/merge_on_read/model/__init__.py +0 -0
  90. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
  91. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  92. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
  93. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/resource_estimation/__init__.py +0 -0
  94. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/resource_estimation/delta.py +0 -0
  95. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/resource_estimation/manifest.py +0 -0
  96. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/resource_estimation/model.py +0 -0
  97. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/resource_estimation/parquet.py +0 -0
  98. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/__init__.py +0 -0
  99. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/models/__init__.py +0 -0
  100. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  101. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/models/delta_stats.py +0 -0
  102. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  103. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  104. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/models/stats_result.py +0 -0
  105. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/compute/stats/types.py +0 -0
  106. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/constants.py +0 -0
  107. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/exceptions.py +0 -0
  108. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/__init__.py +0 -0
  109. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/aws/__init__.py +0 -0
  110. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/aws/redshift/__init__.py +0 -0
  111. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/dataset.py +0 -0
  112. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/file_object_store.py +0 -0
  113. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/memcached_object_store.py +0 -0
  114. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/object_store.py +0 -0
  115. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/ray_plasma_object_store.py +0 -0
  116. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/read_api.py +0 -0
  117. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/redis_object_store.py +0 -0
  118. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/io/s3_object_store.py +0 -0
  119. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/logs.py +0 -0
  120. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/__init__.py +0 -0
  121. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/interface.py +0 -0
  122. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/__init__.py +0 -0
  123. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/delete_parameters.py +0 -0
  124. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/delta.py +0 -0
  125. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/list_result.py +0 -0
  126. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/locator.py +0 -0
  127. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/namespace.py +0 -0
  128. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/partition.py +0 -0
  129. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/partition_spec.py +0 -0
  130. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/sort_key.py +0 -0
  131. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/stream.py +0 -0
  132. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/table.py +0 -0
  133. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/table_version.py +0 -0
  134. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/transform.py +0 -0
  135. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/storage/model/types.py +0 -0
  136. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/__init__.py +0 -0
  137. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/aws/__init__.py +0 -0
  138. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/aws/test_clients.py +0 -0
  139. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/aws/test_s3u.py +0 -0
  140. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/catalog/__init__.py +0 -0
  141. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/catalog/test_default_catalog_impl.py +0 -0
  142. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/__init__.py +0 -0
  143. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
  144. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
  145. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
  146. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
  147. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor/__init__.py +0 -0
  148. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  149. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
  150. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  151. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
  152. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
  153. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  154. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -0
  155. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  156. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
  157. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -0
  158. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
  159. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
  160. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
  161. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
  162. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_compact_partition_incremental.py +0 -0
  163. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +0 -0
  164. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
  165. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_compact_partition_rebase.py +0 -0
  166. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +0 -0
  167. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_util_common.py +0 -0
  168. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_util_constant.py +0 -0
  169. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
  170. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/__init__.py +0 -0
  171. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
  172. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/test_file_object_store.py +0 -0
  173. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/test_memcached_object_store.py +0 -0
  174. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
  175. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/test_redis_object_store.py +0 -0
  176. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/io/test_s3_object_store.py +0 -0
  177. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/local_deltacat_storage/__init__.py +0 -0
  178. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
  179. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_exceptions.py +0 -0
  180. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_logs.py +0 -0
  181. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_utils/__init__.py +0 -0
  182. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_utils/constants.py +0 -0
  183. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_utils/pyarrow.py +0 -0
  184. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_utils/storage.py +0 -0
  185. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/test_utils/utils.py +0 -0
  186. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/__init__.py +0 -0
  187. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/data/__init__.py +0 -0
  188. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
  189. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
  190. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
  191. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  192. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_daft.py +0 -0
  193. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_metrics.py +0 -0
  194. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_placement.py +0 -0
  195. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  196. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/tests/utils/test_resources.py +0 -0
  197. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/types/__init__.py +0 -0
  198. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/types/media.py +0 -0
  199. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/types/partial_download.py +0 -0
  200. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/types/tables.py +0 -0
  201. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/__init__.py +0 -0
  202. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/arguments.py +0 -0
  203. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/cloudpickle.py +0 -0
  204. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/common.py +0 -0
  205. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/daft.py +0 -0
  206. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/metrics.py +0 -0
  207. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/numpy.py +0 -0
  208. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/pandas.py +0 -0
  209. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/performance.py +0 -0
  210. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/placement.py +0 -0
  211. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/ray_utils/__init__.py +0 -0
  212. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/ray_utils/collections.py +0 -0
  213. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/ray_utils/concurrency.py +0 -0
  214. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/ray_utils/dataset.py +0 -0
  215. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/ray_utils/performance.py +0 -0
  216. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/ray_utils/runtime.py +0 -0
  217. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/resources.py +0 -0
  218. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/s3fs.py +0 -0
  219. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat/utils/schema.py +0 -0
  220. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat.egg-info/SOURCES.txt +0 -0
  221. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat.egg-info/dependency_links.txt +0 -0
  222. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat.egg-info/requires.txt +0 -0
  223. {deltacat-1.1.30 → deltacat-1.1.32}/deltacat.egg-info/top_level.txt +0 -0
  224. {deltacat-1.1.30 → deltacat-1.1.32}/setup.cfg +0 -0
  225. {deltacat-1.1.30 → deltacat-1.1.32}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.30
3
+ Version: 1.1.32
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.30"
47
+ __version__ = "1.1.32"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -1,11 +1,16 @@
1
1
  import logging
2
2
  from typing import Dict, Optional, List, Tuple, Any
3
3
  from deltacat import logs
4
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
5
+ from deltacat.compute.compactor_v2.constants import (
6
+ AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
7
+ )
4
8
  from deltacat.compute.compactor_v2.model.merge_file_group import (
5
9
  LocalMergeFileGroupsProvider,
6
10
  )
7
11
  from deltacat.storage import (
8
12
  Manifest,
13
+ ManifestEntry,
9
14
  interface as unimplemented_deltacat_storage,
10
15
  )
11
16
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -81,16 +86,27 @@ def _get_merge_task_options(
81
86
  and compacted_delta_manifest
82
87
  and round_completion_info.hb_index_to_entry_range
83
88
  ):
84
-
85
- previous_inflation = (
86
- round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
87
- / round_completion_info.compacted_pyarrow_write_result.file_bytes
89
+ logger.debug_conditional(
90
+ f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
91
+ memory_logs_enabled,
92
+ )
93
+ previous_inflation: float = (
94
+ (
95
+ round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
96
+ / round_completion_info.compacted_pyarrow_write_result.file_bytes
97
+ )
98
+ if round_completion_info.compacted_pyarrow_write_result.file_bytes
99
+ else PYARROW_INFLATION_MULTIPLIER
88
100
  )
89
101
  debug_memory_params["previous_inflation"] = previous_inflation
90
102
 
91
- average_record_size = (
92
- round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
93
- / round_completion_info.compacted_pyarrow_write_result.records
103
+ average_record_size: float = (
104
+ (
105
+ round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
106
+ / round_completion_info.compacted_pyarrow_write_result.records
107
+ )
108
+ if round_completion_info.compacted_pyarrow_write_result.records
109
+ else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
94
110
  )
95
111
  debug_memory_params["average_record_size"] = average_record_size
96
112
 
@@ -106,31 +122,36 @@ def _get_merge_task_options(
106
122
  str(hb_idx)
107
123
  ]
108
124
  for entry_index in range(entry_start, entry_end):
109
- entry = compacted_delta_manifest.entries[entry_index]
110
-
111
- current_entry_size = estimate_manifest_entry_size_bytes(
112
- entry=entry,
113
- operation_type=OperationType.PYARROW_DOWNLOAD,
114
- estimate_resources_params=estimate_resources_params,
125
+ entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
126
+ current_entry_size: float = (
127
+ estimate_manifest_entry_size_bytes(
128
+ entry=entry,
129
+ operation_type=OperationType.PYARROW_DOWNLOAD,
130
+ estimate_resources_params=estimate_resources_params,
131
+ )
132
+ or 0.0
115
133
  )
116
- current_entry_rows = estimate_manifest_entry_num_rows(
117
- entry=entry,
118
- operation_type=OperationType.PYARROW_DOWNLOAD,
119
- estimate_resources_params=estimate_resources_params,
134
+ current_entry_rows: int = (
135
+ estimate_manifest_entry_num_rows(
136
+ entry=entry,
137
+ operation_type=OperationType.PYARROW_DOWNLOAD,
138
+ estimate_resources_params=estimate_resources_params,
139
+ )
140
+ or 0
120
141
  )
121
-
142
+ # NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
122
143
  data_size += current_entry_size
123
144
  num_rows += current_entry_rows
124
-
125
145
  if primary_keys:
126
- pk_size = estimate_manifest_entry_column_size_bytes(
146
+ pk_size: Optional[
147
+ float
148
+ ] = estimate_manifest_entry_column_size_bytes(
127
149
  entry=entry,
128
150
  columns=primary_keys,
129
151
  operation_type=OperationType.PYARROW_DOWNLOAD,
130
152
  estimate_resources_params=estimate_resources_params,
131
153
  )
132
-
133
- if pk_size is None:
154
+ if not pk_size:
134
155
  pk_size_bytes += current_entry_size
135
156
  else:
136
157
  pk_size_bytes += pk_size
@@ -159,7 +180,6 @@ def _get_merge_task_options(
159
180
  f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
160
181
  memory_logs_enabled,
161
182
  )
162
-
163
183
  return _get_task_options(0.01, total_memory, ray_custom_resources)
164
184
 
165
185
 
@@ -0,0 +1,305 @@
1
+ import unittest
2
+ import ray
3
+ from deltacat.compute.compactor_v2.utils.task_options import (
4
+ _get_task_options,
5
+ _get_merge_task_options,
6
+ logger,
7
+ )
8
+ from deltacat.compute.resource_estimation.model import (
9
+ EstimateResourcesParams,
10
+ ResourceEstimationMethod,
11
+ )
12
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
13
+ from deltacat.compute.compactor import (
14
+ PyArrowWriteResult,
15
+ RoundCompletionInfo,
16
+ )
17
+ from deltacat.types.media import (
18
+ ContentType,
19
+ ContentEncoding,
20
+ )
21
+ from deltacat.storage import (
22
+ DeltaLocator,
23
+ Manifest,
24
+ ManifestMeta,
25
+ ManifestEntry,
26
+ ManifestEntryList,
27
+ PartitionValues,
28
+ )
29
+ from unittest.mock import MagicMock
30
+ from typing import Optional
31
+
32
+ from deltacat.compute.compactor_v2.constants import (
33
+ AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
34
+ )
35
+
36
+
37
+ @ray.remote
38
+ def valid_func():
39
+ return 2
40
+
41
+
42
+ @ray.remote
43
+ def throwing_func():
44
+ raise ConnectionAbortedError()
45
+
46
+
47
+ class TestTaskOptions(unittest.TestCase):
48
+ TEST_INDEX = 0
49
+ TEST_HB_GROUP_IDX = 0
50
+ TEST_STREAM_POSITION = 1_000_000
51
+ TEST_NUM_HASH_GROUPS = 1
52
+
53
+ @classmethod
54
+ def setUpClass(cls):
55
+ ray.init(local_mode=True, ignore_reinit_error=True)
56
+ super().setUpClass()
57
+
58
+ @classmethod
59
+ def tearDownClass(cls) -> None:
60
+ ray.shutdown()
61
+
62
+ def _make_estimate_resource_params(
63
+ cls,
64
+ resource_estimation_method: Optional[
65
+ ResourceEstimationMethod
66
+ ] = ResourceEstimationMethod.DEFAULT,
67
+ previous_inflation: Optional[int] = 7,
68
+ average_record_size_bytes: Optional[int] = 1000,
69
+ ):
70
+ return EstimateResourcesParams.of(
71
+ resource_estimation_method=resource_estimation_method,
72
+ previous_inflation=previous_inflation,
73
+ average_record_size_bytes=average_record_size_bytes,
74
+ )
75
+
76
+ def _make_manifest(
77
+ self,
78
+ source_content_length: Optional[int] = 1000,
79
+ content_type: Optional[ContentType] = ContentType.PARQUET,
80
+ content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
81
+ partition_values: Optional[PartitionValues] = None,
82
+ uri: Optional[str] = "test",
83
+ url: Optional[str] = "test",
84
+ author: Optional[str] = "foo",
85
+ entry_uuid: Optional[str] = "foo",
86
+ manifest_uuid: Optional[str] = "bar",
87
+ ) -> Manifest:
88
+ meta = ManifestMeta.of(
89
+ 10,
90
+ 10,
91
+ content_type=content_type,
92
+ content_encoding=content_encoding,
93
+ source_content_length=source_content_length,
94
+ partition_values=partition_values,
95
+ )
96
+
97
+ return Manifest.of(
98
+ entries=ManifestEntryList.of(
99
+ [
100
+ ManifestEntry.of(
101
+ uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
102
+ )
103
+ ]
104
+ ),
105
+ author=author,
106
+ uuid=manifest_uuid,
107
+ )
108
+
109
+ def make_round_completion_info(
110
+ self,
111
+ high_watermark: Optional[int] = 1_000_000,
112
+ compacted_delta_locator: Optional[DeltaLocator] = None,
113
+ records_written: Optional[int] = 10,
114
+ bytes_written: Optional[int] = 10,
115
+ files_written: Optional[int] = 10,
116
+ rows_dropped: Optional[int] = 10,
117
+ sort_keys_bit_width: Optional[int] = 0,
118
+ hash_bucket_count: Optional[int] = 1,
119
+ hb_index_to_entry_range: Optional[dict] = None,
120
+ ) -> RoundCompletionInfo:
121
+ if compacted_delta_locator is None:
122
+ compacted_delta_locator = MagicMock(spec=DeltaLocator)
123
+
124
+ hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
125
+
126
+ return RoundCompletionInfo.of(
127
+ compacted_delta_locator=compacted_delta_locator,
128
+ high_watermark=high_watermark,
129
+ compacted_pyarrow_write_result=PyArrowWriteResult.of(
130
+ records_written, bytes_written, files_written, rows_dropped
131
+ ),
132
+ sort_keys_bit_width=sort_keys_bit_width,
133
+ hb_index_to_entry_range=hb_index_to_entry_range,
134
+ hash_bucket_count=hash_bucket_count,
135
+ )
136
+
137
+ def test_get_task_options_sanity(self):
138
+ opts = _get_task_options(0.01, 0.01)
139
+ result_ref = valid_func.options(**opts).remote()
140
+ result = ray.get(result_ref)
141
+
142
+ self.assertEqual(result, 2)
143
+
144
+ def test_get_task_options_when_exception_is_thrown(self):
145
+ opts = _get_task_options(0.01, 0.01)
146
+ result_ref = throwing_func.options(**opts).remote()
147
+
148
+ self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
149
+
150
+ def test_get_merge_task_options_memory_logs_enabled_sanity(self):
151
+ test_index = 0
152
+ test_hb_group_idx = 0
153
+ test_debug_memory_params = {"merge_task_index": test_index}
154
+ test_estimate_memory_params = self._make_estimate_resource_params()
155
+ test_ray_custom_resources = {}
156
+ test_rcf = self.make_round_completion_info()
157
+ test_manifest = self._make_manifest()
158
+ expected_task_opts = {
159
+ "max_retries": 3,
160
+ "memory": 1680.64,
161
+ "num_cpus": 0.01,
162
+ "scheduling_strategy": "SPREAD",
163
+ }
164
+ expected_previous_inflation = 1.0
165
+ expected_average_record_size = 1.0
166
+ with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
167
+ # At least one log of level DEBUG must be emitted
168
+ actual_merge_tasks_opts = _get_merge_task_options(
169
+ index=test_index,
170
+ hb_group_idx=test_hb_group_idx,
171
+ data_size=1,
172
+ pk_size_bytes=1,
173
+ num_rows=1,
174
+ num_hash_groups=1,
175
+ total_memory_buffer_percentage=1,
176
+ incremental_index_array_size=1,
177
+ debug_memory_params=test_debug_memory_params,
178
+ ray_custom_resources=test_ray_custom_resources,
179
+ estimate_resources_params=test_estimate_memory_params,
180
+ round_completion_info=test_rcf,
181
+ compacted_delta_manifest=test_manifest,
182
+ memory_logs_enabled=True,
183
+ )
184
+ assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
185
+ log_message_round_completion_info = cm.records[0].getMessage()
186
+ log_message_debug_memory_params = cm.records[1].getMessage()
187
+ self.assertIn(
188
+ f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
189
+ log_message_round_completion_info,
190
+ )
191
+ self.assertIn(
192
+ f"[Merge task {test_index}]: Params used for calculating merge memory",
193
+ log_message_debug_memory_params,
194
+ )
195
+ self.assertIn(
196
+ f"'previous_inflation': {expected_previous_inflation}",
197
+ log_message_debug_memory_params,
198
+ )
199
+ self.assertIn(
200
+ f"'average_record_size': {expected_average_record_size}",
201
+ log_message_debug_memory_params,
202
+ )
203
+
204
+ def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
205
+ self,
206
+ ):
207
+ test_index = 0
208
+ test_hb_group_idx = 0
209
+ test_debug_memory_params = {"merge_task_index": test_index}
210
+ test_estimate_memory_params = self._make_estimate_resource_params()
211
+ test_ray_custom_resources = {}
212
+ test_rcf = self.make_round_completion_info(
213
+ bytes_written=0, records_written=0, files_written=0, rows_dropped=0
214
+ )
215
+ test_manifest = self._make_manifest()
216
+ expected_task_opts = {
217
+ "max_retries": 3,
218
+ "memory": 1680.64,
219
+ "num_cpus": 0.01,
220
+ "scheduling_strategy": "SPREAD",
221
+ }
222
+ expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
223
+ expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
224
+ with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
225
+ # At least one log of level DEBUG must be emitted
226
+ actual_merge_tasks_opts = _get_merge_task_options(
227
+ index=test_index,
228
+ hb_group_idx=test_hb_group_idx,
229
+ data_size=1,
230
+ pk_size_bytes=1,
231
+ num_rows=1,
232
+ num_hash_groups=1,
233
+ total_memory_buffer_percentage=1,
234
+ incremental_index_array_size=1,
235
+ debug_memory_params=test_debug_memory_params,
236
+ ray_custom_resources=test_ray_custom_resources,
237
+ estimate_resources_params=test_estimate_memory_params,
238
+ round_completion_info=test_rcf,
239
+ compacted_delta_manifest=test_manifest,
240
+ memory_logs_enabled=True,
241
+ )
242
+ assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
243
+ log_message_round_completion_info = cm.records[0].getMessage()
244
+ log_message_debug_memory_params = cm.records[1].getMessage()
245
+ self.assertIn(
246
+ f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
247
+ log_message_round_completion_info,
248
+ )
249
+ self.assertIn(
250
+ f"[Merge task {test_index}]: Params used for calculating merge memory",
251
+ log_message_debug_memory_params,
252
+ )
253
+ self.assertIn(
254
+ f"'previous_inflation': {expected_previous_inflation}",
255
+ log_message_debug_memory_params,
256
+ )
257
+ self.assertIn(
258
+ f"'average_record_size': {expected_average_record_size}",
259
+ log_message_debug_memory_params,
260
+ )
261
+
262
+ def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
263
+ self,
264
+ ):
265
+ test_index = 0
266
+ test_hb_group_idx = 0
267
+ test_debug_memory_params = {"merge_task_index": test_index}
268
+ test_estimate_memory_params = self._make_estimate_resource_params()
269
+ test_ray_custom_resources = {}
270
+ test_rcf = None
271
+ test_manifest = self._make_manifest()
272
+ expected_task_opts = {
273
+ "max_retries": 3,
274
+ "memory": 1680.64,
275
+ "num_cpus": 0.01,
276
+ "scheduling_strategy": "SPREAD",
277
+ }
278
+ with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
279
+ # At least one log of level DEBUG must be emitted
280
+ actual_merge_tasks_opts = _get_merge_task_options(
281
+ index=test_index,
282
+ hb_group_idx=test_hb_group_idx,
283
+ data_size=1,
284
+ pk_size_bytes=1,
285
+ num_rows=1,
286
+ num_hash_groups=1,
287
+ total_memory_buffer_percentage=1,
288
+ incremental_index_array_size=1,
289
+ debug_memory_params=test_debug_memory_params,
290
+ ray_custom_resources=test_ray_custom_resources,
291
+ estimate_resources_params=test_estimate_memory_params,
292
+ round_completion_info=test_rcf,
293
+ compacted_delta_manifest=test_manifest,
294
+ memory_logs_enabled=True,
295
+ )
296
+ assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
297
+ log_message_debug_memory_params = cm.records[0].getMessage()
298
+ self.assertIn(
299
+ f"[Merge task {test_index}]: Params used for calculating merge memory",
300
+ log_message_debug_memory_params,
301
+ )
302
+ self.assertNotIn(
303
+ "'average_record_size'",
304
+ log_message_debug_memory_params,
305
+ )
@@ -2,9 +2,12 @@ from unittest import TestCase
2
2
  from deltacat.utils.pyarrow import (
3
3
  s3_partial_parquet_file_to_table,
4
4
  pyarrow_read_csv,
5
+ ContentTypeValidationError,
5
6
  content_type_to_reader_kwargs,
6
7
  _add_column_kwargs,
8
+ logger,
7
9
  s3_file_to_table,
10
+ s3_file_to_parquet,
8
11
  ReadKwargsProviderPyArrowSchemaOverride,
9
12
  RAISE_ON_EMPTY_CSV_KWARG,
10
13
  RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ class TestReadCSV(TestCase):
435
438
  pa.lib.ArrowInvalid,
436
439
  lambda: pyarrow_read_csv(
437
440
  OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
438
- **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
441
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
439
442
  ),
440
443
  )
441
444
 
@@ -479,7 +482,7 @@ class TestReadCSV(TestCase):
479
482
  pa.lib.ArrowInvalid,
480
483
  lambda: pyarrow_read_csv(
481
484
  OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
482
- **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
485
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
483
486
  ),
484
487
  )
485
488
 
@@ -590,7 +593,7 @@ class TestReadCSV(TestCase):
590
593
  pa.lib.ArrowNotImplementedError,
591
594
  lambda: pyarrow_read_csv(
592
595
  OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
593
- **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
596
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
594
597
  ),
595
598
  )
596
599
 
@@ -818,8 +821,11 @@ class TestS3FileToTable(TestCase):
818
821
  schema = pa.schema(
819
822
  [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
820
823
  )
821
-
822
824
  # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
825
+ pa_kwargs_provider = lambda content_type, kwargs: {
826
+ "reader_type": "pyarrow",
827
+ **kwargs,
828
+ }
823
829
  pa_kwargs_provider = lambda content_type, kwargs: {
824
830
  "reader_type": "pyarrow",
825
831
  OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@ class TestS3FileToTable(TestCase):
864
870
  schema = result.schema
865
871
  schema_index = schema.get_field_index("n_legs")
866
872
  self.assertEqual(schema.field(schema_index).type, "int64")
873
+
874
+
875
+ class TestS3FileToParquet(TestCase):
876
+ def test_s3_file_to_parquet_sanity(self):
877
+ test_s3_url = PARQUET_FILE_PATH
878
+ test_content_type = ContentType.PARQUET.value
879
+ test_content_encoding = ContentEncoding.IDENTITY.value
880
+ pa_kwargs_provider = lambda content_type, kwargs: {
881
+ "reader_type": "pyarrow",
882
+ **kwargs,
883
+ }
884
+ with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
885
+ result_parquet_file: ParquetFile = s3_file_to_parquet(
886
+ test_s3_url,
887
+ test_content_type,
888
+ test_content_encoding,
889
+ ["n_legs", "animal"],
890
+ ["n_legs"],
891
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
892
+ )
893
+ log_message_log_args = cm.records[0].getMessage()
894
+ log_message_presanitize_kwargs = cm.records[1].getMessage()
895
+ self.assertIn(
896
+ f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
897
+ log_message_log_args,
898
+ )
899
+ self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
900
+ for index, field in enumerate(result_parquet_file.schema_arrow):
901
+ self.assertEqual(
902
+ field.name, result_parquet_file.schema_arrow.field(index).name
903
+ )
904
+ self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
905
+
906
+ def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
907
+ self,
908
+ ):
909
+ test_s3_url = PARQUET_FILE_PATH
910
+ test_content_type = ContentType.PARQUET.value
911
+ test_content_encoding = ContentEncoding.GZIP.value
912
+ pa_kwargs_provider = lambda content_type, kwargs: {
913
+ "reader_type": "pyarrow",
914
+ OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
915
+ **kwargs,
916
+ }
917
+ with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
918
+ result_parquet_file: ParquetFile = s3_file_to_parquet(
919
+ test_s3_url,
920
+ test_content_type,
921
+ test_content_encoding,
922
+ ["n_legs", "animal"],
923
+ ["n_legs"],
924
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
925
+ )
926
+ log_message_log_args = cm.records[0].getMessage()
927
+ log_message_log_new_content_encoding = cm.records[1].getMessage()
928
+ log_message_presanitize_kwargs = cm.records[2].getMessage()
929
+ self.assertIn(
930
+ f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
931
+ log_message_log_args,
932
+ )
933
+ self.assertIn(
934
+ f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
935
+ log_message_log_new_content_encoding,
936
+ )
937
+ self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
938
+ for index, field in enumerate(result_parquet_file.schema_arrow):
939
+ self.assertEqual(
940
+ field.name, result_parquet_file.schema_arrow.field(index).name
941
+ )
942
+ self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
943
+
944
+ def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
945
+ self,
946
+ ):
947
+ test_s3_url = PARQUET_FILE_PATH
948
+ test_content_type = ContentType.PARQUET.value
949
+ test_content_encoding = ContentEncoding.GZIP.value
950
+ pa_kwargs_provider = lambda content_type, kwargs: {
951
+ "reader_type": "pyarrow",
952
+ **kwargs,
953
+ }
954
+ with self.assertRaises(ContentTypeValidationError):
955
+ with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
956
+ s3_file_to_parquet(
957
+ test_s3_url,
958
+ test_content_type,
959
+ test_content_encoding,
960
+ ["n_legs", "animal"],
961
+ ["n_legs"],
962
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
963
+ )
964
+ log_message_log_args = cm.records[0].getMessage()
965
+ self.assertIn(
966
+ f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
967
+ log_message_log_args,
968
+ )
@@ -617,7 +617,18 @@ def s3_file_to_parquet(
617
617
  f"Reading {s3_url} to PyArrow ParquetFile. "
618
618
  f"Content type: {content_type}. Encoding: {content_encoding}"
619
619
  )
620
+ kwargs = {}
621
+ if pa_read_func_kwargs_provider:
622
+ kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
620
623
 
624
+ if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
625
+ new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
626
+ if content_type == ContentType.PARQUET.value:
627
+ logger.debug(
628
+ f"Overriding {s3_url} content encoding from {content_encoding} "
629
+ f"to {new_content_encoding}"
630
+ )
631
+ content_encoding = new_content_encoding
621
632
  if (
622
633
  content_type != ContentType.PARQUET.value
623
634
  or content_encoding != ContentEncoding.IDENTITY
@@ -630,15 +641,10 @@ def s3_file_to_parquet(
630
641
  if s3_client_kwargs is None:
631
642
  s3_client_kwargs = {}
632
643
 
633
- kwargs = {}
634
-
635
644
  if s3_url.startswith("s3://"):
636
645
  s3_file_system = create_s3_file_system(s3_client_kwargs)
637
646
  kwargs["filesystem"] = s3_file_system
638
647
 
639
- if pa_read_func_kwargs_provider:
640
- kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
641
-
642
648
  logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
643
649
 
644
650
  kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.30
3
+ Version: 1.1.32
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,33 +0,0 @@
1
- import unittest
2
- import ray
3
- from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
4
-
5
-
6
- @ray.remote
7
- def valid_func():
8
- return 2
9
-
10
-
11
- @ray.remote
12
- def throwing_func():
13
- raise ConnectionAbortedError()
14
-
15
-
16
- class TestTaskOptions(unittest.TestCase):
17
- @classmethod
18
- def setUpClass(cls):
19
- ray.init(local_mode=True, ignore_reinit_error=True)
20
- super().setUpClass()
21
-
22
- def test_get_task_options_sanity(self):
23
- opts = _get_task_options(0.01, 0.01)
24
- result_ref = valid_func.options(**opts).remote()
25
- result = ray.get(result_ref)
26
-
27
- self.assertEqual(result, 2)
28
-
29
- def test_get_task_options_when_exception_is_thrown(self):
30
- opts = _get_task_options(0.01, 0.01)
31
- result_ref = throwing_func.options(**opts).remote()
32
-
33
- self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
File without changes
File without changes
File without changes