deltacat 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. {deltacat-0.2.7 → deltacat-0.2.9}/PKG-INFO +1 -1
  2. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/__init__.py +1 -1
  3. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/s3u.py +6 -0
  4. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/compaction_session.py +2 -3
  5. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -57
  6. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/steps/dedupe.py +2 -2
  7. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/steps/hash_bucket.py +2 -2
  8. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/steps/materialize.py +2 -2
  9. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/compaction_session.py +3 -16
  10. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/steps/hash_bucket.py +41 -24
  11. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/steps/merge.py +38 -21
  12. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/utils/primary_key_index.py +2 -1
  13. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/test_compact_partition_incremental.py +18 -1
  14. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +4 -1
  15. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/test_resources.py +21 -0
  16. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/daft.py +2 -0
  17. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/pyarrow.py +69 -0
  18. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/resources.py +58 -2
  19. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat.egg-info/PKG-INFO +1 -1
  20. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat.egg-info/requires.txt +1 -1
  21. {deltacat-0.2.7 → deltacat-0.2.9}/setup.py +1 -1
  22. {deltacat-0.2.7 → deltacat-0.2.9}/MANIFEST.in +0 -0
  23. {deltacat-0.2.7 → deltacat-0.2.9}/README.md +0 -0
  24. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/__init__.py +0 -0
  25. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/clients.py +0 -0
  26. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/constants.py +0 -0
  27. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/redshift/__init__.py +0 -0
  28. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/redshift/model/__init__.py +0 -0
  29. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/aws/redshift/model/manifest.py +0 -0
  30. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/benchmarking/__init__.py +0 -0
  31. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  32. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/benchmarking/conftest.py +0 -0
  33. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/catalog/__init__.py +0 -0
  34. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/catalog/delegate.py +0 -0
  35. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/catalog/interface.py +0 -0
  36. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/catalog/model/__init__.py +0 -0
  37. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/catalog/model/catalog.py +0 -0
  38. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/catalog/model/table_definition.py +0 -0
  39. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/__init__.py +0 -0
  40. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/__init__.py +0 -0
  41. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/__init__.py +0 -0
  42. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  43. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  44. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  45. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  46. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  47. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  48. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  49. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  50. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  51. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  52. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  53. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  54. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/repartition_session.py +0 -0
  55. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/steps/__init__.py +0 -0
  56. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/steps/repartition.py +0 -0
  57. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/utils/__init__.py +0 -0
  58. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/utils/io.py +0 -0
  59. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  60. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  61. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/utils/sort_key.py +0 -0
  62. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  63. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/__init__.py +0 -0
  64. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/constants.py +0 -0
  65. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
  66. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  67. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  68. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
  69. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  70. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  71. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  72. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
  73. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
  74. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/utils/io.py +0 -0
  75. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
  76. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/__init__.py +0 -0
  77. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/config/__init__.py +0 -0
  78. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/meta_stats.py +0 -0
  79. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/model/__init__.py +0 -0
  80. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
  81. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
  82. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/stats.py +0 -0
  83. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/utils/__init__.py +0 -0
  84. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/utils/constants.py +0 -0
  85. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/utils/io.py +0 -0
  86. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
  87. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
  88. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/__init__.py +0 -0
  89. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/basic.py +0 -0
  90. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/models/__init__.py +0 -0
  91. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  92. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/models/delta_stats.py +0 -0
  93. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  94. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  95. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/models/stats_result.py +0 -0
  96. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/types.py +0 -0
  97. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/utils/__init__.py +0 -0
  98. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/utils/intervals.py +0 -0
  99. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/utils/io.py +0 -0
  100. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
  101. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/constants.py +0 -0
  102. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/exceptions.py +0 -0
  103. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/__init__.py +0 -0
  104. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/aws/__init__.py +0 -0
  105. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/aws/redshift/__init__.py +0 -0
  106. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
  107. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/dataset.py +0 -0
  108. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/file_object_store.py +0 -0
  109. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/memcached_object_store.py +0 -0
  110. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/object_store.py +0 -0
  111. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/ray_plasma_object_store.py +0 -0
  112. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/read_api.py +0 -0
  113. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/redis_object_store.py +0 -0
  114. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/io/s3_object_store.py +0 -0
  115. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/logs.py +0 -0
  116. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/__init__.py +0 -0
  117. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/interface.py +0 -0
  118. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/__init__.py +0 -0
  119. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/delta.py +0 -0
  120. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/list_result.py +0 -0
  121. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/locator.py +0 -0
  122. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/namespace.py +0 -0
  123. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/partition.py +0 -0
  124. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/sort_key.py +0 -0
  125. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/stream.py +0 -0
  126. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/table.py +0 -0
  127. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/table_version.py +0 -0
  128. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/storage/model/types.py +0 -0
  129. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/__init__.py +0 -0
  130. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/aws/__init__.py +0 -0
  131. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/aws/test_clients.py +0 -0
  132. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/__init__.py +0 -0
  133. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
  134. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor/__init__.py +0 -0
  135. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  136. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
  137. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  138. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
  139. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  140. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -0
  141. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  142. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
  143. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
  144. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
  145. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/test_util_common.py +0 -0
  146. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/test_util_constant.py +0 -0
  147. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
  148. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/__init__.py +0 -0
  149. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
  150. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/test_file_object_store.py +0 -0
  151. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/test_memcached_object_store.py +0 -0
  152. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
  153. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/test_redis_object_store.py +0 -0
  154. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/io/test_s3_object_store.py +0 -0
  155. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/local_deltacat_storage/__init__.py +0 -0
  156. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/stats/__init__.py +0 -0
  157. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/stats/test_intervals.py +0 -0
  158. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/test_utils/__init__.py +0 -0
  159. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/test_utils/constants.py +0 -0
  160. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/test_utils/pyarrow.py +0 -0
  161. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/test_utils/storage.py +0 -0
  162. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/test_utils/utils.py +0 -0
  163. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/__init__.py +0 -0
  164. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/data/__init__.py +0 -0
  165. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  166. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/test_daft.py +0 -0
  167. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/test_pyarrow.py +0 -0
  168. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  169. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/types/__init__.py +0 -0
  170. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/types/media.py +0 -0
  171. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/types/partial_download.py +0 -0
  172. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/types/tables.py +0 -0
  173. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/__init__.py +0 -0
  174. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/arguments.py +0 -0
  175. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/cloudpickle.py +0 -0
  176. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/common.py +0 -0
  177. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/metrics.py +0 -0
  178. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/numpy.py +0 -0
  179. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/pandas.py +0 -0
  180. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/performance.py +0 -0
  181. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/placement.py +0 -0
  182. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/ray_utils/__init__.py +0 -0
  183. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/ray_utils/collections.py +0 -0
  184. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/ray_utils/concurrency.py +0 -0
  185. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/ray_utils/dataset.py +0 -0
  186. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/ray_utils/performance.py +0 -0
  187. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/ray_utils/runtime.py +0 -0
  188. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/s3fs.py +0 -0
  189. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat/utils/schema.py +0 -0
  190. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat.egg-info/SOURCES.txt +0 -0
  191. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat.egg-info/dependency_links.txt +0 -0
  192. {deltacat-0.2.7 → deltacat-0.2.9}/deltacat.egg-info/top_level.txt +0 -0
  193. {deltacat-0.2.7 → deltacat-0.2.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "0.2.7"
47
+ __version__ = "0.2.9"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -383,6 +383,12 @@ def upload_table(
383
383
  # s3fs may swallow S3 errors - we were probably throttled
384
384
  raise RetryableError(f"Retry table upload to: {s3_url}") from e
385
385
  raise NonRetryableError(f"Failed table upload to: {s3_url}") from e
386
+ except BaseException as e:
387
+ logger.warn(
388
+ f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
389
+ exc_info=True,
390
+ )
391
+ raise e
386
392
  return manifest_entries
387
393
 
388
394
 
@@ -52,7 +52,7 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
52
52
  )
53
53
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
54
54
  from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
55
- from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
55
+ from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
56
56
 
57
57
 
58
58
  if importlib.util.find_spec("memray"):
@@ -293,7 +293,6 @@ def _execute_compaction_round(
293
293
  f"{node_resource_keys}"
294
294
  )
295
295
 
296
- compaction_audit.set_cluster_cpu_max(cluster_cpus)
297
296
  # create a remote options provider to round-robin tasks across all nodes or allocated bundles
298
297
  logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
299
298
  round_robin_opt_provider = functools.partial(
@@ -680,7 +679,7 @@ def _execute_compaction_round(
680
679
  [m.pyarrow_write_result for m in mat_results]
681
680
  )
682
681
 
683
- session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
682
+ session_peak_memory = get_current_process_peak_memory_usage_in_bytes()
684
683
  compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
685
684
  session_peak_memory
686
685
  )
@@ -98,14 +98,6 @@ class CompactionSessionAuditInfo(dict):
98
98
  """
99
99
  return self.get("hashBucketCount")
100
100
 
101
- @property
102
- def cluster_cpu_max(self) -> float:
103
- """
104
- Total cluster cpu allocated for the compaction job. If it is autoscaling cluster,
105
- max cpu at any time will be reported.
106
- """
107
- return self.get("clusterCpuMax")
108
-
109
101
  @property
110
102
  def compaction_time_in_seconds(self) -> float:
111
103
  """
@@ -423,35 +415,6 @@ class CompactionSessionAuditInfo(dict):
423
415
  """
424
416
  return self.get("hashBucketProcessedSizeBytes")
425
417
 
426
- @property
427
- def total_cpu_seconds(self) -> float:
428
- """
429
- Total number of vCPUs provisioned in the cluster weighted over time.
430
- """
431
- return self.get("totalCPUSeconds")
432
-
433
- @property
434
- def used_cpu_seconds(self) -> float:
435
- """
436
- Total used vCPU in the cluster weighted over time.
437
- """
438
- return self.get("usedCPUSeconds")
439
-
440
- @property
441
- def used_memory_gb_seconds(self) -> float:
442
- """
443
- The used memory in the cluster weighted over time. This
444
- determines opportunities for better memory estimation.
445
- """
446
- return self.get("usedMemoryGBSeconds")
447
-
448
- @property
449
- def total_memory_gb_seconds(self) -> float:
450
- """
451
- Total memory in the cluster weighted over time in GB.
452
- """
453
- return self.get("totalMemoryGBSeconds")
454
-
455
418
  @property
456
419
  def pyarrow_version(self) -> str:
457
420
  """
@@ -510,10 +473,6 @@ class CompactionSessionAuditInfo(dict):
510
473
  self["hashBucketCount"] = hash_bucket_count
511
474
  return self
512
475
 
513
- def set_cluster_cpu_max(self, cluster_cpu_max: float) -> CompactionSessionAuditInfo:
514
- self["clusterCpuMax"] = cluster_cpu_max
515
- return self
516
-
517
476
  def set_compaction_time_in_seconds(
518
477
  self, compaction_time_in_seconds: float
519
478
  ) -> CompactionSessionAuditInfo:
@@ -778,22 +737,6 @@ class CompactionSessionAuditInfo(dict):
778
737
  self["hashBucketProcessedSizeBytes"] = size
779
738
  return self
780
739
 
781
- def set_total_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
782
- self["totalCPUSeconds"] = value
783
- return self
784
-
785
- def set_used_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
786
- self["usedCPUSeconds"] = value
787
- return self
788
-
789
- def set_used_memory_gb_seconds(self, value: float) -> CompactionSessionAuditInfo:
790
- self["usedMemoryGBSeconds"] = value
791
- return self
792
-
793
- def set_total_memory_gb_seconds(self, value: float) -> CompactionSessionAuditInfo:
794
- self["totalMemoryGBSeconds"] = value
795
- return self
796
-
797
740
  def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
798
741
  self["pyarrowVersion"] = value
799
742
  return self
@@ -25,7 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
25
25
  from deltacat.utils.performance import timed_invocation
26
26
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
27
27
  from deltacat.io.object_store import IObjectStore
28
- from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
28
+ from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
29
29
 
30
30
  if importlib.util.find_spec("memray"):
31
31
  import memray
@@ -228,7 +228,7 @@ def _timed_dedupe(
228
228
  f"{len(mat_bucket_to_dd_idx_obj_id)}"
229
229
  )
230
230
 
231
- peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
231
+ peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
232
232
  return DedupeResult(
233
233
  mat_bucket_to_dd_idx_obj_id,
234
234
  np.int64(total_deduped_records),
@@ -32,7 +32,7 @@ from deltacat.utils.common import ReadKwargsProvider
32
32
  from deltacat.utils.performance import timed_invocation
33
33
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
34
34
  from deltacat.io.object_store import IObjectStore
35
- from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
35
+ from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
36
36
 
37
37
  if importlib.util.find_spec("memray"):
38
38
  import memray
@@ -228,7 +228,7 @@ def _timed_hash_bucket(
228
228
  delta_file_envelope_groups, num_buckets, num_groups, object_store
229
229
  )
230
230
 
231
- peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
231
+ peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
232
232
  return HashBucketResult(
233
233
  hash_bucket_group_to_obj_id,
234
234
  np.int64(total_record_count),
@@ -44,7 +44,7 @@ from deltacat.utils.ray_utils.runtime import (
44
44
  get_current_ray_worker_id,
45
45
  )
46
46
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
47
- from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
47
+ from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
48
48
 
49
49
  if importlib.util.find_spec("memray"):
50
50
  import memray
@@ -314,7 +314,7 @@ def materialize(
314
314
  emit_metrics_time = latency
315
315
  logger.info(f"Materialize task ended in {end - start}s")
316
316
 
317
- peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
317
+ peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
318
318
 
319
319
  # Merge all new deltas into one for this materialize bucket index
320
320
  merged_materialize_result = MaterializeResult.of(
@@ -41,13 +41,12 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
41
41
  CompactionSessionAuditInfo,
42
42
  )
43
43
  from deltacat.utils.resources import (
44
- get_current_node_peak_memory_usage_in_bytes,
44
+ get_current_process_peak_memory_usage_in_bytes,
45
45
  )
46
46
  from deltacat.compute.compactor_v2.utils.task_options import (
47
47
  hash_bucket_resource_options_provider,
48
48
  merge_resource_options_provider,
49
49
  )
50
- from deltacat.utils.resources import ClusterUtilizationOverTimeRange
51
50
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
52
51
 
53
52
  if importlib.util.find_spec("memray"):
@@ -65,10 +64,9 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
65
64
 
66
65
  with memray.Tracker(
67
66
  f"compaction_partition.bin"
68
- ) if params.enable_profiler else nullcontext(), ClusterUtilizationOverTimeRange() as cluster_util:
67
+ ) if params.enable_profiler else nullcontext():
69
68
  (new_partition, new_rci, new_rcf_partition_locator,) = _execute_compaction(
70
69
  params,
71
- cluster_util=cluster_util,
72
70
  **kwargs,
73
71
  )
74
72
 
@@ -469,7 +467,7 @@ def _execute_compaction(
469
467
  [m.pyarrow_write_result for m in mat_results]
470
468
  )
471
469
 
472
- session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
470
+ session_peak_memory = get_current_process_peak_memory_usage_in_bytes()
473
471
  compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
474
472
  session_peak_memory
475
473
  )
@@ -478,17 +476,6 @@ def _execute_compaction(
478
476
  mat_results, telemetry_time_hb + telemetry_time_merge
479
477
  )
480
478
 
481
- cluster_util: ClusterUtilizationOverTimeRange = kwargs.get("cluster_util")
482
-
483
- if cluster_util:
484
- compaction_audit.set_total_cpu_seconds(cluster_util.total_vcpu_seconds)
485
- compaction_audit.set_used_cpu_seconds(cluster_util.used_vcpu_seconds)
486
- compaction_audit.set_used_memory_gb_seconds(cluster_util.used_memory_gb_seconds)
487
- compaction_audit.set_total_memory_gb_seconds(
488
- cluster_util.total_memory_gb_seconds
489
- )
490
- compaction_audit.set_cluster_cpu_max(cluster_util.max_cpu)
491
-
492
479
  input_inflation = None
493
480
  input_average_record_size_bytes = None
494
481
  # Note: we only consider inflation for incremental delta
@@ -27,7 +27,11 @@ from deltacat.utils.ray_utils.runtime import (
27
27
  from deltacat.utils.common import ReadKwargsProvider
28
28
  from deltacat.utils.performance import timed_invocation
29
29
  from deltacat.utils.metrics import emit_timer_metrics
30
- from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
30
+ from deltacat.utils.resources import (
31
+ get_current_process_peak_memory_usage_in_bytes,
32
+ ProcessUtilizationOverTimeRange,
33
+ )
34
+ from deltacat.constants import BYTES_PER_GIBIBYTE
31
35
 
32
36
  if importlib.util.find_spec("memray"):
33
37
  import memray
@@ -166,7 +170,10 @@ def _timed_hash_bucket(input: HashBucketInput):
166
170
  object_store=input.object_store,
167
171
  )
168
172
 
169
- peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
173
+ peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
174
+ logger.info(
175
+ f"Peak memory usage in bytes after hash bucketing: {peak_memory_usage_bytes}"
176
+ )
170
177
  return HashBucketResult(
171
178
  hash_bucket_group_to_obj_id_tuple,
172
179
  np.int64(total_size_bytes),
@@ -179,28 +186,38 @@ def _timed_hash_bucket(input: HashBucketInput):
179
186
 
180
187
  @ray.remote
181
188
  def hash_bucket(input: HashBucketInput) -> HashBucketResult:
189
+ with ProcessUtilizationOverTimeRange() as process_util:
190
+ logger.info(f"Starting hash bucket task...")
182
191
 
183
- logger.info(f"Starting hash bucket task...")
184
- hash_bucket_result, duration = timed_invocation(
185
- func=_timed_hash_bucket, input=input
186
- )
192
+ # Log process peak memory utilization every 10 seconds
193
+ def log_peak_memory():
194
+ logger.debug(
195
+ f"Process peak memory utilization so far: {process_util.max_memory} bytes "
196
+ f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
197
+ )
198
+
199
+ process_util.schedule_callback(log_peak_memory, 10)
187
200
 
188
- emit_metrics_time = 0.0
189
- if input.metrics_config:
190
- emit_result, latency = timed_invocation(
191
- func=emit_timer_metrics,
192
- metrics_name="hash_bucket",
193
- value=duration,
194
- metrics_config=input.metrics_config,
201
+ hash_bucket_result, duration = timed_invocation(
202
+ func=_timed_hash_bucket, input=input
203
+ )
204
+
205
+ emit_metrics_time = 0.0
206
+ if input.metrics_config:
207
+ emit_result, latency = timed_invocation(
208
+ func=emit_timer_metrics,
209
+ metrics_name="hash_bucket",
210
+ value=duration,
211
+ metrics_config=input.metrics_config,
212
+ )
213
+ emit_metrics_time = latency
214
+
215
+ logger.info(f"Finished hash bucket task...")
216
+ return HashBucketResult(
217
+ hash_bucket_result[0],
218
+ hash_bucket_result[1],
219
+ hash_bucket_result[2],
220
+ hash_bucket_result[3],
221
+ np.double(emit_metrics_time),
222
+ hash_bucket_result[5],
195
223
  )
196
- emit_metrics_time = latency
197
-
198
- logger.info(f"Finished hash bucket task...")
199
- return HashBucketResult(
200
- hash_bucket_result[0],
201
- hash_bucket_result[1],
202
- hash_bucket_result[2],
203
- hash_bucket_result[3],
204
- np.double(emit_metrics_time),
205
- hash_bucket_result[5],
206
- )
@@ -30,7 +30,10 @@ from deltacat.compute.compactor.utils import system_columns as sc
30
30
 
31
31
  from deltacat.utils.performance import timed_invocation
32
32
  from deltacat.utils.metrics import emit_timer_metrics
33
- from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
33
+ from deltacat.utils.resources import (
34
+ get_current_process_peak_memory_usage_in_bytes,
35
+ ProcessUtilizationOverTimeRange,
36
+ )
34
37
  from deltacat.compute.compactor_v2.utils.primary_key_index import (
35
38
  generate_pk_hash_column,
36
39
  hash_group_index_to_hash_bucket_indices,
@@ -44,6 +47,7 @@ from deltacat.storage import (
44
47
  interface as unimplemented_deltacat_storage,
45
48
  )
46
49
  from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
50
+ from deltacat.constants import BYTES_PER_GIBIBYTE
47
51
 
48
52
 
49
53
  if importlib.util.find_spec("memray"):
@@ -436,7 +440,10 @@ def _timed_merge(input: MergeInput) -> MergeResult:
436
440
  f"{total_dfes_found} != {len(hb_index_to_delta_file_envelopes_list)}"
437
441
  )
438
442
 
439
- peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
443
+ peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
444
+ logger.info(
445
+ f"Peak memory usage in bytes after merge: {peak_memory_usage_bytes}"
446
+ )
440
447
 
441
448
  return MergeResult(
442
449
  materialized_results,
@@ -449,25 +456,35 @@ def _timed_merge(input: MergeInput) -> MergeResult:
449
456
 
450
457
@ray.remote
def merge(input: MergeInput) -> MergeResult:
    """Run the merge step as a Ray task and report how long metric emission took.

    The actual work is delegated to ``_timed_merge``; this wrapper times that
    call, optionally emits a "merge" timer metric, and periodically logs the
    peak memory observed by ``ProcessUtilizationOverTimeRange`` while the task
    is running.

    Args:
        input: Merge parameters. When ``input.metrics_config`` is truthy, the
            measured merge duration is emitted through ``emit_timer_metrics``.

    Returns:
        A ``MergeResult`` built from the result of ``_timed_merge`` with its
        fourth field replaced by the time spent emitting metrics (``0.0`` when
        no metrics were emitted).
    """
    with ProcessUtilizationOverTimeRange() as process_util:
        logger.info("Starting merge task...")

        # Log this process's peak memory utilization every 10 seconds.
        def log_peak_memory():
            # Read the peak once so the byte and GiB figures in one message agree.
            peak_bytes = process_util.max_memory
            logger.debug(
                f"Process peak memory utilization so far: {peak_bytes} bytes "
                f"({peak_bytes/BYTES_PER_GIBIBYTE} GiB)"
            )

        process_util.schedule_callback(log_peak_memory, 10)

        merge_result, duration = timed_invocation(func=_timed_merge, input=input)

        emit_metrics_time = 0.0
        if input.metrics_config:
            # Only the latency of the emit call is needed; its result is unused.
            _, emit_metrics_time = timed_invocation(
                func=emit_timer_metrics,
                metrics_name="merge",
                value=duration,
                metrics_config=input.metrics_config,
            )

        logger.info("Finished merge task...")
        return MergeResult(
            merge_result[0],
            merge_result[1],
            merge_result[2],
            np.double(emit_metrics_time),
            merge_result[4],
        )
@@ -17,6 +17,7 @@ from deltacat import logs
17
17
  from deltacat.compute.compactor.utils import system_columns as sc
18
18
  from deltacat.io.object_store import IObjectStore
19
19
  from deltacat.utils.performance import timed_invocation
20
+ from deltacat.utils.pyarrow import sliced_string_cast
20
21
 
21
22
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
22
23
 
@@ -182,7 +183,7 @@ def generate_pk_hash_column(
182
183
  def _generate_pk_hash(table: pa.Table) -> pa.Array:
183
184
  pk_columns = []
184
185
  for pk_name in primary_keys:
185
- pk_columns.append(pc.cast(table[pk_name], pa.string()))
186
+ pk_columns.append(sliced_string_cast(table[pk_name]))
186
187
 
187
188
  pk_columns.append(PK_DELIMITER)
188
189
  hash_column = pc.binary_join_element_wise(*pk_columns)
@@ -6,6 +6,8 @@ import boto3
6
6
  from typing import Any, Callable, Dict, List, Optional, Set
7
7
  from boto3.resources.base import ServiceResource
8
8
  import pyarrow as pa
9
+ from pytest_benchmark.fixture import BenchmarkFixture
10
+
9
11
  from deltacat.tests.compute.test_util_common import (
10
12
  get_rcf,
11
13
  )
@@ -161,6 +163,7 @@ def test_compact_partition_incremental(
161
163
  read_kwargs_provider_param: Any,
162
164
  skip_enabled_compact_partition_drivers,
163
165
  compact_partition_func: Callable,
166
+ benchmark: BenchmarkFixture,
164
167
  ):
165
168
  import deltacat.tests.local_deltacat_storage as ds
166
169
  from deltacat.types.media import ContentType
@@ -235,8 +238,22 @@ def test_compact_partition_incremental(
235
238
  "sort_keys": sort_keys if sort_keys else None,
236
239
  }
237
240
  )
241
+
238
242
  # execute
239
- rcf_file_s3_uri = compact_partition_func(compact_partition_params)
243
+ def _incremental_compaction_setup():
244
+ """
245
+ This callable runs right before invoking the benchmark target function (compaction).
246
+ This is needed as the benchmark module will invoke the target function multiple times
247
+ in a single test run, which can lead to non-idempotent behavior if RCFs are generated.
248
+
249
+ Returns: args, kwargs
250
+ """
251
+ setup_s3_resource.Bucket(TEST_S3_RCF_BUCKET_NAME).objects.all().delete()
252
+ return (compact_partition_params,), {}
253
+
254
+ rcf_file_s3_uri = benchmark.pedantic(
255
+ compact_partition_func, setup=_incremental_compaction_setup
256
+ )
240
257
  # validate
241
258
  round_completion_info = get_rcf(setup_s3_resource, rcf_file_s3_uri)
242
259
  compacted_delta_locator: DeltaLocator = (
@@ -5,6 +5,8 @@ import pytest
5
5
  import boto3
6
6
  from boto3.resources.base import ServiceResource
7
7
  import pyarrow as pa
8
+ from pytest_benchmark.fixture import BenchmarkFixture
9
+
8
10
  from deltacat.tests.compute.test_util_constant import (
9
11
  BASE_TEST_SOURCE_NAMESPACE,
10
12
  BASE_TEST_SOURCE_TABLE_NAME,
@@ -182,6 +184,7 @@ def test_compact_partition_rebase_then_incremental(
182
184
  rebase_expected_compact_partition_result: pa.Table,
183
185
  skip_enabled_compact_partition_drivers,
184
186
  compact_partition_func: Callable,
187
+ benchmark: BenchmarkFixture,
185
188
  ):
186
189
  import deltacat.tests.local_deltacat_storage as ds
187
190
  from deltacat.types.media import ContentType
@@ -265,7 +268,7 @@ def test_compact_partition_rebase_then_incremental(
265
268
  }
266
269
  )
267
270
  # execute
268
- rcf_file_s3_uri = compact_partition_func(compact_partition_params)
271
+ rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
269
272
  compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
270
273
  setup_s3_resource, rcf_file_s3_uri
271
274
  )
@@ -49,3 +49,24 @@ class TestClusterUtilizationOverTimeRange(unittest.TestCase):
49
49
  self.assertIsNotNone(cu.total_memory_gb_seconds)
50
50
  self.assertIsNotNone(cu.used_memory_gb_seconds)
51
51
  self.assertIsNotNone(cu.max_cpu)
52
+
53
+
54
class TestProcessUtilizationOverTimeRange(unittest.TestCase):
    """Smoke tests for ``ProcessUtilizationOverTimeRange``."""

    def test_sanity(self):
        """``max_memory`` is populated after the tracker has run for a few seconds."""
        from deltacat.utils import resources

        with resources.ProcessUtilizationOverTimeRange() as process_util:
            time.sleep(3)
            self.assertIsNotNone(process_util.max_memory)

    def test_callback(self):
        """A callback registered with ``schedule_callback`` is invoked."""
        from deltacat.utils import resources

        with resources.ProcessUtilizationOverTimeRange() as process_util:

            def mark_invoked():
                # Flag on the tracker itself so the assertion below can see it.
                process_util.test_field_set = True

            process_util.schedule_callback(mark_invoked, 1)
            time.sleep(3)
            self.assertTrue(process_util.test_field_set)
@@ -66,6 +66,8 @@ def daft_s3_file_to_table(
66
66
  )
67
67
  )
68
68
 
69
+ logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
70
+
69
71
  pa_table, latency = timed_invocation(
70
72
  read_parquet_into_pyarrow,
71
73
  path=s3_url,
@@ -11,6 +11,8 @@ from pyarrow.parquet import ParquetFile
11
11
  from deltacat.exceptions import ValidationError
12
12
 
13
13
  import pyarrow as pa
14
+ import numpy as np
15
+ import pyarrow.compute as pc
14
16
  from fsspec import AbstractFileSystem
15
17
  from pyarrow import csv as pacsv
16
18
  from pyarrow import feather as paf
@@ -38,6 +40,7 @@ from deltacat.utils.arguments import (
38
40
  sanitize_kwargs_to_callable,
39
41
  sanitize_kwargs_by_supported_kwargs,
40
42
  )
43
+ from functools import lru_cache
41
44
 
42
45
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
43
46
 
@@ -738,3 +741,69 @@ class RecordBatchTables:
738
741
  """
739
742
  self._remaining_tables.clear()
740
743
  self._remaining_record_count = 0
744
+
745
+
746
@lru_cache(maxsize=1)
def _int_max_string_len() -> int:
    """Return the longest byte length seen when casting 64-bit integers to strings.

    The length is measured by casting the largest ``uint64`` value and the most
    negative ``int64`` value to strings with pyarrow and taking the longer of
    the two. The result is cached after the first call.
    """
    widest_values = (
        pa.scalar(2**64 - 1, type=pa.uint64()),
        pa.scalar(-(2**63), type=pa.int64()),
    )
    return max(
        pc.binary_length(pc.cast(value, pa.string())).as_py()
        for value in widest_values
    )
755
+
756
+
757
@lru_cache(maxsize=1)
def _float_max_string_len() -> int:
    """Return the longer string length of the two extreme finite float64 values.

    The largest and the most negative finite ``float64`` values are cast to
    strings with pyarrow and the greater byte length is returned. The result
    is cached after the first call.
    """
    limits = np.finfo(np.float64)
    rendered_lengths = [
        pc.binary_length(
            pc.cast(pa.scalar(bound, type=pa.float64()), pa.string())
        ).as_py()
        for bound in (limits.max, limits.min)
    ]
    return max(rendered_lengths)
766
+
767
+
768
+ def _max_decimal128_string_len():
769
+ return 40 # "-" + 38 digits + decimal
770
+
771
+
772
+ def _max_decimal256_string_len():
773
+ return 78 # "-" + 76 digits + decimal
774
+
775
+
776
def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
    """Cast a chunked array to strings, slicing large numeric chunks first.

    Integer, floating point, decimal128 and decimal256 input is re-chunked so
    that no chunk holds more elements than a fixed byte budget allows, which
    keeps pyarrow from allocating too large an array during the cast and
    failing. Input of any other type is cast as-is.

    Issue: https://github.com/apache/arrow/issues/38835
    TODO: deprecate this function when pyarrow performs proper
    ChunkedArray -> ChunkedArray casting

    Args:
        array: The chunked array to convert.

    Returns:
        The result of casting ``array`` (re-chunked when numeric) to
        ``pa.string()``.
    """
    byte_budget = 2147483646
    source_type = array.type

    def _widest_rendering(dtype):
        # Widest expected string per element, or None when the type is not sliced.
        if pa.types.is_integer(dtype):
            return _int_max_string_len()
        if pa.types.is_floating(dtype):
            return _float_max_string_len()
        if pa.types.is_decimal128(dtype):
            return _max_decimal128_string_len()
        if pa.types.is_decimal256(dtype):
            return _max_decimal256_string_len()
        return None

    widest_str = _widest_rendering(source_type)
    if widest_str is None:
        return pc.cast(array, pa.string())

    rows_per_slice = byte_budget // (2 * widest_str)  # safety factor of 2
    pieces = []
    for chunk in array.chunks:
        if len(chunk) < rows_per_slice:
            # Small enough already; reuse the chunk untouched.
            pieces.append(chunk)
            continue
        pieces.extend(
            chunk.slice(offset, rows_per_slice)
            for offset in range(0, len(chunk), rows_per_slice)
        )

    resliced = pa.chunked_array(pieces, type=source_type)
    return pc.cast(resliced, pa.string())