deltacat 1.1.28__tar.gz → 1.1.29__tar.gz

Files changed (224)
  1. {deltacat-1.1.28/deltacat.egg-info → deltacat-1.1.29}/PKG-INFO +1 -1
  2. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/__init__.py +1 -1
  3. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/constants.py +15 -1
  4. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/merge.py +30 -5
  5. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  6. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/primary_key_index.py +15 -3
  7. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_test_cases.py +32 -0
  8. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +133 -0
  9. deltacat-1.1.29/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  10. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_utils/pyarrow.py +15 -8
  11. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_pyarrow.py +23 -0
  12. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/pyarrow.py +9 -7
  13. {deltacat-1.1.28 → deltacat-1.1.29/deltacat.egg-info}/PKG-INFO +1 -1
  14. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat.egg-info/SOURCES.txt +1 -0
  15. {deltacat-1.1.28 → deltacat-1.1.29}/LICENSE +0 -0
  16. {deltacat-1.1.28 → deltacat-1.1.29}/MANIFEST.in +0 -0
  17. {deltacat-1.1.28 → deltacat-1.1.29}/README.md +0 -0
  18. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/__init__.py +0 -0
  19. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/clients.py +0 -0
  20. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/constants.py +0 -0
  21. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/redshift/__init__.py +0 -0
  22. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/redshift/model/__init__.py +0 -0
  23. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/redshift/model/manifest.py +0 -0
  24. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/aws/s3u.py +0 -0
  25. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/benchmarking/__init__.py +0 -0
  26. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  27. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/benchmarking/conftest.py +0 -0
  28. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/__init__.py +0 -0
  29. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/default_catalog_impl/__init__.py +0 -0
  30. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/delegate.py +0 -0
  31. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/interface.py +0 -0
  32. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/model/__init__.py +0 -0
  33. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/model/catalog.py +0 -0
  34. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/catalog/model/table_definition.py +0 -0
  35. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/__init__.py +0 -0
  36. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/__init__.py +0 -0
  37. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/compaction_session.py +0 -0
  38. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/__init__.py +0 -0
  39. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  40. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  41. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  42. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  43. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  44. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  45. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  46. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  47. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  48. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  49. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  50. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  51. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  52. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/model/table_object_store.py +0 -0
  53. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/repartition_session.py +0 -0
  54. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/steps/__init__.py +0 -0
  55. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/steps/dedupe.py +0 -0
  56. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
  57. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/steps/materialize.py +0 -0
  58. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/steps/repartition.py +0 -0
  59. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/utils/__init__.py +0 -0
  60. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/utils/io.py +0 -0
  61. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  62. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  63. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/utils/sort_key.py +0 -0
  64. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  65. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/__init__.py +0 -0
  66. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
  67. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  68. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
  69. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
  70. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
  71. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
  72. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
  73. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
  74. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
  75. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  76. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  77. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
  78. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
  79. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  80. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
  81. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/private/compaction_utils.py +0 -0
  82. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  83. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
  84. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  85. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
  86. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
  87. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/io.py +0 -0
  88. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
  89. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
  90. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/merge_on_read/__init__.py +0 -0
  91. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/merge_on_read/daft.py +0 -0
  92. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/merge_on_read/model/__init__.py +0 -0
  93. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
  94. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  95. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
  96. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/resource_estimation/__init__.py +0 -0
  97. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/resource_estimation/delta.py +0 -0
  98. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/resource_estimation/manifest.py +0 -0
  99. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/resource_estimation/model.py +0 -0
  100. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/resource_estimation/parquet.py +0 -0
  101. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/__init__.py +0 -0
  102. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/models/__init__.py +0 -0
  103. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  104. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/models/delta_stats.py +0 -0
  105. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  106. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  107. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/models/stats_result.py +0 -0
  108. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/stats/types.py +0 -0
  109. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/constants.py +0 -0
  110. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/exceptions.py +0 -0
  111. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/__init__.py +0 -0
  112. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/aws/__init__.py +0 -0
  113. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/aws/redshift/__init__.py +0 -0
  114. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/dataset.py +0 -0
  115. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/file_object_store.py +0 -0
  116. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/memcached_object_store.py +0 -0
  117. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/object_store.py +0 -0
  118. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/ray_plasma_object_store.py +0 -0
  119. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/read_api.py +0 -0
  120. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/redis_object_store.py +0 -0
  121. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/io/s3_object_store.py +0 -0
  122. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/logs.py +0 -0
  123. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/__init__.py +0 -0
  124. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/interface.py +0 -0
  125. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/__init__.py +0 -0
  126. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/delete_parameters.py +0 -0
  127. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/delta.py +0 -0
  128. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/list_result.py +0 -0
  129. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/locator.py +0 -0
  130. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/namespace.py +0 -0
  131. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/partition.py +0 -0
  132. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/partition_spec.py +0 -0
  133. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/sort_key.py +0 -0
  134. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/stream.py +0 -0
  135. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/table.py +0 -0
  136. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/table_version.py +0 -0
  137. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/transform.py +0 -0
  138. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/storage/model/types.py +0 -0
  139. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/__init__.py +0 -0
  140. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/aws/__init__.py +0 -0
  141. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/aws/test_clients.py +0 -0
  142. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/aws/test_s3u.py +0 -0
  143. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/catalog/__init__.py +0 -0
  144. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/catalog/test_default_catalog_impl.py +0 -0
  145. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/__init__.py +0 -0
  146. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
  147. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
  148. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
  149. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor/__init__.py +0 -0
  150. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  151. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
  152. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  153. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
  154. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
  155. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  156. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  157. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
  158. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
  159. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
  160. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
  161. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
  162. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
  163. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_incremental.py +0 -0
  164. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +0 -0
  165. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
  166. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_rebase.py +0 -0
  167. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +0 -0
  168. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_util_common.py +0 -0
  169. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_util_constant.py +0 -0
  170. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
  171. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/__init__.py +0 -0
  172. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
  173. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/test_file_object_store.py +0 -0
  174. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/test_memcached_object_store.py +0 -0
  175. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
  176. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/test_redis_object_store.py +0 -0
  177. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/io/test_s3_object_store.py +0 -0
  178. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/local_deltacat_storage/__init__.py +0 -0
  179. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
  180. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_exceptions.py +0 -0
  181. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_logs.py +0 -0
  182. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_utils/__init__.py +0 -0
  183. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_utils/constants.py +0 -0
  184. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_utils/storage.py +0 -0
  185. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_utils/utils.py +0 -0
  186. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/__init__.py +0 -0
  187. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/data/__init__.py +0 -0
  188. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
  189. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
  190. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
  191. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  192. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_daft.py +0 -0
  193. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_metrics.py +0 -0
  194. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_placement.py +0 -0
  195. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  196. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_resources.py +0 -0
  197. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/types/__init__.py +0 -0
  198. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/types/media.py +0 -0
  199. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/types/partial_download.py +0 -0
  200. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/types/tables.py +0 -0
  201. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/__init__.py +0 -0
  202. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/arguments.py +0 -0
  203. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/cloudpickle.py +0 -0
  204. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/common.py +0 -0
  205. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/daft.py +0 -0
  206. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/metrics.py +0 -0
  207. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/numpy.py +0 -0
  208. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/pandas.py +0 -0
  209. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/performance.py +0 -0
  210. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/placement.py +0 -0
  211. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/ray_utils/__init__.py +0 -0
  212. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/ray_utils/collections.py +0 -0
  213. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/ray_utils/concurrency.py +0 -0
  214. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/ray_utils/dataset.py +0 -0
  215. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/ray_utils/performance.py +0 -0
  216. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/ray_utils/runtime.py +0 -0
  217. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/resources.py +0 -0
  218. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/s3fs.py +0 -0
  219. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/schema.py +0 -0
  220. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat.egg-info/dependency_links.txt +0 -0
  221. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat.egg-info/requires.txt +0 -0
  222. {deltacat-1.1.28 → deltacat-1.1.29}/deltacat.egg-info/top_level.txt +0 -0
  223. {deltacat-1.1.28 → deltacat-1.1.29}/setup.cfg +0 -0
  224. {deltacat-1.1.28 → deltacat-1.1.29}/setup.py +0 -0
{deltacat-1.1.28/deltacat.egg-info → deltacat-1.1.29}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.28
+Version: 1.1.29
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/__init__.py
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.28"
+__version__ = "1.1.29"
 
 
 __all__ = [
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/constants.py
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
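Both new constants are read from the environment when the module is imported. As a rough usage sketch, assuming deltacat's env_integer and env_bool helpers simply look up the named process environment variable and fall back to the given default (their exact value parsing is not shown in this diff):

import os

# Override the defaults before deltacat.compute.compactor_v2.constants is
# imported, since the module-level assignments read the environment once.
os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(1 * 1024 * 1024 * 1024)  # ~1 GiB budget
os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"  # accepted format depends on env_bool

from deltacat.compute.compactor_v2 import constants
print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)
print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)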
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/merge.py
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
-                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with memray.Tracker(
-        f"merge_{worker_id}_{task_id}.bin"
-    ) if input.enable_profiler else nullcontext():
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
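For context, the merge hunk above works around pyarrow's 2 GiB limit on 32-bit string offsets: pc.is_in concatenates the chunks of a ChunkedArray, so a primary-key hash column holding roughly 2 GiB or more of string data can raise ArrowCapacityError unless it is widened to large_string first. A minimal standalone sketch of the same pattern, with an illustrative helper name and toy data (not deltacat APIs):

import pyarrow as pa
import pyarrow.compute as pc

MAX_INT_BYTES = 2147483646  # mirrors the constant added to deltacat.utils.pyarrow

def is_in_with_large_string_fallback(
    values: pa.ChunkedArray, value_set: pa.ChunkedArray
) -> pa.ChunkedArray:
    # Widen to 64-bit offsets when either side holds ~2 GiB of string data.
    # The character data buffer is shared; only the offsets are rewritten.
    if values.nbytes >= MAX_INT_BYTES or value_set.nbytes >= MAX_INT_BYTES:
        values = pc.cast(values, pa.large_string())
        value_set = pc.cast(value_set, pa.large_string())
    return pc.is_in(values, value_set=value_set)

compacted = pa.chunked_array([["a", "b", "c"]])
incremental = pa.chunked_array([["b"]])
records_to_keep = pc.invert(is_in_with_large_string_fallback(compacted, incremental))
print(records_to_keep)  # [true, false, true]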
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/dedupe.py
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/primary_key_index.py
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-        current_bytes += record_batch.nbytes
-        record_batches.append(record_batch)
-        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@
             current_bytes = 0
             record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
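The reordering above changes the grouping loop from append-then-check to check-then-append: the accumulated batches are flushed before an incoming batch would push the group past MAX_SIZE_OF_RECORD_BATCH_IN_GIB, and the batch is appended afterwards, so a group only exceeds the budget when a single batch is itself larger than the budget. A simplified standalone sketch of that pattern, with an illustrative helper name and budget (not deltacat APIs):

from typing import Iterable, Iterator, List
import pyarrow as pa

def group_batches_by_byte_budget(
    batches: Iterable[pa.RecordBatch], max_bytes: int
) -> Iterator[List[pa.RecordBatch]]:
    group: List[pa.RecordBatch] = []
    current_bytes = 0
    for batch in batches:
        # Flush first if appending this batch would cross the budget, but never
        # flush an empty group (an oversized single batch is still emitted alone).
        if group and current_bytes + batch.nbytes >= max_bytes:
            yield group
            group = []
            current_bytes = 0
        group.append(batch)
        current_bytes += batch.nbytes
    if group:
        yield group

table = pa.table({"pk": ["x" * 64 for _ in range(1000)]})
for group in group_batches_by_byte_budget(table.to_batches(max_chunksize=100), 32_000):
    print(len(group), sum(b.nbytes for b in group))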
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_test_cases.py
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/test_compaction_session.py
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
deltacat-1.1.29/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/test_utils/pyarrow.py
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@
         *args,
         file_paths=file_paths,
         content_type=content_type,
+        pa_table=pa_table,
         **kwargs,
     )
     ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
        partition = ds.get_partition(
            partition.stream_locator, partition.partition_values, *args, **kwargs
        )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-    for file_path in file_paths:
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-    table = pa.concat_tables(tables)
-    staged_delta = ds.stage_delta(table, partition, content_type=content_type, **kwargs)
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/tests/utils/test_pyarrow.py
@@ -16,6 +16,7 @@ from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
@@ -789,3 +790,25 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
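For context, the new test exercises a parquet file that is gzip-compressed as a whole file (ContentEncoding.GZIP), as opposed to parquet's internal column-chunk compression. A hedged sketch of how such a fixture could be produced locally; the column values are only illustrative guesses consistent with the test's assertions (6 rows, an int64 n_legs column), not the actual contents of test_file.parquet.gz:

import gzip
import shutil
import pyarrow as pa
import pyarrow.parquet as pq

# Write a small parquet file, then gzip the whole file (outer encoding),
# which is what ContentEncoding.GZIP describes.
table = pa.table(
    {
        "n_legs": pa.array([2, 2, 4, 4, 5, 100], type=pa.int64()),
        "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"],
    }
)
pq.write_table(table, "test_file.parquet")
with open("test_file.parquet", "rb") as src, gzip.open("test_file.parquet.gz", "wb") as dst:
    shutil.copyfileobj(src, dst)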
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat/utils/pyarrow.py
@@ -58,6 +58,7 @@ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
 # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
 DECIMAL256_DEFAULT_SCALE = 38
 DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -129,9 +130,11 @@ def _read_csv_rounding_decimal_columns_to_fit_scale(
     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
     new_schema = _new_schema_with_replaced_fields(
         schema,
-        lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
-        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
-        else None,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
     )
     new_kwargs = sanitize_kwargs_by_supported_kwargs(
         ["read_options", "parse_options", "convert_options", "memory_pool"],
@@ -569,8 +572,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )
 
-    if READER_TYPE_KWARG in kwargs:
-        kwargs.pop(READER_TYPE_KWARG)
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -912,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -924,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()
 
     if max_str_len is not None:
-        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
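To put the safety factor in perspective: for an int64 column, the maximum decimal string length is about 20 characters (19 digits plus a sign), so max_elems_per_chunk = 2147483646 // (2 * 20) ≈ 53.7 million elements per slice, which keeps each cast output comfortably below the 2 GiB offset limit of pa.string(). The exact value returned by _int_max_string_len is not shown in this diff; 20 is an assumed figure.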
{deltacat-1.1.28 → deltacat-1.1.29/deltacat.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.28
+Version: 1.1.29
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
{deltacat-1.1.28 → deltacat-1.1.29}/deltacat.egg-info/SOURCES.txt
@@ -164,6 +164,7 @@ deltacat/tests/compute/compactor_v2/__init__.py
 deltacat/tests/compute/compactor_v2/test_compaction_session.py
 deltacat/tests/compute/compactor_v2/test_hashlib.py
 deltacat/tests/compute/compactor_v2/utils/__init__.py
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py
 deltacat/tests/compute/resource_estimation/__init__.py
 deltacat/tests/compute/resource_estimation/test_delta.py