deltacat 0.1.18b13__tar.gz → 0.1.18b15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/PKG-INFO +14 -2
  2. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/README.md +13 -1
  3. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/__init__.py +3 -2
  4. deltacat-0.1.18b15/deltacat/aws/clients.py +189 -0
  5. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/redshift/model/manifest.py +4 -0
  6. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/s3u.py +24 -1
  7. deltacat-0.1.18b15/deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  8. deltacat-0.1.18b15/deltacat/benchmarking/conftest.py +61 -0
  9. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/delegate.py +1 -1
  10. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/interface.py +1 -1
  11. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/__init__.py +0 -3
  12. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/compaction_session.py +45 -20
  13. deltacat-0.1.18b15/deltacat/compute/compactor/model/compact_partition_params.py +382 -0
  14. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  15. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/delta_annotated.py +91 -9
  16. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  17. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/primary_key_index.py +1 -1
  18. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/round_completion_info.py +17 -1
  19. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/repartition_session.py +5 -3
  20. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/dedupe.py +10 -8
  21. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  22. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/materialize.py +11 -6
  23. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/repartition.py +16 -1
  24. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/io.py +40 -23
  25. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  26. deltacat-0.1.18b15/deltacat/compute/compactor/utils/sort_key.py +57 -0
  27. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/system_columns.py +43 -0
  28. deltacat-0.1.18b15/deltacat/compute/compactor_v2/compaction_session.py +506 -0
  29. deltacat-0.1.18b15/deltacat/compute/compactor_v2/constants.py +34 -0
  30. deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  31. deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  32. deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  33. deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  34. deltacat-0.1.18b15/deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat-0.1.18b15/deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  37. deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/io.py +149 -0
  38. deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  39. deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  40. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/meta_stats.py +4 -2
  41. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/stats.py +1 -0
  42. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/io.py +4 -0
  43. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/utils/io.py +20 -5
  44. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/exceptions.py +4 -0
  45. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/memcached_object_store.py +37 -14
  46. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/logs.py +4 -3
  47. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/__init__.py +3 -0
  48. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/interface.py +11 -2
  49. deltacat-0.1.18b15/deltacat/storage/model/sort_key.py +33 -0
  50. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/table_version.py +11 -0
  51. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/types.py +2 -1
  52. deltacat-0.1.18b15/deltacat/tests/aws/test_clients.py +80 -0
  53. deltacat-0.1.18b15/deltacat/tests/compute/common.py +96 -0
  54. {deltacat-0.1.18b13/deltacat/tests → deltacat-0.1.18b15/deltacat/tests/compute/compactor/steps}/test_repartition.py +22 -8
  55. deltacat-0.1.18b15/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  56. {deltacat-0.1.18b13/deltacat/tests → deltacat-0.1.18b15/deltacat/tests/compute}/compactor/utils/test_io.py +47 -5
  57. deltacat-0.1.18b15/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  58. deltacat-0.1.18b15/deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  59. deltacat-0.1.18b15/deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  60. {deltacat-0.1.18b13/deltacat/tests/compactor → deltacat-0.1.18b15/deltacat/tests/compute}/test_compact_partition_params.py +14 -30
  61. deltacat-0.1.18b15/deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  62. deltacat-0.1.18b15/deltacat/tests/compute/testcases.py +390 -0
  63. deltacat-0.1.18b15/deltacat/tests/io/__init__.py +0 -0
  64. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_memcached_object_store.py +5 -4
  65. deltacat-0.1.18b15/deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  66. deltacat-0.1.18b15/deltacat/tests/stats/__init__.py +0 -0
  67. deltacat-0.1.18b15/deltacat/tests/test_utils/__init__.py +0 -0
  68. deltacat-0.1.18b15/deltacat/tests/test_utils/pyarrow.py +32 -0
  69. deltacat-0.1.18b15/deltacat/tests/test_utils/utils.py +13 -0
  70. deltacat-0.1.18b15/deltacat/tests/utils/__init__.py +0 -0
  71. deltacat-0.1.18b15/deltacat/tests/utils/data/__init__.py +0 -0
  72. deltacat-0.1.18b15/deltacat/tests/utils/test_daft.py +76 -0
  73. deltacat-0.1.18b15/deltacat/tests/utils/test_pyarrow.py +133 -0
  74. deltacat-0.1.18b15/deltacat/tests/utils/test_resources.py +48 -0
  75. deltacat-0.1.18b15/deltacat/types/__init__.py +0 -0
  76. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/types/media.py +1 -0
  77. deltacat-0.1.18b15/deltacat/types/partial_download.py +82 -0
  78. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/types/tables.py +1 -0
  79. deltacat-0.1.18b15/deltacat/utils/__init__.py +0 -0
  80. deltacat-0.1.18b15/deltacat/utils/arguments.py +26 -0
  81. deltacat-0.1.18b15/deltacat/utils/daft.py +87 -0
  82. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/performance.py +4 -2
  83. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/placement.py +20 -3
  84. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/pyarrow.py +213 -1
  85. deltacat-0.1.18b15/deltacat/utils/ray_utils/__init__.py +0 -0
  86. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/concurrency.py +26 -1
  87. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/resources.py +72 -1
  88. deltacat-0.1.18b15/deltacat/utils/s3fs.py +21 -0
  89. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/PKG-INFO +14 -2
  90. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/SOURCES.txt +46 -6
  91. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/requires.txt +3 -1
  92. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/setup.py +3 -1
  93. deltacat-0.1.18b13/deltacat/aws/clients.py +0 -69
  94. deltacat-0.1.18b13/deltacat/compute/compactor/model/compact_partition_params.py +0 -153
  95. deltacat-0.1.18b13/deltacat/compute/compactor/model/sort_key.py +0 -98
  96. deltacat-0.1.18b13/deltacat/tests/utils/test_resources.py +0 -45
  97. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/MANIFEST.in +0 -0
  98. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/__init__.py +0 -0
  99. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/constants.py +0 -0
  100. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/redshift/__init__.py +0 -0
  101. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/redshift/model/__init__.py +0 -0
  102. {deltacat-0.1.18b13/deltacat/catalog → deltacat-0.1.18b15/deltacat/benchmarking}/__init__.py +0 -0
  103. {deltacat-0.1.18b13/deltacat/catalog/model → deltacat-0.1.18b15/deltacat/catalog}/__init__.py +0 -0
  104. {deltacat-0.1.18b13/deltacat/compute → deltacat-0.1.18b15/deltacat/catalog/model}/__init__.py +0 -0
  105. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/model/catalog.py +0 -0
  106. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/model/table_definition.py +0 -0
  107. {deltacat-0.1.18b13/deltacat/compute/compactor/model → deltacat-0.1.18b15/deltacat/compute}/__init__.py +0 -0
  108. {deltacat-0.1.18b13/deltacat/compute/compactor/steps → deltacat-0.1.18b15/deltacat/compute/compactor/model}/__init__.py +0 -0
  109. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  110. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  111. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  112. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  113. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  114. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  115. {deltacat-0.1.18b13/deltacat/compute/compactor/utils → deltacat-0.1.18b15/deltacat/compute/compactor/steps}/__init__.py +0 -0
  116. {deltacat-0.1.18b13/deltacat/compute/metastats → deltacat-0.1.18b15/deltacat/compute/compactor/utils}/__init__.py +0 -0
  117. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  118. {deltacat-0.1.18b13/deltacat/compute/metastats/config → deltacat-0.1.18b15/deltacat/compute/compactor_v2}/__init__.py +0 -0
  119. {deltacat-0.1.18b13/deltacat/compute/metastats → deltacat-0.1.18b15/deltacat/compute/compactor_v2}/model/__init__.py +0 -0
  120. {deltacat-0.1.18b13/deltacat/compute/metastats/utils → deltacat-0.1.18b15/deltacat/compute/compactor_v2/steps}/__init__.py +0 -0
  121. {deltacat-0.1.18b13/deltacat/compute/stats → deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils}/__init__.py +0 -0
  122. {deltacat-0.1.18b13/deltacat/compute/stats/models → deltacat-0.1.18b15/deltacat/compute/metastats}/__init__.py +0 -0
  123. {deltacat-0.1.18b13/deltacat/compute/stats/utils → deltacat-0.1.18b15/deltacat/compute/metastats/config}/__init__.py +0 -0
  124. {deltacat-0.1.18b13/deltacat/io → deltacat-0.1.18b15/deltacat/compute/metastats/model}/__init__.py +0 -0
  125. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
  126. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
  127. {deltacat-0.1.18b13/deltacat/io/aws → deltacat-0.1.18b15/deltacat/compute/metastats/utils}/__init__.py +0 -0
  128. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/constants.py +0 -0
  129. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
  130. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
  131. {deltacat-0.1.18b13/deltacat/io/aws/redshift → deltacat-0.1.18b15/deltacat/compute/stats}/__init__.py +0 -0
  132. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/basic.py +0 -0
  133. {deltacat-0.1.18b13/deltacat/storage/model → deltacat-0.1.18b15/deltacat/compute/stats/models}/__init__.py +0 -0
  134. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  135. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/delta_stats.py +0 -0
  136. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  137. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  138. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/stats_result.py +0 -0
  139. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/types.py +0 -0
  140. {deltacat-0.1.18b13/deltacat/tests → deltacat-0.1.18b15/deltacat/compute/stats/utils}/__init__.py +0 -0
  141. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/utils/intervals.py +0 -0
  142. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
  143. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/constants.py +0 -0
  144. {deltacat-0.1.18b13/deltacat/tests/compactor → deltacat-0.1.18b15/deltacat/io}/__init__.py +0 -0
  145. {deltacat-0.1.18b13/deltacat/tests/compactor/utils → deltacat-0.1.18b15/deltacat/io/aws}/__init__.py +0 -0
  146. {deltacat-0.1.18b13/deltacat/tests/io → deltacat-0.1.18b15/deltacat/io/aws/redshift}/__init__.py +0 -0
  147. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
  148. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/dataset.py +0 -0
  149. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/file_object_store.py +0 -0
  150. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/object_store.py +0 -0
  151. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/ray_plasma_object_store.py +0 -0
  152. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/read_api.py +0 -0
  153. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/redis_object_store.py +0 -0
  154. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/s3_object_store.py +0 -0
  155. {deltacat-0.1.18b13/deltacat/tests/stats → deltacat-0.1.18b15/deltacat/storage/model}/__init__.py +0 -0
  156. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/delta.py +0 -0
  157. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/list_result.py +0 -0
  158. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/locator.py +0 -0
  159. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/namespace.py +0 -0
  160. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/partition.py +0 -0
  161. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/stream.py +0 -0
  162. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/table.py +0 -0
  163. {deltacat-0.1.18b13/deltacat/tests/test_utils → deltacat-0.1.18b15/deltacat/tests}/__init__.py +0 -0
  164. {deltacat-0.1.18b13/deltacat/tests/utils → deltacat-0.1.18b15/deltacat/tests/aws}/__init__.py +0 -0
  165. {deltacat-0.1.18b13/deltacat/types → deltacat-0.1.18b15/deltacat/tests/compute}/__init__.py +0 -0
  166. {deltacat-0.1.18b13/deltacat/utils → deltacat-0.1.18b15/deltacat/tests/compute/compactor}/__init__.py +0 -0
  167. {deltacat-0.1.18b13/deltacat/utils/ray_utils → deltacat-0.1.18b15/deltacat/tests/compute/compactor/steps}/__init__.py +0 -0
  168. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_file_object_store.py +0 -0
  169. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
  170. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_redis_object_store.py +0 -0
  171. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_s3_object_store.py +0 -0
  172. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/stats/test_intervals.py +0 -0
  173. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/test_utils/constants.py +0 -0
  174. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  175. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/common.py +0 -0
  176. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/metrics.py +0 -0
  177. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/numpy.py +0 -0
  178. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/pandas.py +0 -0
  179. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/collections.py +0 -0
  180. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/dataset.py +0 -0
  181. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/performance.py +0 -0
  182. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/runtime.py +0 -0
  183. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/dependency_links.txt +0 -0
  184. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/top_level.txt +0 -0
  185. {deltacat-0.1.18b13 → deltacat-0.1.18b15}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 0.1.18b13
3
+ Version: 0.1.18b15
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -18,12 +18,24 @@ Description: # DeltaCAT
18
18
  change-data-capture, data consistency checks, and table repair.
19
19
 
20
20
  ## Getting Started
21
- ---
21
+
22
22
  ### Install
23
+
23
24
  ```
24
25
  pip install deltacat
25
26
  ```
26
27
 
28
+ ### Running Tests
29
+
30
+ ```
31
+ pip3 install virtualenv
32
+ virtualenv test_env
33
+ source test_env/bin/activate
34
+ pip3 install -r requirements.txt
35
+
36
+ pytest
37
+ ```
38
+
27
39
  Platform: UNKNOWN
28
40
  Classifier: Development Status :: 4 - Beta
29
41
  Classifier: Intended Audience :: Developers
@@ -11,8 +11,20 @@ for common table management tasks, including petabyte-scale
11
11
  change-data-capture, data consistency checks, and table repair.
12
12
 
13
13
  ## Getting Started
14
- ---
14
+
15
15
  ### Install
16
+
16
17
  ```
17
18
  pip install deltacat
18
19
  ```
20
+
21
+ ### Running Tests
22
+
23
+ ```
24
+ pip3 install virtualenv
25
+ virtualenv test_env
26
+ source test_env/bin/activate
27
+ pip3 install -r requirements.txt
28
+
29
+ pytest
30
+ ```
@@ -28,7 +28,6 @@ from deltacat.catalog.model.catalog import ( # noqa: F401
28
28
  init,
29
29
  )
30
30
  from deltacat.catalog.model.table_definition import TableDefinition
31
- from deltacat.compute.compactor import SortKey, SortOrder
32
31
  from deltacat.storage import (
33
32
  DistributedDataset,
34
33
  LifecycleState,
@@ -37,13 +36,15 @@ from deltacat.storage import (
37
36
  LocalTable,
38
37
  Namespace,
39
38
  SchemaConsistencyType,
39
+ SortKey,
40
+ SortOrder,
40
41
  )
41
42
  from deltacat.types.media import ContentEncoding, ContentType, TableType
42
43
  from deltacat.types.tables import TableWriteMode
43
44
 
44
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
45
46
 
46
- __version__ = "0.1.18b13"
47
+ __version__ = "0.1.18b15"
47
48
 
48
49
 
49
50
  __all__ = [
@@ -0,0 +1,189 @@
1
+ import logging
2
+ from functools import lru_cache
3
+ from typing import Optional, FrozenSet
4
+ from http import HTTPStatus
5
+
6
+ import boto3
7
+ from boto3.exceptions import ResourceNotExistsError
8
+ from boto3.resources.base import ServiceResource
9
+ from botocore.client import BaseClient
10
+ from botocore.config import Config
11
+ from requests.adapters import Response
12
+ from tenacity import (
13
+ RetryError,
14
+ Retrying,
15
+ wait_fixed,
16
+ retry_if_exception,
17
+ stop_after_delay,
18
+ )
19
+
20
+ from deltacat import logs
21
+ from deltacat.aws.constants import BOTO_MAX_RETRIES
22
+ import requests
23
+
24
+
25
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
+
27
+ BOTO3_PROFILE_NAME_KWARG_KEY = "boto3_profile_name"
28
+ INSTANCE_METADATA_SERVICE_IPV4_URI = "http://169.254.169.254/latest/meta-data/" # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
29
+ RETRYABLE_HTTP_STATUS_CODES = [
30
+ # 429
31
+ HTTPStatus.TOO_MANY_REQUESTS,
32
+ # 5xx
33
+ HTTPStatus.INTERNAL_SERVER_ERROR,
34
+ HTTPStatus.NOT_IMPLEMENTED,
35
+ HTTPStatus.BAD_GATEWAY,
36
+ HTTPStatus.SERVICE_UNAVAILABLE,
37
+ HTTPStatus.GATEWAY_TIMEOUT,
38
+ ]
39
+
40
+
41
+ class RetryIfRetryableHTTPStatusCode(retry_if_exception):
42
+ """
43
+ Retry strategy that retries if the exception is an ``HTTPError`` with
44
+ a status code in the retryable errors list.
45
+ """
46
+
47
+ def __init__(self):
48
+ def is_retryable_error(exception):
49
+ return (
50
+ isinstance(exception, requests.exceptions.HTTPError)
51
+ and exception.response.status_code in RETRYABLE_HTTP_STATUS_CODES
52
+ )
53
+
54
+ super().__init__(predicate=is_retryable_error)
55
+
56
+
57
+ def _log_attempt_number(retry_state):
58
+ """Log a warning with the number of the attempt that just finished."""
59
+ logger.warning(f"Retrying: {retry_state.attempt_number}...")
60
+
61
+
62
+ def _get_url(url: str, get_url_kwargs=None):
63
+ if get_url_kwargs is None:
64
+ get_url_kwargs = {}
65
+ resp = requests.get(url, **get_url_kwargs)
66
+ resp.raise_for_status()
67
+ return resp
68
+
69
+
70
+ def retrying_get(
71
+ url: str,
72
+ retry_strategy,
73
+ wait_strategy,
74
+ stop_strategy,
75
+ short_circuit_on_status: FrozenSet[int] = {HTTPStatus.OK},
76
+ ) -> Optional[Response]:
77
+ """Retries a request to the given URL until it succeeds.
78
+
79
+ Args:
80
+ retry_strategy (Callable): A function that returns a retry strategy.
81
+ wait_strategy (Callable): The wait strategy applied between attempts (passed to ``Retrying`` as-is).
82
+ stop_strategy (Callable): The stop strategy that bounds retrying (passed to ``Retrying`` as-is).
83
+ url (str): The URL to retry.
84
+
85
+ Returns:
86
+ Optional[Response]: The response from the URL, or None if the request
87
+ failed after the maximum number of retries.
88
+ """
89
+ try:
90
+ resp = _get_url(url)
91
+ if resp.status_code in short_circuit_on_status:
92
+ return resp
93
+ for attempt in Retrying(
94
+ retry=retry_strategy(),
95
+ wait=wait_strategy,
96
+ stop=stop_strategy,
97
+ after=_log_attempt_number,
98
+ ):
99
+ with attempt:
100
+ resp = _get_url(url)
101
+ return resp
102
+ except RetryError as re:
103
+ logger.error(f"Failed to retry URL: {url} - {re}")
104
+ logger.info(f"Unable to get from URL: {url}")
105
+ return None
106
+
107
+
108
+ def block_until_instance_metadata_service_returns_success(
109
+ url=INSTANCE_METADATA_SERVICE_IPV4_URI,
110
+ retry_strategy=RetryIfRetryableHTTPStatusCode,
111
+ wait_strategy=wait_fixed(2), # wait 2 seconds before retrying,
112
+ stop_strategy=stop_after_delay(60 * 10), # stop trying after 10 minutes
113
+ ) -> Optional[Response]:
114
+ """Blocks until the instance metadata service returns a successful response.
115
+
116
+ Args:
117
+ retry_strategy (Callable): A function that returns a retry strategy.
118
+ wait_strategy (Callable): The wait strategy applied between attempts (defaults to a fixed 2 second wait).
119
+ stop_strategy (Callable): The stop strategy that bounds retrying (defaults to stopping after 10 minutes).
120
+ url (str): The URL of the instance metadata service.
121
+
122
+ Returns:
123
+ Optional[Response]: The response from the instance metadata service,
124
+ or None if the request failed after the maximum number of retries.
125
+
126
+ https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
127
+ """
128
+ # We expect a 403 HTTP status code when deltacat is not running on an EC2 instance; in that case we do not want to block.
129
+ return retrying_get(
130
+ url,
131
+ retry_strategy,
132
+ wait_strategy,
133
+ stop_strategy,
134
+ short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
135
+ )
136
+
137
+
138
+ def _get_session_from_kwargs(input_kwargs):
139
+ block_until_instance_metadata_service_returns_success()
140
+ if input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY) is not None:
141
+ boto3_session = boto3.Session(
142
+ profile_name=input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY)
143
+ )
144
+ input_kwargs.pop(BOTO3_PROFILE_NAME_KWARG_KEY)
145
+ return boto3_session
146
+ else:
147
+ return boto3.Session()
148
+
149
+
150
+ def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
151
+ boto3_session = _get_session_from_kwargs(kwargs)
152
+
153
+ boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
154
+ return boto3_session.resource(
155
+ name,
156
+ region,
157
+ config=boto_config,
158
+ **kwargs,
159
+ )
160
+
161
+
162
+ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
163
+ try:
164
+ # try to re-use a client from the resource cache first
165
+ return resource_cache(name, region, **kwargs).meta.client
166
+ except ResourceNotExistsError:
167
+ # fall back for clients without an associated resource
168
+ boto3_session = _get_session_from_kwargs(kwargs)
169
+ boto_config = Config(
170
+ retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
171
+ )
172
+ return boto3_session.client(
173
+ name,
174
+ region,
175
+ config=boto_config,
176
+ **kwargs,
177
+ )
178
+
179
+
180
+ def resource_cache(name: str, region: Optional[str], **kwargs) -> ServiceResource:
181
+ # we don't use the @lru_cache decorator because Ray can't pickle it
182
+ cached_function = lru_cache()(_resource)
183
+ return cached_function(name, region, **kwargs)
184
+
185
+
186
+ def client_cache(name: str, region: Optional[str], **kwargs) -> BaseClient:
187
+ # we don't use the @lru_cache decorator because Ray can't pickle it
188
+ cached_function = lru_cache()(_client)
189
+ return cached_function(name, region, **kwargs)
@@ -170,6 +170,10 @@ class ManifestMeta(dict):
170
170
  def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
171
171
  return self.get("content_type_parameters")
172
172
 
173
+ @content_type_parameters.setter
174
+ def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
175
+ self["content_type_parameters"] = params
176
+
173
177
  @property
174
178
  def credentials(self) -> Optional[Dict[str, str]]:
175
179
  return self.get("credentials")
@@ -3,6 +3,8 @@ import multiprocessing
3
3
  from functools import partial
4
4
  from typing import Any, Callable, Dict, Generator, List, Optional, Union
5
5
  from uuid import uuid4
6
+ from botocore.config import Config
7
+ from deltacat.aws.constants import BOTO_MAX_RETRIES
6
8
 
7
9
  import pyarrow as pa
8
10
  import ray
@@ -39,6 +41,7 @@ from deltacat.types.tables import (
39
41
  TABLE_TYPE_TO_READER_FUNC,
40
42
  get_table_length,
41
43
  )
44
+ from deltacat.types.partial_download import PartialFileDownloadParams
42
45
  from deltacat.utils.common import ReadKwargsProvider
43
46
 
44
47
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -197,6 +200,7 @@ def read_file(
197
200
  column_names: Optional[List[str]] = None,
198
201
  include_columns: Optional[List[str]] = None,
199
202
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
203
+ partial_file_download_params: Optional[PartialFileDownloadParams] = None,
200
204
  **s3_client_kwargs,
201
205
  ) -> LocalTable:
202
206
 
@@ -209,6 +213,7 @@ def read_file(
209
213
  column_names,
210
214
  include_columns,
211
215
  file_reader_kwargs_provider,
216
+ partial_file_download_params,
212
217
  **s3_client_kwargs,
213
218
  )
214
219
  return table
@@ -217,6 +222,13 @@ def read_file(
217
222
  # Timeout error not caught by botocore
218
223
  raise RetryableError(f"Retry table download from: {s3_url}") from e
219
224
  raise NonRetryableError(f"Failed table download from: {s3_url}") from e
225
+ except BaseException as e:
226
+ logger.warn(
227
+ f"Read has failed for {s3_url} and content_type={content_type} "
228
+ f"and encoding={content_encoding}. Error: {e}",
229
+ exc_info=True,
230
+ )
231
+ raise e
220
232
 
221
233
 
222
234
  def upload_sliced_table(
@@ -385,14 +397,16 @@ def download_manifest_entry(
385
397
  content_encoding: Optional[ContentEncoding] = None,
386
398
  ) -> LocalTable:
387
399
 
400
+ conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
388
401
  s3_client_kwargs = (
389
402
  {
390
403
  "aws_access_key_id": token_holder["accessKeyId"],
391
404
  "aws_secret_access_key": token_holder["secretAccessKey"],
392
405
  "aws_session_token": token_holder["sessionToken"],
406
+ "config": conf,
393
407
  }
394
408
  if token_holder
395
- else {}
409
+ else {"config": conf}
396
410
  )
397
411
  if not content_type:
398
412
  content_type = manifest_entry.meta.content_type
@@ -409,6 +423,14 @@ def download_manifest_entry(
409
423
  s3_url = manifest_entry.uri
410
424
  if s3_url is None:
411
425
  s3_url = manifest_entry.url
426
+
427
+ partial_file_download_params = None
428
+ if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
429
+ for type_params in manifest_entry.meta.content_type_parameters:
430
+ if isinstance(type_params, PartialFileDownloadParams):
431
+ partial_file_download_params = type_params
432
+ break
433
+
412
434
  # @retry decorator can't be pickled by Ray, so wrap download in Retrying
413
435
  retrying = Retrying(
414
436
  wait=wait_random_exponential(multiplier=1, max=60),
@@ -424,6 +446,7 @@ def download_manifest_entry(
424
446
  column_names,
425
447
  include_columns,
426
448
  file_reader_kwargs_provider,
449
+ partial_file_download_params,
427
450
  **s3_client_kwargs,
428
451
  )
429
452
  return table
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+
5
+
6
+ # Benchmarks for retrieving a single column in the Parquet file
7
+ SINGLE_COLUMN_BENCHMARKS = {
8
+ "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", ["a"]),
9
+ "TPCH-lineitems-200MB-2RG": (
10
+ "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
11
+ ["L_ORDERKEY"],
12
+ ),
13
+ }
14
+
15
+ # Benchmarks for retrieving all columns in the Parquet file
16
+ ALL_COLUMN_BENCHMARKS = {
17
+ "mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", None),
18
+ "TPCH-lineitems-200MB-2RG": (
19
+ "s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
20
+ None,
21
+ ),
22
+ }
23
+
24
+
25
+ @pytest.mark.benchmark(group="num_rowgroups_single_column")
26
+ @pytest.mark.parametrize(
27
+ ["name", "path", "columns"],
28
+ [
29
+ (name, path, columns)
30
+ for name, (path, columns) in SINGLE_COLUMN_BENCHMARKS.items()
31
+ ],
32
+ ids=[name for name in SINGLE_COLUMN_BENCHMARKS],
33
+ )
34
+ def test_read_parquet_num_rowgroups_single_column(
35
+ name, path, columns, read_fn, benchmark
36
+ ):
37
+ data = benchmark(read_fn, path, columns=columns)
38
+ if columns is not None:
39
+ assert data.column_names == columns
40
+
41
+
42
+ @pytest.mark.benchmark(group="num_rowgroups_all_columns")
43
+ @pytest.mark.parametrize(
44
+ ["name", "path", "columns"],
45
+ [(name, path, columns) for name, (path, columns) in ALL_COLUMN_BENCHMARKS.items()],
46
+ ids=[name for name in ALL_COLUMN_BENCHMARKS],
47
+ )
48
+ def test_read_parquet_num_rowgroups_all_columns(
49
+ name, path, columns, read_fn, benchmark
50
+ ):
51
+ data = benchmark(read_fn, path, columns=columns)
52
+ if columns is not None:
53
+ assert data.column_names == columns
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+
3
+ import pyarrow as pa
4
+ import pyarrow.fs as pafs
5
+ import pyarrow.parquet as papq
6
+ import pytest
7
+
8
+ from deltacat.utils.pyarrow import s3_file_to_table
9
+ from deltacat.types.media import (
10
+ ContentEncoding,
11
+ ContentType,
12
+ )
13
+
14
+
15
+ def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
16
+ assert path.startswith(
17
+ "s3://"
18
+ ), f"Expected file path to start with 's3://', but got {path}."
19
+ fs = pafs.S3FileSystem()
20
+ path = path.replace("s3://", "")
21
+ return papq.read_table(path, columns=columns, filesystem=fs)
22
+
23
+
24
+ def deltacat_read(path: str, columns: list[str] | None = None) -> pa.Table:
25
+ assert path.startswith("s3://")
26
+ return s3_file_to_table(
27
+ path,
28
+ content_type=ContentType.PARQUET,
29
+ content_encoding=ContentEncoding.IDENTITY,
30
+ column_names=None, # Parquet files are schemaful
31
+ include_columns=columns,
32
+ )
33
+
34
+
35
+ def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
36
+ try:
37
+ import daft
38
+ except ImportError:
39
+ raise ImportError(
40
+ "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
41
+ )
42
+
43
+ tbl = daft.table.Table.read_parquet(path, columns=columns)
44
+ return tbl.to_arrow()
45
+
46
+
47
+ @pytest.fixture(
48
+ params=[
49
+ daft_table_read,
50
+ pyarrow_read,
51
+ deltacat_read,
52
+ ],
53
+ ids=[
54
+ "daft_table",
55
+ "pyarrow",
56
+ "deltacat",
57
+ ],
58
+ )
59
+ def read_fn(request):
60
+ """Fixture which returns the function to read a PyArrow table from a path"""
61
+ return request.param
@@ -5,7 +5,7 @@ import ray
5
5
 
6
6
  from deltacat.catalog.model.catalog import Catalog, all_catalogs
7
7
  from deltacat.catalog.model.table_definition import TableDefinition
8
- from deltacat.compute.compactor.model.sort_key import SortKey
8
+ from deltacat.storage.model.sort_key import SortKey
9
9
  from deltacat.storage.model.list_result import ListResult
10
10
  from deltacat.storage.model.namespace import Namespace
11
11
  from deltacat.storage.model.types import (
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Union
3
3
  import pyarrow as pa
4
4
 
5
5
  from deltacat.catalog.model.table_definition import TableDefinition
6
- from deltacat.compute.compactor.model.sort_key import SortKey
6
+ from deltacat.storage.model.sort_key import SortKey
7
7
  from deltacat.storage.model.list_result import ListResult
8
8
  from deltacat.storage.model.namespace import Namespace
9
9
  from deltacat.storage.model.types import (
@@ -13,7 +13,6 @@ from deltacat.compute.compactor.model.round_completion_info import (
13
13
  RoundCompletionInfo,
14
14
  HighWatermark,
15
15
  )
16
- from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder
17
16
 
18
17
  __all__ = [
19
18
  "DeltaAnnotated",
@@ -27,6 +26,4 @@ __all__ = [
27
26
  "PyArrowWriteResult",
28
27
  "RoundCompletionInfo",
29
28
  "HighWatermark",
30
- "SortKey",
31
- "SortOrder",
32
29
  ]