deltacat 2.0.0b7__tar.gz → 2.0.0b10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. {deltacat-2.0.0b7/deltacat.egg-info → deltacat-2.0.0b10}/PKG-INFO +2 -1
  2. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/__init__.py +27 -6
  3. deltacat-2.0.0b10/deltacat/api.py +523 -0
  4. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/aws/s3u.py +2 -2
  5. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/conftest.py +1 -1
  6. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/main/impl.py +12 -6
  7. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/model/catalog.py +65 -47
  8. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/model/properties.py +1 -3
  9. deltacat-2.0.0b10/deltacat/compute/__init__.py +14 -0
  10. deltacat-2.0.0b10/deltacat/compute/converter/constants.py +9 -0
  11. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/converter_session.py +78 -36
  12. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/model/convert_input.py +24 -4
  13. deltacat-2.0.0b10/deltacat/compute/converter/model/convert_result.py +61 -0
  14. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/model/converter_session_params.py +52 -10
  15. deltacat-2.0.0b10/deltacat/compute/converter/pyiceberg/overrides.py +254 -0
  16. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/steps/convert.py +84 -36
  17. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/steps/dedupe.py +25 -4
  18. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/utils/convert_task_options.py +42 -13
  19. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  20. deltacat-2.0.0b10/deltacat/compute/converter/utils/io.py +114 -0
  21. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/utils/s3u.py +13 -4
  22. deltacat-2.0.0b10/deltacat/compute/jobs/client.py +404 -0
  23. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/constants.py +4 -4
  24. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/daft/daft_scan.py +7 -3
  25. deltacat-2.0.0b10/deltacat/daft/translator.py +126 -0
  26. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/basic_logging.py +5 -3
  27. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/hello_world.py +4 -2
  28. deltacat-2.0.0b10/deltacat/examples/indexer/indexer.py +163 -0
  29. deltacat-2.0.0b10/deltacat/examples/indexer/job_runner.py +199 -0
  30. deltacat-2.0.0b10/deltacat/io/__init__.py +13 -0
  31. deltacat-2.0.0b10/deltacat/io/dataset/deltacat_dataset.py +91 -0
  32. deltacat-2.0.0b10/deltacat/io/datasink/deltacat_datasink.py +207 -0
  33. deltacat-2.0.0b10/deltacat/io/datasource/deltacat_datasource.py +580 -0
  34. deltacat-2.0.0b10/deltacat/io/reader/deltacat_read_api.py +172 -0
  35. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/__init__.py +2 -0
  36. deltacat-2.0.0b10/deltacat/storage/model/expression/__init__.py +47 -0
  37. deltacat-2.0.0b10/deltacat/storage/model/expression/expression.py +656 -0
  38. deltacat-2.0.0b10/deltacat/storage/model/expression/visitor.py +248 -0
  39. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/metafile.py +74 -42
  40. deltacat-2.0.0b10/deltacat/storage/model/scan/push_down.py +46 -0
  41. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/types.py +5 -3
  42. deltacat-2.0.0b10/deltacat/storage/rivulet/__init__.py +11 -0
  43. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/converter/test_convert_session.py +209 -46
  44. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  45. deltacat-2.0.0b10/deltacat/tests/storage/model/test_expression.py +327 -0
  46. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  47. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  48. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  49. deltacat-2.0.0b10/deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  50. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  51. deltacat-2.0.0b10/deltacat/tests/test_deltacat_api.py +80 -0
  52. deltacat-2.0.0b10/deltacat/tests/test_utils/__init__.py +0 -0
  53. deltacat-2.0.0b10/deltacat/tests/utils/__init__.py +0 -0
  54. deltacat-2.0.0b10/deltacat/tests/utils/data/__init__.py +0 -0
  55. deltacat-2.0.0b10/deltacat/tests/utils/ray_utils/__init__.py +0 -0
  56. deltacat-2.0.0b10/deltacat/types/__init__.py +0 -0
  57. deltacat-2.0.0b10/deltacat/types/media.py +193 -0
  58. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/types/tables.py +35 -7
  59. deltacat-2.0.0b10/deltacat/utils/__init__.py +0 -0
  60. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/daft.py +2 -2
  61. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/filesystem.py +39 -9
  62. deltacat-2.0.0b10/deltacat/utils/polars.py +128 -0
  63. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/pyarrow.py +151 -15
  64. deltacat-2.0.0b10/deltacat/utils/ray_utils/__init__.py +0 -0
  65. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/ray_utils/concurrency.py +1 -1
  66. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/ray_utils/runtime.py +56 -4
  67. deltacat-2.0.0b10/deltacat/utils/url.py +1284 -0
  68. {deltacat-2.0.0b7 → deltacat-2.0.0b10/deltacat.egg-info}/PKG-INFO +2 -1
  69. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat.egg-info/SOURCES.txt +25 -0
  70. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat.egg-info/requires.txt +9 -5
  71. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/setup.py +11 -5
  72. deltacat-2.0.0b7/deltacat/api.py +0 -168
  73. deltacat-2.0.0b7/deltacat/compute/converter/constants.py +0 -4
  74. deltacat-2.0.0b7/deltacat/compute/converter/pyiceberg/overrides.py +0 -135
  75. deltacat-2.0.0b7/deltacat/compute/converter/utils/io.py +0 -43
  76. deltacat-2.0.0b7/deltacat/storage/model/scan/push_down.py +0 -19
  77. deltacat-2.0.0b7/deltacat/storage/rivulet/__init__.py +0 -11
  78. deltacat-2.0.0b7/deltacat/tests/test_deltacat_api.py +0 -39
  79. deltacat-2.0.0b7/deltacat/types/media.py +0 -95
  80. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/LICENSE +0 -0
  81. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/MANIFEST.in +0 -0
  82. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/README.md +0 -0
  83. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/annotations.py +0 -0
  84. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/aws/__init__.py +0 -0
  85. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/aws/clients.py +0 -0
  86. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/aws/constants.py +0 -0
  87. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/__init__.py +0 -0
  88. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/benchmark_engine.py +0 -0
  89. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  90. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/benchmark_report.py +0 -0
  91. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/benchmark_suite.py +0 -0
  92. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/data/__init__.py +0 -0
  93. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/data/random_row_generator.py +0 -0
  94. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/data/row_generator.py +0 -0
  95. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/benchmarking/test_benchmark_pipeline.py +0 -0
  96. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/__init__.py +0 -0
  97. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/delegate.py +0 -0
  98. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/iceberg/__init__.py +0 -0
  99. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/iceberg/iceberg_catalog_config.py +0 -0
  100. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/iceberg/impl.py +0 -0
  101. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/iceberg/overrides.py +0 -0
  102. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/interface.py +0 -0
  103. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/main/__init__.py +0 -0
  104. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/model/__init__.py +0 -0
  105. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/catalog/model/table_definition.py +0 -0
  106. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/__init__.py +0 -0
  107. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/compaction_session.py +0 -0
  108. {deltacat-2.0.0b7/deltacat/compute → deltacat-2.0.0b10/deltacat/compute/compactor/model}/__init__.py +0 -0
  109. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  110. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  111. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  112. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  113. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  114. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  115. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  116. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  117. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  118. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  119. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  120. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  121. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  122. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/model/table_object_store.py +0 -0
  123. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/repartition_session.py +0 -0
  124. {deltacat-2.0.0b7/deltacat/compute/compactor/model → deltacat-2.0.0b10/deltacat/compute/compactor/steps}/__init__.py +0 -0
  125. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/steps/dedupe.py +0 -0
  126. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
  127. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/steps/materialize.py +0 -0
  128. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/steps/repartition.py +0 -0
  129. {deltacat-2.0.0b7/deltacat/compute/compactor/steps → deltacat-2.0.0b10/deltacat/compute/compactor/utils}/__init__.py +0 -0
  130. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/utils/io.py +0 -0
  131. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  132. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  133. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/utils/sort_key.py +0 -0
  134. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  135. {deltacat-2.0.0b7/deltacat/compute/compactor/utils → deltacat-2.0.0b10/deltacat/compute/compactor_v2}/__init__.py +0 -0
  136. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
  137. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/constants.py +0 -0
  138. {deltacat-2.0.0b7/deltacat/compute/compactor_v2 → deltacat-2.0.0b10/deltacat/compute/compactor_v2/deletes}/__init__.py +0 -0
  139. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
  140. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
  141. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
  142. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
  143. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
  144. {deltacat-2.0.0b7/deltacat/compute/compactor_v2/deletes → deltacat-2.0.0b10/deltacat/compute/compactor_v2/model}/__init__.py +0 -0
  145. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
  146. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  147. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  148. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
  149. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
  150. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  151. {deltacat-2.0.0b7/deltacat/compute/compactor_v2/model → deltacat-2.0.0b10/deltacat/compute/compactor_v2/private}/__init__.py +0 -0
  152. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/private/compaction_utils.py +0 -0
  153. {deltacat-2.0.0b7/deltacat/compute/compactor_v2/private → deltacat-2.0.0b10/deltacat/compute/compactor_v2/steps}/__init__.py +0 -0
  154. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
  155. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/steps/merge.py +0 -0
  156. {deltacat-2.0.0b7/deltacat/compute/compactor_v2/steps → deltacat-2.0.0b10/deltacat/compute/compactor_v2/utils}/__init__.py +0 -0
  157. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
  158. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
  159. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
  160. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/io.py +0 -0
  161. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
  162. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
  163. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
  164. {deltacat-2.0.0b7/deltacat/compute/compactor_v2/utils → deltacat-2.0.0b10/deltacat/compute/converter}/__init__.py +0 -0
  165. {deltacat-2.0.0b7/deltacat/compute/converter → deltacat-2.0.0b10/deltacat/compute/converter/model}/__init__.py +0 -0
  166. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/model/convert_input_files.py +0 -0
  167. {deltacat-2.0.0b7/deltacat/compute/converter/model → deltacat-2.0.0b10/deltacat/compute/converter/pyiceberg}/__init__.py +0 -0
  168. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/pyiceberg/catalog.py +0 -0
  169. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +0 -0
  170. {deltacat-2.0.0b7/deltacat/compute/converter/pyiceberg → deltacat-2.0.0b10/deltacat/compute/converter/steps}/__init__.py +0 -0
  171. {deltacat-2.0.0b7/deltacat/compute/converter/steps → deltacat-2.0.0b10/deltacat/compute/converter/utils}/__init__.py +0 -0
  172. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/converter/utils/converter_session_utils.py +0 -0
  173. {deltacat-2.0.0b7/deltacat/compute/converter/utils → deltacat-2.0.0b10/deltacat/compute/jobs}/__init__.py +0 -0
  174. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/merge_on_read/__init__.py +0 -0
  175. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/merge_on_read/daft.py +0 -0
  176. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/merge_on_read/model/__init__.py +0 -0
  177. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
  178. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  179. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
  180. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/resource_estimation/__init__.py +0 -0
  181. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/resource_estimation/delta.py +0 -0
  182. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/resource_estimation/manifest.py +0 -0
  183. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/resource_estimation/model.py +0 -0
  184. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/resource_estimation/parquet.py +0 -0
  185. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/__init__.py +0 -0
  186. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/models/__init__.py +0 -0
  187. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  188. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/models/delta_stats.py +0 -0
  189. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  190. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  191. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/models/stats_result.py +0 -0
  192. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/compute/stats/types.py +0 -0
  193. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/daft/__init__.py +0 -0
  194. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/daft/model.py +0 -0
  195. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/env.py +0 -0
  196. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/__init__.py +0 -0
  197. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/common/__init__.py +0 -0
  198. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/common/fixtures.py +0 -0
  199. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/iceberg/__init__.py +0 -0
  200. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/iceberg/iceberg_bucket_writer.py +0 -0
  201. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/examples/iceberg/iceberg_reader.py +0 -0
  202. {deltacat-2.0.0b7/deltacat/experimental → deltacat-2.0.0b10/deltacat/examples/indexer}/__init__.py +0 -0
  203. {deltacat-2.0.0b7/deltacat/io → deltacat-2.0.0b10/deltacat/examples/indexer/aws}/__init__.py +0 -0
  204. {deltacat-2.0.0b7/deltacat/storage/iceberg → deltacat-2.0.0b10/deltacat/examples/indexer/gcp}/__init__.py +0 -0
  205. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/exceptions.py +0 -0
  206. {deltacat-2.0.0b7/deltacat/storage/main → deltacat-2.0.0b10/deltacat/experimental}/__init__.py +0 -0
  207. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/experimental/daft/__init__.py +0 -0
  208. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/experimental/daft/daft_catalog.py +0 -0
  209. {deltacat-2.0.0b7/deltacat/storage/model → deltacat-2.0.0b10/deltacat/io/dataset}/__init__.py +0 -0
  210. {deltacat-2.0.0b7/deltacat/storage/model/scan → deltacat-2.0.0b10/deltacat/io/datasink}/__init__.py +0 -0
  211. {deltacat-2.0.0b7/deltacat/storage/rivulet/arrow → deltacat-2.0.0b10/deltacat/io/datasource}/__init__.py +0 -0
  212. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/io/file_object_store.py +0 -0
  213. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/io/memcached_object_store.py +0 -0
  214. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/io/object_store.py +0 -0
  215. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/io/ray_plasma_object_store.py +0 -0
  216. {deltacat-2.0.0b7/deltacat/storage/rivulet/fs → deltacat-2.0.0b10/deltacat/io/reader}/__init__.py +0 -0
  217. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/io/redis_object_store.py +0 -0
  218. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/io/s3_object_store.py +0 -0
  219. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/logs.py +0 -0
  220. {deltacat-2.0.0b7/deltacat/storage/rivulet/metastore → deltacat-2.0.0b10/deltacat/storage/iceberg}/__init__.py +0 -0
  221. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/iceberg/iceberg_scan_planner.py +0 -0
  222. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/iceberg/impl.py +0 -0
  223. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/iceberg/model.py +0 -0
  224. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/interface.py +0 -0
  225. {deltacat-2.0.0b7/deltacat/storage/rivulet/reader → deltacat-2.0.0b10/deltacat/storage/main}/__init__.py +0 -0
  226. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/main/impl.py +0 -0
  227. {deltacat-2.0.0b7/deltacat/storage/rivulet/schema → deltacat-2.0.0b10/deltacat/storage/model}/__init__.py +0 -0
  228. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/delta.py +0 -0
  229. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/interop.py +0 -0
  230. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/list_result.py +0 -0
  231. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/locator.py +0 -0
  232. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/manifest.py +0 -0
  233. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/namespace.py +0 -0
  234. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/partition.py +0 -0
  235. {deltacat-2.0.0b7/deltacat/storage/rivulet/writer → deltacat-2.0.0b10/deltacat/storage/model/scan}/__init__.py +0 -0
  236. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/scan/scan_plan.py +0 -0
  237. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/scan/scan_task.py +0 -0
  238. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/schema.py +0 -0
  239. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/shard.py +0 -0
  240. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/sort_key.py +0 -0
  241. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/stream.py +0 -0
  242. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/table.py +0 -0
  243. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/table_version.py +0 -0
  244. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/transaction.py +0 -0
  245. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/model/transform.py +0 -0
  246. {deltacat-2.0.0b7/deltacat/storage/util → deltacat-2.0.0b10/deltacat/storage/rivulet/arrow}/__init__.py +0 -0
  247. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/arrow/serializer.py +0 -0
  248. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/dataset.py +0 -0
  249. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/dataset_executor.py +0 -0
  250. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/feather/__init__.py +0 -0
  251. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/feather/file_reader.py +0 -0
  252. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/feather/serializer.py +0 -0
  253. {deltacat-2.0.0b7/deltacat/tests → deltacat-2.0.0b10/deltacat/storage/rivulet/fs}/__init__.py +0 -0
  254. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/fs/file_provider.py +0 -0
  255. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/fs/file_store.py +0 -0
  256. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/fs/input_file.py +0 -0
  257. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/fs/output_file.py +0 -0
  258. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/logical_plan.py +0 -0
  259. {deltacat-2.0.0b7/deltacat/tests/aws → deltacat-2.0.0b10/deltacat/storage/rivulet/metastore}/__init__.py +0 -0
  260. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/metastore/delta.py +0 -0
  261. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/metastore/json_sst.py +0 -0
  262. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/metastore/sst.py +0 -0
  263. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/metastore/sst_interval_tree.py +0 -0
  264. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/mvp/Table.py +0 -0
  265. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/mvp/__init__.py +0 -0
  266. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/parquet/__init__.py +0 -0
  267. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  268. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/parquet/file_reader.py +0 -0
  269. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/parquet/serializer.py +0 -0
  270. {deltacat-2.0.0b7/deltacat/tests/catalog → deltacat-2.0.0b10/deltacat/storage/rivulet/reader}/__init__.py +0 -0
  271. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/block_scanner.py +0 -0
  272. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/data_reader.py +0 -0
  273. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/data_scan.py +0 -0
  274. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/dataset_metastore.py +0 -0
  275. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/dataset_reader.py +0 -0
  276. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/pyarrow_data_reader.py +0 -0
  277. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/query_expression.py +0 -0
  278. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/reader/reader_type_registrar.py +0 -0
  279. {deltacat-2.0.0b7/deltacat/tests/compute → deltacat-2.0.0b10/deltacat/storage/rivulet/schema}/__init__.py +0 -0
  280. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/schema/datatype.py +0 -0
  281. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/schema/schema.py +0 -0
  282. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/serializer.py +0 -0
  283. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/serializer_factory.py +0 -0
  284. {deltacat-2.0.0b7/deltacat/tests/compute/compactor → deltacat-2.0.0b10/deltacat/storage/rivulet/writer}/__init__.py +0 -0
  285. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/writer/dataset_writer.py +0 -0
  286. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/rivulet/writer/memtable_dataset_writer.py +0 -0
  287. {deltacat-2.0.0b7/deltacat/tests/compute/compactor/steps → deltacat-2.0.0b10/deltacat/storage/util}/__init__.py +0 -0
  288. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/storage/util/scan_planner.py +0 -0
  289. {deltacat-2.0.0b7/deltacat/tests/compute/compactor/utils → deltacat-2.0.0b10/deltacat/tests}/__init__.py +0 -0
  290. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/__init__.py +0 -0
  291. {deltacat-2.0.0b7/deltacat/tests/compute/compactor_v2 → deltacat-2.0.0b10/deltacat/tests/_io/reader}/__init__.py +0 -0
  292. /deltacat-2.0.0b7/deltacat/tests/compute/compactor_v2/utils/__init__.py → /deltacat-2.0.0b10/deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  293. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/test_cloudpickle_bug_fix.py +0 -0
  294. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/test_file_object_store.py +0 -0
  295. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/test_memcached_object_store.py +0 -0
  296. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/test_ray_plasma_object_store.py +0 -0
  297. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/test_redis_object_store.py +0 -0
  298. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/_io/test_s3_object_store.py +0 -0
  299. {deltacat-2.0.0b7/deltacat/tests/compute/converter → deltacat-2.0.0b10/deltacat/tests/aws}/__init__.py +0 -0
  300. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/aws/test_clients.py +0 -0
  301. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/aws/test_s3u.py +0 -0
  302. {deltacat-2.0.0b7/deltacat/tests/compute/resource_estimation → deltacat-2.0.0b10/deltacat/tests/catalog}/__init__.py +0 -0
  303. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/catalog/test_catalogs.py +0 -0
  304. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/catalog/test_default_catalog_impl.py +0 -0
  305. {deltacat-2.0.0b7/deltacat/tests/compute/resource_estimation/data → deltacat-2.0.0b10/deltacat/tests/compute}/__init__.py +0 -0
  306. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
  307. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
  308. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
  309. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
  310. {deltacat-2.0.0b7/deltacat/tests/storage → deltacat-2.0.0b10/deltacat/tests/compute/compactor}/__init__.py +0 -0
  311. {deltacat-2.0.0b7/deltacat/tests/storage/main → deltacat-2.0.0b10/deltacat/tests/compute/compactor/steps}/__init__.py +0 -0
  312. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
  313. {deltacat-2.0.0b7/deltacat/tests/storage/model → deltacat-2.0.0b10/deltacat/tests/compute/compactor/utils}/__init__.py +0 -0
  314. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
  315. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
  316. {deltacat-2.0.0b7/deltacat/tests/storage/rivulet → deltacat-2.0.0b10/deltacat/tests/compute/compactor_v2}/__init__.py +0 -0
  317. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -0
  318. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  319. {deltacat-2.0.0b7/deltacat/tests/storage/rivulet/fs → deltacat-2.0.0b10/deltacat/tests/compute/compactor_v2/utils}/__init__.py +0 -0
  320. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
  321. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/conftest.py +0 -0
  322. {deltacat-2.0.0b7/deltacat/tests/storage/rivulet/schema → deltacat-2.0.0b10/deltacat/tests/compute/converter}/__init__.py +0 -0
  323. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/converter/conftest.py +0 -0
  324. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/converter/utils.py +0 -0
  325. {deltacat-2.0.0b7/deltacat/tests/storage/rivulet/writer → deltacat-2.0.0b10/deltacat/tests/compute/resource_estimation}/__init__.py +0 -0
  326. {deltacat-2.0.0b7/deltacat/tests/test_utils → deltacat-2.0.0b10/deltacat/tests/compute/resource_estimation/data}/__init__.py +0 -0
  327. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
  328. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
  329. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_compact_partition_incremental.py +0 -0
  330. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +0 -0
  331. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
  332. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_compact_partition_rebase.py +0 -0
  333. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +0 -0
  334. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_util_common.py +0 -0
  335. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_util_constant.py +0 -0
  336. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
  337. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
  338. {deltacat-2.0.0b7/deltacat/tests/utils → deltacat-2.0.0b10/deltacat/tests/storage}/__init__.py +0 -0
  339. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/conftest.py +0 -0
  340. {deltacat-2.0.0b7/deltacat/tests/utils/data → deltacat-2.0.0b10/deltacat/tests/storage/main}/__init__.py +0 -0
  341. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/main/test_main_storage.py +0 -0
  342. {deltacat-2.0.0b7/deltacat/tests/utils/ray_utils → deltacat-2.0.0b10/deltacat/tests/storage/model}/__init__.py +0 -0
  343. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_delete_parameters.py +0 -0
  344. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_manifest.py +0 -0
  345. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_metafile_io.py +0 -0
  346. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_schema.py +0 -0
  347. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_shard.py +0 -0
  348. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_table_version.py +0 -0
  349. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/model/test_transaction.py +0 -0
  350. {deltacat-2.0.0b7/deltacat/types → deltacat-2.0.0b10/deltacat/tests/storage/rivulet}/__init__.py +0 -0
  351. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/conftest.py +0 -0
  352. {deltacat-2.0.0b7/deltacat/utils → deltacat-2.0.0b10/deltacat/tests/storage/rivulet/fs}/__init__.py +0 -0
  353. {deltacat-2.0.0b7/deltacat/utils/ray_utils → deltacat-2.0.0b10/deltacat/tests/storage/rivulet/schema}/__init__.py +0 -0
  354. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/schema/test_schema.py +0 -0
  355. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/test_sst_interval_tree.py +0 -0
  356. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/test_utils.py +0 -0
  357. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +0 -0
  358. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +0 -0
  359. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_exceptions.py +0 -0
  360. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_logs.py +0 -0
  361. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_utils/constants.py +0 -0
  362. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_utils/filesystem.py +0 -0
  363. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_utils/message_pack_utils.py +0 -0
  364. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_utils/pyarrow.py +0 -0
  365. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_utils/storage.py +0 -0
  366. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/test_utils/utils.py +0 -0
  367. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
  368. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
  369. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  370. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_daft.py +0 -0
  371. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_metrics.py +0 -0
  372. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_placement.py +0 -0
  373. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_pyarrow.py +0 -0
  374. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  375. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/tests/utils/test_resources.py +0 -0
  376. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/types/partial_download.py +0 -0
  377. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/arguments.py +0 -0
  378. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/cloudpickle.py +0 -0
  379. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/common.py +0 -0
  380. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/export.py +0 -0
  381. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/metafile_locator.py +0 -0
  382. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/metrics.py +0 -0
  383. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/numpy.py +0 -0
  384. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/pandas.py +0 -0
  385. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/performance.py +0 -0
  386. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/placement.py +0 -0
  387. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/ray_utils/collections.py +0 -0
  388. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/ray_utils/dataset.py +0 -0
  389. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/ray_utils/performance.py +0 -0
  390. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/resources.py +0 -0
  391. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/s3fs.py +0 -0
  392. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat/utils/schema.py +0 -0
  393. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat.egg-info/dependency_links.txt +0 -0
  394. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/deltacat.egg-info/top_level.txt +0 -0
  395. {deltacat-2.0.0b7 → deltacat-2.0.0b10}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 2.0.0b7
3
+ Version: 2.0.0b10
4
4
  Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -15,6 +15,7 @@ Classifier: Operating System :: OS Independent
15
15
  Requires-Python: >=3.9
16
16
  Description-Content-Type: text/markdown
17
17
  Provides-Extra: iceberg
18
+ Provides-Extra: s3fs
18
19
  License-File: LICENSE
19
20
 
20
21
  <p align="center">
@@ -5,6 +5,7 @@ import deltacat.logs # noqa: F401
5
5
  from deltacat.api import (
6
6
  copy,
7
7
  get,
8
+ list,
8
9
  put,
9
10
  )
10
11
  from deltacat.catalog.delegate import (
@@ -30,13 +31,19 @@ from deltacat.catalog.delegate import (
30
31
  from deltacat.catalog.model.catalog import ( # noqa: F401
31
32
  Catalog,
32
33
  Catalogs,
34
+ raise_if_not_initialized,
33
35
  is_initialized,
34
36
  init,
35
37
  get_catalog,
36
38
  put_catalog,
37
39
  )
38
40
  from deltacat.catalog.model.table_definition import TableDefinition
41
+ from deltacat.compute import (
42
+ job_client,
43
+ local_job_client,
44
+ )
39
45
  from deltacat.storage import (
46
+ Dataset,
40
47
  DistributedDataset,
41
48
  Field,
42
49
  LifecycleState,
@@ -53,9 +60,16 @@ from deltacat.storage import (
53
60
  SortScheme,
54
61
  NullOrder,
55
62
  )
56
- from deltacat.storage.rivulet import Dataset, Datatype
57
- from deltacat.types.media import ContentEncoding, ContentType, TableType
63
+ from deltacat.storage.rivulet import Dataset as RivDataset, Datatype as RivDatatype
64
+ from deltacat.types.media import (
65
+ ContentEncoding,
66
+ ContentType,
67
+ DatasetType,
68
+ DatastoreType,
69
+ )
70
+
58
71
  from deltacat.types.tables import TableWriteMode
72
+ from deltacat.utils.url import DeltaCatUrl
59
73
 
60
74
  __iceberg__ = []
61
75
  if importlib.util.find_spec("pyiceberg") is not None:
@@ -67,13 +81,16 @@ if importlib.util.find_spec("pyiceberg") is not None:
67
81
 
68
82
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
69
83
 
70
- __version__ = "2.0.0b7"
84
+ __version__ = "2.0.0b10"
71
85
 
72
86
 
73
87
  __all__ = [
74
88
  "__version__",
89
+ "job_client",
90
+ "local_job_client",
75
91
  "copy",
76
92
  "get",
93
+ "list",
77
94
  "put",
78
95
  "alter_table",
79
96
  "create_table",
@@ -95,14 +112,19 @@ __all__ = [
95
112
  "read_table",
96
113
  "get_catalog",
97
114
  "put_catalog",
115
+ "raise_if_not_initialized",
98
116
  "is_initialized",
99
117
  "init",
100
118
  "Catalog",
101
119
  "ContentType",
102
120
  "ContentEncoding",
103
- "DistributedDataset",
104
121
  "Dataset",
105
- "Datatype",
122
+ "DatasetType",
123
+ "DatastoreType",
124
+ "DeltaCatUrl",
125
+ "DistributedDataset",
126
+ "RivDataset",
127
+ "RivDatatype",
106
128
  "Field",
107
129
  "LifecycleState",
108
130
  "ListResult",
@@ -118,7 +140,6 @@ __all__ = [
118
140
  "SortOrder",
119
141
  "SortScheme",
120
142
  "TableDefinition",
121
- "TableType",
122
143
  "TableWriteMode",
123
144
  ]
124
145
 
@@ -0,0 +1,523 @@
1
+ import time
2
+ from dataclasses import dataclass
3
+ from typing import Any, Union, List, Optional, Dict, Callable, Tuple
4
+
5
+ import ray
6
+ import deltacat as dc
7
+ import pyarrow.fs as pafs
8
+
9
+ from pyarrow.fs import FileType
10
+ from ray.exceptions import OutOfMemoryError
11
+
12
+ from deltacat.constants import BYTES_PER_GIBIBYTE
13
+ from deltacat.io import (
14
+ read_deltacat,
15
+ DeltacatReadType,
16
+ )
17
+ from deltacat.storage import (
18
+ Dataset,
19
+ DistributedDataset,
20
+ ListResult,
21
+ LocalTable,
22
+ Metafile,
23
+ )
24
+ from deltacat.types.media import DatasetType
25
+ from deltacat.utils.url import (
26
+ DeltaCatUrl,
27
+ DeltaCatUrlReader,
28
+ DeltaCatUrlWriter,
29
+ )
30
+ from deltacat.utils.common import ReadKwargsProvider
31
+ from deltacat.types.tables import (
32
+ get_table_size,
33
+ get_table_length,
34
+ )
35
+ from deltacat.utils.filesystem import (
36
+ resolve_path_and_filesystem,
37
+ get_file_info,
38
+ )
39
+ from deltacat.utils.performance import timed_invocation
40
+ from deltacat.utils.ray_utils.runtime import (
41
+ current_node_resources,
42
+ live_cpu_waiter,
43
+ live_node_resource_keys,
44
+ other_live_node_resource_keys,
45
+ find_max_single_node_resource_type,
46
+ )
47
+
48
+ """
49
+ # CLI Example of Copying from Source to Dest without file conversion
50
+ # (i.e., register only - shallow copy):
51
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
52
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
53
+
54
+ # CLI Example of Copying from Source to Dest without file conversion
55
+ # (i.e., copy files as-is - deep copy):
56
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table -r
57
+ # The above command will make a deep copy of all JSON files found in the source
58
+ # to the catalog data file directory in the destination.
59
+
60
+ # CLI Example of Copying from Source to Dest with file conversion
61
+ # (i.e., deep copy with file content type transformation):
62
+ $ dcat convert json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/ --type FEATHER
63
+ # The above command will read JSON files found in the source, transform them to
64
+ # Arrow Feather files, and register them in the destination.
65
+
66
+ # Python Example of Copying from Source to Dest with file conversion
67
+ # (i.e., deep copy with file content type transformation):
68
+ >>> ds = dc.get("json+s3://my_bucket/log_manager/")
69
+ >>> dc.put("dc://my_deltacat_catalog/log_manager/", dataset=ds, type=ContentType.FEATHER)
70
+ # Or, equivalently, we can do the write directly from the dataset:
71
+ >>> ds.write_deltacat("dc://my_deltacat_catalog/log_manager/", type=ContentType.FEATHER)
72
+ """
73
+
74
+
75
def _copy_dc(
    source: DeltaCatUrl,
    destination: DeltaCatUrl,
    recursive: bool = False,
) -> Metafile:
    """
    Copies one or more DeltaCAT catalog objects from source to destination.

    Args:
        source: DeltaCAT URL of the object(s) to copy. A URL ending in "/*"
            is listed non-recursively instead of being read as a single
            object.
        destination: DeltaCAT URL to write the source object(s) to.
        recursive: If True, the source is listed recursively and the
            resulting list is what gets written to the destination.

    Returns:
        The result of `put` for the destination URL.

    Raises:
        ValueError: If the source and destination URLs do not contain the
            same number of non-empty "/"-separated parts.
    """
    # Fail fast: verify initialization and URL compatibility before
    # performing any reads against the source.
    dc.raise_if_not_initialized()
    src_parts = [part for part in source.url.split("/") if part]
    dst_parts = [part for part in destination.url.split("/") if part]
    if len(src_parts) != len(dst_parts):
        # TODO(pdames): Better error message.
        raise ValueError(
            f"Cannot copy {source} to {destination}. "
            f"Source and destination must share the same type."
        )

    # NOTE: `list` below is this module's `list` function, not the builtin.
    if recursive:
        src_obj = list(source, recursive=True)
    elif source.url.endswith("/*"):
        src_obj = list(source)
    else:
        src_obj = get(source)

    # TODO(pdames): Add writer with support for Ray Dataset DeltaCAT Sink &
    #  Recursive DeltaCAT source object copies. Ideally, the Ray Dataset read
    #  is lazy, and only indexes metadata about the objects at source instead
    #  of eagerly converting them to PyArrow-based Blocks.
    return put(destination, metafile=src_obj)
105
+
106
+
107
def copy(
    src: DeltaCatUrl,
    dst: DeltaCatUrl,
    *,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    extension_to_memory_multiplier: Dict[str, float] = {
        "pq": 5,
        "parquet": 5,
        "feather": 1.5,
        "arrow": 1.5,
        "csv": 1.5,
        "tsv": 1.5,
        "psv": 1.5,
        "txt": 1.5,
        "json": 1.5,
        "jsonl": 1.5,
        "gz": 35,
        "bz2": 35,
        "zip": 35,
        "7z": 35,
        "*": 2.5,
    },
    minimum_worker_cpus: int = 0,
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
    filesystem: Optional[pafs.FileSystem] = None,
) -> Union[Metafile, str]:
    """
    Copies data from the source datastore to the destination datastore.

    If either URL is a DeltaCAT catalog URL, the copy is delegated to the
    catalog copy path, which only receives `src`, `dst`, and a recursive flag
    (set when the source URL ends with "/**"); the remaining keyword
    arguments are not forwarded on that path. Otherwise the copy is delegated
    to the external Ray-based copy path with all keyword arguments forwarded.

    Args:
        src: DeltaCAT URL of the source datastore to read.
        dst: DeltaCAT URL of the destination datastore to write.
        transforms: Transforms to apply, in the order given, to the source
            dataset before it is written to the destination. Each transform
            takes the in-memory dataset read (e.g., Polars DataFrame) and the
            source DeltaCAT URL, and returns the same dataset type.
        extension_to_memory_multiplier: Maps file extensions to in-memory
            inflation estimates (i.e., the memory needed to read a source
            file, apply transforms, and write it back out, relative to the
            file size). The "*" key is the fallback for other extensions.
        minimum_worker_cpus: Minimum number of Ray worker CPUs to wait for
            before starting distributed execution. Useful when the operation
            is known to suffer from resource starvation (e.g., out-of-memory
            errors) if started before enough worker nodes have launched.
        reader_args: Additional keyword arguments forwarded to the reader for
            the in-memory dataset and datastore type being read
            (e.g., polars.read_csv(**kwargs)).
        writer_args: Additional keyword arguments forwarded to the writer for
            the in-memory dataset type read and datastore type being written
            (e.g., polars.DataFrame.write_parquet(**kwargs)).
        filesystem: Optional PyArrow filesystem to use for file IO. Resolved
            automatically from the input path if not specified. Consider
            providing a pre-configured filesystem (credentials, retry
            strategy, etc.) if you hit latency issues or read/write errors.

    Returns:
        Whatever the selected copy path returns: the catalog copy result for
        DeltaCAT catalog URLs, or the destination URL string for external
        copies.
    """
    involves_catalog = src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url()
    if involves_catalog:
        # A trailing "/**" on the source URL requests a recursive copy.
        is_recursive = src.url.endswith("/**")
        return _copy_dc(src, dst, recursive=is_recursive)

    external_copy_kwargs = {
        "transforms": transforms,
        "extension_to_memory_multiplier": extension_to_memory_multiplier,
        "minimum_worker_cpus": minimum_worker_cpus,
        "reader_args": reader_args,
        "writer_args": writer_args,
        "filesystem": filesystem,
    }
    return _copy_external_ray(src, dst, **external_copy_kwargs)
191
+
192
+
193
def concat(source, destination):
    """
    Placeholder for a future concat operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
195
+
196
+
197
def delete(source):
    """
    Placeholder for a future delete operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
199
+
200
+
201
def move(source, destination):
    """
    Placeholder for a future move operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
203
+
204
+
205
def _list_all_metafiles(
    url: DeltaCatUrl,
    recursive: bool = False,
    **kwargs,
) -> List[Metafile]:
    """
    Lists the metafiles found at a DeltaCAT URL.

    Args:
        url: DeltaCAT URL to list.
        recursive: If True, every lister exposed by the URL reader after the
            first is also invoked once per metafile found at the level above
            it, so that children of ALL parents are listed at each level.
        **kwargs: Keyword arguments forwarded to every lister.

    Returns:
        A flat list of metafiles ordered by level: the top-level listing
        first, followed by each deeper level in turn.
    """
    reader = DeltaCatUrlReader(url)
    # the top-level lister doesn't have any missing keyword args
    top_lister = reader.listers.pop(0)[0]
    top_level: ListResult[Metafile] = top_lister(**kwargs)
    list_results: List[ListResult[Metafile]] = [top_level]
    if recursive:
        # Listings produced at the previous level; every item in them is a
        # parent for the next lister. Tracking the whole level (rather than
        # only the most recent listing) ensures that children of every parent
        # are descended into, not just those of the last parent listed.
        parent_results: List[ListResult[Metafile]] = [top_level]
        for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
            child_results: List[ListResult[Metafile]] = []
            for parent_result in parent_results:
                for metafile in parent_result.all_items():
                    # each subsequent lister needs to inject missing keyword
                    # args from the parent metafile
                    # NOTE(review): only the immediate parent's kwarg is
                    # injected; confirm deeper listers don't also need kwargs
                    # resolved from more distant ancestors.
                    kwargs_update = (
                        {kwarg_name: kwarg_val_resolver_fn(metafile)}
                        if kwarg_name and kwarg_val_resolver_fn
                        else {}
                    )
                    child_results.append(lister(**{**kwargs, **kwargs_update}))
            list_results.extend(child_results)
            parent_results = child_results
    return [
        metafile for list_result in list_results for metafile in list_result.all_items()
    ]
234
+
235
+
236
class CustomReadKwargsProvider(ReadKwargsProvider):
    """
    Read keyword argument provider that merges a fixed set of keyword
    arguments into the reader kwargs of one specific datasource type.
    """

    def __init__(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ):
        """
        Args:
            datasource_type: The only datasource type whose reader kwargs
                this provider modifies.
            kwargs: Keyword arguments to merge in for that datasource type.
        """
        self._kwargs = kwargs
        self._datasource_type = datasource_type

    def _get_kwargs(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Returns `kwargs`, updated in place with this provider's keyword
        arguments when `datasource_type` matches the configured type.
        """
        if datasource_type != self._datasource_type:
            # Not the datasource type we were configured for: pass through.
            return kwargs
        kwargs.update(self._kwargs)
        return kwargs
253
+
254
+
255
def list(
    url: DeltaCatUrl,
    *,
    recursive: bool = False,
    dataset_type: Optional[DatasetType] = None,
    **kwargs,
) -> Union[List[Metafile], LocalTable, DistributedDataset]:
    """
    Lists the DeltaCAT catalog objects found at the given URL.

    Args:
        url: DeltaCAT catalog URL to list.
        recursive: If True, also list the children of each object found.
        dataset_type: Dataset type to return the listing as. A distributed
            dataset type returns a distributed dataset (only Ray Datasets are
            supported); any other value, including None, returns a local list
            of metafiles.
        **kwargs: Additional keyword arguments forwarded to the underlying
            listers.

    Returns:
        A local list of metafiles, or a distributed dataset of the listing.

    Raises:
        NotImplementedError: If the URL is not a DeltaCAT catalog URL, or if
            a distributed dataset type other than Ray Dataset is requested.
    """
    if not url.is_deltacat_catalog_url():
        raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")

    if dataset_type not in DatasetType.distributed():
        # return a local list of metafiles
        # TODO(pdames): Cast the list to the appropriate local dataset type.
        return _list_all_metafiles(url=url, recursive=recursive, **kwargs)

    if dataset_type != DatasetType.RAY_DATASET:
        raise NotImplementedError(
            f"Unsupported dataset type: {dataset_type.name}. "
            f"Supported Dataset Types: {DatasetType.RAY_DATASET.name}",
        )

    if recursive:
        read_type = DeltacatReadType.METADATA_LIST_RECURSIVE
    else:
        read_type = DeltacatReadType.METADATA_LIST
    kwargs_provider = CustomReadKwargsProvider(
        datasource_type=url.datastore_type,
        kwargs=kwargs,
    )
    return read_deltacat(
        [url],
        deltacat_read_type=read_type,
        timestamp_as_of=None,
        merge_on_read=False,
        read_kwargs_provider=kwargs_provider,
    )
294
+
295
+
296
def get(
    url,
    *args,
    **kwargs,
) -> Union[Metafile, Dataset]:
    """
    Reads the object at the given DeltaCAT URL.

    Args:
        url: DeltaCAT URL to read.
        *args: Positional arguments forwarded to the URL reader's `read`.
        **kwargs: Keyword arguments forwarded to the URL reader's `read`.

    Returns:
        Whatever the URL reader's `read` returns for this URL.
    """
    return DeltaCatUrlReader(url).read(*args, **kwargs)
303
+
304
+
305
def put(
    url: DeltaCatUrl,
    metafile: Optional[Metafile] = None,
    *args,
    **kwargs,
) -> Union[Metafile, str]:
    """
    Writes to the given DeltaCAT URL.

    Args:
        url: DeltaCAT URL to write to.
        metafile: Optional metafile handed to the URL writer.
        *args: Positional arguments forwarded to the URL writer's `write`.
        **kwargs: Keyword arguments forwarded to the URL writer's `write`.

    Returns:
        Whatever the URL writer's `write` returns for this URL.
    """
    # NOTE(review): `copy_task` passes a dataset type as the second positional
    # argument of DeltaCatUrlWriter, whereas a metafile is passed here.
    # Confirm against the DeltaCatUrlWriter signature that this is intended.
    return DeltaCatUrlWriter(url, metafile).write(*args, **kwargs)
313
+
314
+
315
def touch(path):
    """
    Placeholder for a future touch operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
317
+
318
+
319
def exists(path):
    """
    Placeholder for a future existence check.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
321
+
322
+
323
def query(expression):
    """
    Placeholder for a future query operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
325
+
326
+
327
def tail(path):
    """
    Placeholder for a future tail operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
329
+
330
+
331
def head(path):
    """
    Placeholder for a future head operation.

    Raises:
        NotImplementedError: Always; this operation is not implemented yet.
    """
    raise NotImplementedError()
333
+
334
+
335
def _copy_external_ray(
    src: DeltaCatUrl,
    dst: DeltaCatUrl,
    *,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    extension_to_memory_multiplier: Dict[str, float] = {
        "pq": 5,
        "parquet": 5,
        "feather": 1.5,
        "arrow": 1.5,
        "csv": 1.5,
        "tsv": 1.5,
        "psv": 1.5,
        "txt": 1.5,
        "json": 1.5,
        "jsonl": 1.5,
        "gz": 35,
        "bz2": 35,
        "zip": 35,
        "7z": 35,
        "*": 2.5,
    },
    minimum_worker_cpus: int = 0,
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
    filesystem: Optional[pafs.FileSystem] = None,
) -> str:
    """
    Copies a single external source file to the destination via a Ray task.

    Optionally waits for a minimum number of worker CPUs, estimates the
    memory needed for the copy from the source file's size and extension,
    then launches `copy_task` with that memory reservation. If the task runs
    out of memory it is retried with one more dedicated CPU, up to the lesser
    of a hard-coded limit and the largest single-node CPU count.

    Args:
        src: DeltaCAT URL of the source file to read.
        dst: DeltaCAT URL of the destination to write.
        transforms: Transforms forwarded to the copy task.
        extension_to_memory_multiplier: Maps file extensions to the multiple
            of the source file size to reserve as task memory. The "*" key is
            the fallback for unlisted extensions. If empty, or if neither the
            extension nor "*" is present, no memory is reserved.
        minimum_worker_cpus: Number of worker CPUs (beyond the current node's
            CPUs) to wait for before starting. No wait if not positive.
        reader_args: Keyword arguments forwarded to the copy task's reader.
        writer_args: Keyword arguments forwarded to the copy task's writer.
        filesystem: Optional PyArrow filesystem used to look up source file
            info. Resolved from the source path if not given.

    Returns:
        The destination URL string.

    Raises:
        ValueError: If `src` is not a `DeltaCatUrl`, or does not point to a
            file.
        OutOfMemoryError: If the copy task still runs out of memory once the
            CPU retry limit has been reached.
    """
    print(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")

    if not isinstance(src, DeltaCatUrl):
        raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")

    # wait for required resources
    head_cpu_count = int(current_node_resources()["CPU"])
    if minimum_worker_cpus > 0:
        print(f"Waiting for {minimum_worker_cpus} worker CPUs...")
        live_cpu_waiter(
            min_live_cpus=minimum_worker_cpus + head_cpu_count,
        )
        print(f"{minimum_worker_cpus} worker CPUs found!")
    # start job execution
    cluster_resources = ray.cluster_resources()
    print(f"Cluster Resources: {cluster_resources}")
    print(f"Available Cluster Resources: {ray.available_resources()}")
    cluster_cpus = int(cluster_resources["CPU"])
    print(f"Cluster CPUs: {cluster_cpus}")
    all_node_resource_keys = live_node_resource_keys()
    print(f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}")
    worker_node_resource_keys = other_live_node_resource_keys()
    print(
        f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
    )
    worker_cpu_count = cluster_cpus - head_cpu_count
    print(f"Total worker CPUs: {worker_cpu_count}")

    # estimate memory requirements based on file extension
    estimated_memory_bytes = 0
    if extension_to_memory_multiplier:
        print(f"Resolving stats collection filesystem for: {src.url_path}.")
        path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
        if isinstance(filesystem, pafs.GcsFileSystem):
            from datetime import timedelta

            # Configure a retry time limit for GcsFileSystem so that it
            # doesn't hang forever trying to get file info (e.g., when
            # trying to get a public file w/o anonymous=True).
            filesystem = pafs.GcsFileSystem(
                anonymous=True,
                retry_time_limit=timedelta(seconds=10),
            )
        print(f"Using filesystem {type(filesystem)} to get file size of: {path}")
        file_info = get_file_info(path, filesystem)
        if file_info.type != FileType.File:
            raise ValueError(
                f"Expected `src` to be a file but got `{file_info.type}` at "
                f"`{src.url_path}`."
            )
        inflation_multiplier = extension_to_memory_multiplier.get(file_info.extension)
        if inflation_multiplier is None:
            inflation_multiplier = extension_to_memory_multiplier.get("*")
        if inflation_multiplier is None:
            # Neither the file's extension nor the "*" fallback is present:
            # behave as if no multipliers were given (no memory reserved)
            # instead of failing on `None * size`.
            print(
                f"No memory multiplier found for extension "
                f"`{file_info.extension}`; skipping memory reservation."
            )
        else:
            estimated_memory_bytes = inflation_multiplier * file_info.size
        print(
            f"Estimated Memory Required for Copy: "
            f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
        )
    print(f"Starting DeltaCAT Copy at: {time.time_ns()}")

    index_result = None
    num_cpus = 1
    # TODO(pdames): remove hard-coding - issues encountered when going greater
    #  than 2 include verifying that the scope of schedulable nodes doesn't
    #  result in all large files lining up for the one large node in the cluster
    #  that can actually handle them (which is worse if it's also the head node)
    max_allowed_cpus = 2
    while not index_result:
        copy_task_pending, launch_latency = timed_invocation(
            copy_task.options(num_cpus=num_cpus, memory=estimated_memory_bytes).remote,
            src=src,
            dest=dst,
            dataset_type=DatasetType.POLARS,
            transforms=transforms,
            reader_args=reader_args,
            writer_args=writer_args,
        )
        print(f"Time to Launch Copy Task: {launch_latency} seconds")
        try:
            index_result, copy_latency = timed_invocation(
                ray.get,
                copy_task_pending,
            )
        except OutOfMemoryError as e:
            print(f"Copy Task Ran Out of Memory: {e}")
            max_single_node_cpus = min(
                max_allowed_cpus, find_max_single_node_resource_type("CPU")
            )
            num_cpus += 1
            if num_cpus > max_single_node_cpus:
                raise
            print(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")

    # The launch latency was already reported inside the loop; only the
    # completion latency (time spent waiting on the task) is reported here.
    print(f"Time to Complete Copy Task: {copy_latency} seconds")

    total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE

    print(f"Records Copied: {index_result.table_length}")
    print(f"Bytes Copied: {total_gib_indexed} GiB")
    print(f"Conversion Rate: {total_gib_indexed/copy_latency} GiB/s")
    print(f"Finished Copy at: {time.time_ns()}")

    return dst.url
466
+
467
+
468
@ray.remote(scheduling_strategy="SPREAD")
def copy_task(
    src: DeltaCatUrl,
    dest: DeltaCatUrl,
    dataset_type: DatasetType,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    reader_args: Dict[str, Any] = {},
    writer_args: Dict[str, Any] = {},
) -> "CopyResult":
    """
    Indexes a DeltaCAT source URL into a DeltaCAT destination URL.

    Reads the source into an in-memory table (applying any transforms),
    writes that table to the destination, and reports what was copied.

    Args:
        src: DeltaCAT URL of the source to read.
        dest: DeltaCAT URL of the destination to write.
        dataset_type: In-memory dataset type to read the source as.
        transforms: Transforms applied, in order, to the table after reading.
        reader_args: Keyword arguments forwarded to the reader.
        writer_args: Keyword arguments forwarded to the writer.

    Returns:
        A CopyResult holding the size and record count of the table read.
    """
    table, read_latency = timed_invocation(
        read_table,
        src=src,
        dataset_type=dataset_type,
        transforms=transforms,
        reader_args=reader_args,
    )
    print(f"Time to read {src.url_path}: {read_latency} seconds")

    table_size = get_table_size(table)
    print(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")

    table_length = get_table_length(table)
    print(f"Table Records: {table_length}")

    # NOTE(review): `put` passes a metafile as the second positional argument
    # of DeltaCatUrlWriter, whereas a dataset type is passed here. Confirm
    # against the DeltaCatUrlWriter signature that this is intended.
    writer = DeltaCatUrlWriter(dest, dataset_type)
    written_file_path, write_latency = timed_invocation(
        writer.write,
        "",
        table,
        **writer_args,
    )
    print(f"Time to write {written_file_path}: {write_latency} seconds")

    return CopyResult(table_size, table_length)
505
+
506
+
507
def read_table(
    src: DeltaCatUrl,
    dataset_type: DatasetType,
    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
    reader_args: Dict[str, Any] = {},
) -> LocalTable:
    """
    Reads a DeltaCAT URL into a local table and applies transforms to it.

    Args:
        src: DeltaCAT URL of the source to read.
        dataset_type: In-memory dataset type to read the source as.
        transforms: Callables applied in the order given. Each receives the
            current table and the source URL, and returns the table to hand
            to the next transform.
        reader_args: Keyword arguments forwarded to the URL reader's `read`.

    Returns:
        The table produced by the last transform, or the table as read if no
        transforms were given.
    """
    result: LocalTable = DeltaCatUrlReader(src, dataset_type).read(**reader_args)
    for apply_transform in transforms:
        result = apply_transform(result, src)
    return result
518
+
519
+
520
@dataclass(frozen=True)
class CopyResult:
    """Immutable summary of what a copy task copied."""

    # Size of the copied table; callers divide this by BYTES_PER_GIBIBYTE
    # when reporting it.
    table_size: int
    # Number of records in the copied table.
    table_length: int
@@ -48,7 +48,7 @@ from deltacat.types.media import (
48
48
  )
49
49
  from deltacat.types.tables import (
50
50
  TABLE_CLASS_TO_SIZE_FUNC,
51
- TABLE_TYPE_TO_READER_FUNC,
51
+ TABLE_TYPE_TO_S3_READER_FUNC,
52
52
  TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
53
53
  DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
54
54
  get_table_length,
@@ -261,7 +261,7 @@ def read_file(
261
261
  **s3_client_kwargs,
262
262
  ) -> LocalTable:
263
263
 
264
- reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
264
+ reader = TABLE_TYPE_TO_S3_READER_FUNC[table_type.value]
265
265
  try:
266
266
  table = reader(
267
267
  s3_url,
@@ -61,7 +61,7 @@ def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
61
61
  "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
62
62
  )
63
63
 
64
- tbl = daft.table.Table.read_parquet(path, columns=columns)
64
+ tbl = daft.read_parquet(path, columns=columns)
65
65
  return tbl.to_arrow()
66
66
 
67
67