deltacat 1.1.36__tar.gz → 2.0.0b2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (382)
  1. deltacat-2.0.0b2/PKG-INFO +47 -0
  2. deltacat-2.0.0b2/README.md +26 -0
  3. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/__init__.py +42 -3
  4. deltacat-2.0.0b2/deltacat/annotations.py +36 -0
  5. deltacat-2.0.0b2/deltacat/api.py +168 -0
  6. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/aws/s3u.py +4 -4
  7. deltacat-2.0.0b2/deltacat/benchmarking/benchmark_engine.py +82 -0
  8. deltacat-2.0.0b2/deltacat/benchmarking/benchmark_report.py +86 -0
  9. deltacat-2.0.0b2/deltacat/benchmarking/benchmark_suite.py +11 -0
  10. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/benchmarking/conftest.py +21 -0
  11. deltacat-2.0.0b2/deltacat/benchmarking/data/random_row_generator.py +94 -0
  12. deltacat-2.0.0b2/deltacat/benchmarking/data/row_generator.py +10 -0
  13. deltacat-2.0.0b2/deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  14. deltacat-2.0.0b2/deltacat/catalog/__init__.py +14 -0
  15. deltacat-2.0.0b2/deltacat/catalog/delegate.py +377 -0
  16. deltacat-2.0.0b2/deltacat/catalog/iceberg/__init__.py +4 -0
  17. deltacat-2.0.0b2/deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  18. deltacat-2.0.0b2/deltacat/catalog/iceberg/impl.py +368 -0
  19. deltacat-2.0.0b2/deltacat/catalog/iceberg/overrides.py +74 -0
  20. deltacat-2.0.0b2/deltacat/catalog/interface.py +381 -0
  21. deltacat-2.0.0b2/deltacat/catalog/main/impl.py +720 -0
  22. deltacat-2.0.0b2/deltacat/catalog/model/catalog.py +290 -0
  23. deltacat-2.0.0b2/deltacat/catalog/model/properties.py +116 -0
  24. deltacat-2.0.0b2/deltacat/catalog/model/table_definition.py +61 -0
  25. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  26. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/delta_annotated.py +3 -3
  27. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  28. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  29. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/round_completion_info.py +5 -5
  30. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/table_object_store.py +3 -2
  31. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/repartition_session.py +1 -1
  32. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/steps/dedupe.py +11 -4
  33. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  34. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/steps/materialize.py +6 -2
  35. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/utils/io.py +1 -1
  36. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/compaction_session.py +5 -9
  38. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/constants.py +1 -30
  39. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  41. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  42. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/steps/merge.py +17 -126
  43. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  44. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  45. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/io.py +1 -1
  46. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/merge.py +0 -1
  47. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  48. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  49. deltacat-2.0.0b2/deltacat/compute/converter/constants.py +4 -0
  50. deltacat-2.0.0b2/deltacat/compute/converter/converter_session.py +143 -0
  51. deltacat-2.0.0b2/deltacat/compute/converter/model/convert_input.py +69 -0
  52. deltacat-2.0.0b2/deltacat/compute/converter/model/convert_input_files.py +61 -0
  53. deltacat-2.0.0b2/deltacat/compute/converter/model/converter_session_params.py +99 -0
  54. deltacat-2.0.0b2/deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  55. deltacat-2.0.0b2/deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  56. deltacat-2.0.0b2/deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  57. deltacat-2.0.0b2/deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat-2.0.0b2/deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat-2.0.0b2/deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat-2.0.0b2/deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat-2.0.0b2/deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat-2.0.0b2/deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat-2.0.0b2/deltacat/compute/converter/utils/s3u.py +133 -0
  64. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/resource_estimation/delta.py +1 -19
  65. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/constants.py +47 -1
  66. deltacat-2.0.0b2/deltacat/env.py +51 -0
  67. deltacat-2.0.0b2/deltacat/examples/basic_logging.py +101 -0
  68. deltacat-2.0.0b2/deltacat/examples/common/fixtures.py +15 -0
  69. deltacat-2.0.0b2/deltacat/examples/hello_world.py +27 -0
  70. deltacat-2.0.0b2/deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  71. deltacat-2.0.0b2/deltacat/examples/iceberg/iceberg_reader.py +149 -0
  72. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/exceptions.py +51 -9
  73. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/logs.py +4 -1
  74. deltacat-2.0.0b2/deltacat/storage/__init__.py +173 -0
  75. deltacat-2.0.0b2/deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  76. deltacat-2.0.0b2/deltacat/storage/iceberg/impl.py +737 -0
  77. deltacat-2.0.0b2/deltacat/storage/iceberg/model.py +709 -0
  78. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/storage/interface.py +217 -134
  79. deltacat-2.0.0b2/deltacat/storage/main/impl.py +2077 -0
  80. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/storage/model/delta.py +118 -71
  81. deltacat-2.0.0b2/deltacat/storage/model/interop.py +24 -0
  82. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/storage/model/list_result.py +8 -0
  83. deltacat-2.0.0b2/deltacat/storage/model/locator.py +125 -0
  84. {deltacat-1.1.36/deltacat/aws/redshift → deltacat-2.0.0b2/deltacat/storage}/model/manifest.py +122 -98
  85. deltacat-2.0.0b2/deltacat/storage/model/metafile.py +1316 -0
  86. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/storage/model/namespace.py +34 -18
  87. deltacat-2.0.0b2/deltacat/storage/model/partition.py +660 -0
  88. deltacat-2.0.0b2/deltacat/storage/model/scan/push_down.py +19 -0
  89. deltacat-2.0.0b2/deltacat/storage/model/scan/scan_plan.py +10 -0
  90. deltacat-2.0.0b2/deltacat/storage/model/scan/scan_task.py +34 -0
  91. deltacat-2.0.0b2/deltacat/storage/model/schema.py +892 -0
  92. deltacat-2.0.0b2/deltacat/storage/model/shard.py +47 -0
  93. deltacat-2.0.0b2/deltacat/storage/model/sort_key.py +190 -0
  94. deltacat-2.0.0b2/deltacat/storage/model/stream.py +399 -0
  95. deltacat-2.0.0b2/deltacat/storage/model/table.py +225 -0
  96. deltacat-2.0.0b2/deltacat/storage/model/table_version.py +495 -0
  97. deltacat-2.0.0b2/deltacat/storage/model/transaction.py +757 -0
  98. deltacat-2.0.0b2/deltacat/storage/model/transform.py +264 -0
  99. deltacat-2.0.0b2/deltacat/storage/model/types.py +154 -0
  100. deltacat-2.0.0b2/deltacat/storage/rivulet/__init__.py +11 -0
  101. deltacat-2.0.0b2/deltacat/storage/rivulet/arrow/serializer.py +75 -0
  102. deltacat-2.0.0b2/deltacat/storage/rivulet/dataset.py +744 -0
  103. deltacat-2.0.0b2/deltacat/storage/rivulet/dataset_executor.py +87 -0
  104. deltacat-2.0.0b2/deltacat/storage/rivulet/feather/__init__.py +5 -0
  105. deltacat-2.0.0b2/deltacat/storage/rivulet/feather/file_reader.py +136 -0
  106. deltacat-2.0.0b2/deltacat/storage/rivulet/feather/serializer.py +35 -0
  107. deltacat-2.0.0b2/deltacat/storage/rivulet/fs/file_provider.py +105 -0
  108. deltacat-2.0.0b2/deltacat/storage/rivulet/fs/file_store.py +130 -0
  109. deltacat-2.0.0b2/deltacat/storage/rivulet/fs/input_file.py +76 -0
  110. deltacat-2.0.0b2/deltacat/storage/rivulet/fs/output_file.py +86 -0
  111. deltacat-2.0.0b2/deltacat/storage/rivulet/logical_plan.py +105 -0
  112. deltacat-2.0.0b2/deltacat/storage/rivulet/metastore/delta.py +190 -0
  113. deltacat-2.0.0b2/deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  114. deltacat-2.0.0b2/deltacat/storage/rivulet/metastore/sst.py +82 -0
  115. deltacat-2.0.0b2/deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  116. deltacat-2.0.0b2/deltacat/storage/rivulet/mvp/Table.py +101 -0
  117. deltacat-2.0.0b2/deltacat/storage/rivulet/mvp/__init__.py +5 -0
  118. deltacat-2.0.0b2/deltacat/storage/rivulet/parquet/__init__.py +5 -0
  119. deltacat-2.0.0b2/deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  120. deltacat-2.0.0b2/deltacat/storage/rivulet/parquet/serializer.py +37 -0
  121. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  122. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/data_reader.py +136 -0
  123. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/data_scan.py +63 -0
  124. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  125. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  126. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  127. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/query_expression.py +99 -0
  128. deltacat-2.0.0b2/deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  129. deltacat-2.0.0b2/deltacat/storage/rivulet/schema/datatype.py +128 -0
  130. deltacat-2.0.0b2/deltacat/storage/rivulet/schema/schema.py +251 -0
  131. deltacat-2.0.0b2/deltacat/storage/rivulet/serializer.py +40 -0
  132. deltacat-2.0.0b2/deltacat/storage/rivulet/serializer_factory.py +42 -0
  133. deltacat-2.0.0b2/deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  134. deltacat-2.0.0b2/deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  135. deltacat-2.0.0b2/deltacat/storage/util/scan_planner.py +26 -0
  136. deltacat-2.0.0b2/deltacat/tests/_io/__init__.py +1 -0
  137. deltacat-2.0.0b2/deltacat/tests/catalog/__init__.py +0 -0
  138. deltacat-2.0.0b2/deltacat/tests/catalog/test_catalogs.py +324 -0
  139. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  140. deltacat-2.0.0b2/deltacat/tests/compute/__init__.py +0 -0
  141. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  142. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  143. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  144. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  145. deltacat-2.0.0b2/deltacat/tests/compute/compactor/__init__.py +0 -0
  146. deltacat-2.0.0b2/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  147. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  148. deltacat-2.0.0b2/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  149. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  150. deltacat-2.0.0b2/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  151. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  152. deltacat-2.0.0b2/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
  153. deltacat-2.0.0b2/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +33 -0
  154. deltacat-2.0.0b2/deltacat/tests/compute/conftest.py +75 -0
  155. deltacat-2.0.0b2/deltacat/tests/compute/converter/__init__.py +0 -0
  156. deltacat-2.0.0b2/deltacat/tests/compute/converter/conftest.py +80 -0
  157. deltacat-2.0.0b2/deltacat/tests/compute/converter/test_convert_session.py +478 -0
  158. deltacat-2.0.0b2/deltacat/tests/compute/converter/utils.py +123 -0
  159. deltacat-2.0.0b2/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
  160. deltacat-2.0.0b2/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
  161. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  162. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  163. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  164. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_compact_partition_params.py +3 -3
  165. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  166. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  167. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_util_common.py +19 -12
  168. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  169. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  170. deltacat-2.0.0b2/deltacat/tests/storage/__init__.py +0 -0
  171. deltacat-2.0.0b2/deltacat/tests/storage/conftest.py +25 -0
  172. deltacat-2.0.0b2/deltacat/tests/storage/main/__init__.py +0 -0
  173. deltacat-2.0.0b2/deltacat/tests/storage/main/test_main_storage.py +1399 -0
  174. deltacat-2.0.0b2/deltacat/tests/storage/model/__init__.py +0 -0
  175. deltacat-2.0.0b2/deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  176. deltacat-2.0.0b2/deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  177. deltacat-2.0.0b2/deltacat/tests/storage/model/test_schema.py +308 -0
  178. deltacat-2.0.0b2/deltacat/tests/storage/model/test_shard.py +22 -0
  179. deltacat-2.0.0b2/deltacat/tests/storage/model/test_table_version.py +110 -0
  180. deltacat-2.0.0b2/deltacat/tests/storage/model/test_transaction.py +308 -0
  181. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/__init__.py +0 -0
  182. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/conftest.py +149 -0
  183. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  184. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  185. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  186. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  187. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  188. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  189. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  190. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/test_utils.py +122 -0
  191. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  192. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  193. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  194. deltacat-2.0.0b2/deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  195. deltacat-2.0.0b2/deltacat/tests/test_deltacat_api.py +39 -0
  196. deltacat-2.0.0b2/deltacat/tests/test_utils/__init__.py +0 -0
  197. deltacat-2.0.0b2/deltacat/tests/test_utils/filesystem.py +14 -0
  198. deltacat-2.0.0b2/deltacat/tests/test_utils/message_pack_utils.py +54 -0
  199. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/test_utils/pyarrow.py +8 -15
  200. deltacat-2.0.0b2/deltacat/tests/test_utils/storage.py +297 -0
  201. deltacat-2.0.0b2/deltacat/tests/utils/__init__.py +0 -0
  202. deltacat-2.0.0b2/deltacat/tests/utils/data/__init__.py +0 -0
  203. deltacat-2.0.0b2/deltacat/tests/utils/ray_utils/__init__.py +0 -0
  204. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_daft.py +3 -3
  205. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat-2.0.0b2/deltacat/types/__init__.py +0 -0
  207. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/types/partial_download.py +1 -1
  208. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/types/tables.py +1 -1
  209. deltacat-2.0.0b2/deltacat/utils/__init__.py +0 -0
  210. deltacat-2.0.0b2/deltacat/utils/export.py +59 -0
  211. deltacat-2.0.0b2/deltacat/utils/filesystem.py +320 -0
  212. deltacat-2.0.0b2/deltacat/utils/metafile_locator.py +73 -0
  213. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2/deltacat/utils/ray_utils/__init__.py +0 -0
  215. deltacat-2.0.0b2/deltacat.egg-info/PKG-INFO +47 -0
  216. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat.egg-info/SOURCES.txt +149 -19
  217. deltacat-2.0.0b2/deltacat.egg-info/requires.txt +20 -0
  218. {deltacat-1.1.36 → deltacat-2.0.0b2}/setup.py +18 -7
  219. deltacat-1.1.36/PKG-INFO +0 -50
  220. deltacat-1.1.36/README.md +0 -30
  221. deltacat-1.1.36/deltacat/aws/redshift/__init__.py +0 -19
  222. deltacat-1.1.36/deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  223. deltacat-1.1.36/deltacat/catalog/delegate.py +0 -284
  224. deltacat-1.1.36/deltacat/catalog/interface.py +0 -184
  225. deltacat-1.1.36/deltacat/catalog/model/catalog.py +0 -83
  226. deltacat-1.1.36/deltacat/catalog/model/table_definition.py +0 -30
  227. deltacat-1.1.36/deltacat/io/dataset.py +0 -73
  228. deltacat-1.1.36/deltacat/io/read_api.py +0 -143
  229. deltacat-1.1.36/deltacat/storage/__init__.py +0 -83
  230. deltacat-1.1.36/deltacat/storage/model/delete_parameters.py +0 -40
  231. deltacat-1.1.36/deltacat/storage/model/locator.py +0 -35
  232. deltacat-1.1.36/deltacat/storage/model/partition.py +0 -335
  233. deltacat-1.1.36/deltacat/storage/model/partition_spec.py +0 -71
  234. deltacat-1.1.36/deltacat/storage/model/sort_key.py +0 -33
  235. deltacat-1.1.36/deltacat/storage/model/stream.py +0 -271
  236. deltacat-1.1.36/deltacat/storage/model/table.py +0 -131
  237. deltacat-1.1.36/deltacat/storage/model/table_version.py +0 -219
  238. deltacat-1.1.36/deltacat/storage/model/transform.py +0 -127
  239. deltacat-1.1.36/deltacat/storage/model/types.py +0 -56
  240. deltacat-1.1.36/deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  241. deltacat-1.1.36/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  242. deltacat-1.1.36/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -305
  243. deltacat-1.1.36/deltacat/tests/test_utils/storage.py +0 -34
  244. deltacat-1.1.36/deltacat.egg-info/PKG-INFO +0 -50
  245. deltacat-1.1.36/deltacat.egg-info/requires.txt +0 -14
  246. {deltacat-1.1.36 → deltacat-2.0.0b2}/LICENSE +0 -0
  247. {deltacat-1.1.36 → deltacat-2.0.0b2}/MANIFEST.in +0 -0
  248. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/aws/__init__.py +0 -0
  249. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/aws/clients.py +0 -0
  250. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/aws/constants.py +0 -0
  251. {deltacat-1.1.36/deltacat/aws/redshift/model → deltacat-2.0.0b2/deltacat/benchmarking}/__init__.py +0 -0
  252. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  253. {deltacat-1.1.36/deltacat/benchmarking → deltacat-2.0.0b2/deltacat/benchmarking/data}/__init__.py +0 -0
  254. {deltacat-1.1.36/deltacat/catalog → deltacat-2.0.0b2/deltacat/catalog/main}/__init__.py +0 -0
  255. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/catalog/model/__init__.py +0 -0
  256. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/__init__.py +0 -0
  257. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/__init__.py +0 -0
  258. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/compaction_session.py +0 -0
  259. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/__init__.py +0 -0
  260. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  261. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  262. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  263. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  264. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  265. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  266. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  267. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  268. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/steps/__init__.py +0 -0
  269. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/steps/repartition.py +0 -0
  270. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/utils/__init__.py +0 -0
  271. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  272. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  273. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  274. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/__init__.py +0 -0
  275. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  276. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
  277. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
  278. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
  279. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
  280. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
  281. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
  282. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  283. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  284. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
  285. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  286. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
  287. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  288. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
  289. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  290. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
  291. {deltacat-1.1.36/deltacat/compute/merge_on_read/model → deltacat-2.0.0b2/deltacat/compute/converter}/__init__.py +0 -0
  292. {deltacat-1.1.36/deltacat/compute/merge_on_read/utils → deltacat-2.0.0b2/deltacat/compute/converter/model}/__init__.py +0 -0
  293. {deltacat-1.1.36/deltacat/compute/stats → deltacat-2.0.0b2/deltacat/compute/converter/pyiceberg}/__init__.py +0 -0
  294. {deltacat-1.1.36/deltacat/compute/stats/models → deltacat-2.0.0b2/deltacat/compute/converter/steps}/__init__.py +0 -0
  295. {deltacat-1.1.36/deltacat/io → deltacat-2.0.0b2/deltacat/compute/converter/utils}/__init__.py +0 -0
  296. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/merge_on_read/__init__.py +0 -0
  297. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/merge_on_read/daft.py +0 -0
  298. {deltacat-1.1.36/deltacat/io/aws → deltacat-2.0.0b2/deltacat/compute/merge_on_read/model}/__init__.py +0 -0
  299. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
  300. {deltacat-1.1.36/deltacat/io/aws/redshift → deltacat-2.0.0b2/deltacat/compute/merge_on_read/utils}/__init__.py +0 -0
  301. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
  302. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/resource_estimation/__init__.py +0 -0
  303. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/resource_estimation/manifest.py +0 -0
  304. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/resource_estimation/model.py +0 -0
  305. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/resource_estimation/parquet.py +0 -0
  306. {deltacat-1.1.36/deltacat/storage/model → deltacat-2.0.0b2/deltacat/compute/stats}/__init__.py +0 -0
  307. {deltacat-1.1.36/deltacat/tests → deltacat-2.0.0b2/deltacat/compute/stats/models}/__init__.py +0 -0
  308. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  309. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/stats/models/delta_stats.py +0 -0
  310. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  311. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  312. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/stats/models/stats_result.py +0 -0
  313. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/compute/stats/types.py +0 -0
  314. {deltacat-1.1.36/deltacat/tests/aws → deltacat-2.0.0b2/deltacat/examples}/__init__.py +0 -0
  315. {deltacat-1.1.36/deltacat/tests/catalog → deltacat-2.0.0b2/deltacat/examples/common}/__init__.py +0 -0
  316. {deltacat-1.1.36/deltacat/tests/compute → deltacat-2.0.0b2/deltacat/examples/iceberg}/__init__.py +0 -0
  317. {deltacat-1.1.36/deltacat/tests/compute/compactor → deltacat-2.0.0b2/deltacat/io}/__init__.py +0 -0
  318. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/io/file_object_store.py +0 -0
  319. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/io/memcached_object_store.py +0 -0
  320. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/io/object_store.py +0 -0
  321. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/io/ray_plasma_object_store.py +0 -0
  322. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/io/redis_object_store.py +0 -0
  323. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/io/s3_object_store.py +0 -0
  324. {deltacat-1.1.36/deltacat/tests/compute/compactor/steps → deltacat-2.0.0b2/deltacat/storage/iceberg}/__init__.py +0 -0
  325. {deltacat-1.1.36/deltacat/tests/compute/compactor/utils → deltacat-2.0.0b2/deltacat/storage/main}/__init__.py +0 -0
  326. {deltacat-1.1.36/deltacat/tests/compute/compactor_v2 → deltacat-2.0.0b2/deltacat/storage/model}/__init__.py +0 -0
  327. {deltacat-1.1.36/deltacat/tests/compute/compactor_v2/utils → deltacat-2.0.0b2/deltacat/storage/model/scan}/__init__.py +0 -0
  328. {deltacat-1.1.36/deltacat/tests/compute/resource_estimation → deltacat-2.0.0b2/deltacat/storage/rivulet/arrow}/__init__.py +0 -0
  329. {deltacat-1.1.36/deltacat/tests/compute/resource_estimation/data → deltacat-2.0.0b2/deltacat/storage/rivulet/fs}/__init__.py +0 -0
  330. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/storage/rivulet/metastore}/__init__.py +0 -0
  331. /deltacat-1.1.36/deltacat/tests/test_utils/__init__.py → /deltacat-2.0.0b2/deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  332. {deltacat-1.1.36/deltacat/tests/utils → deltacat-2.0.0b2/deltacat/storage/rivulet/reader}/__init__.py +0 -0
  333. {deltacat-1.1.36/deltacat/tests/utils/data → deltacat-2.0.0b2/deltacat/storage/rivulet/schema}/__init__.py +0 -0
  334. {deltacat-1.1.36/deltacat/tests/utils/ray_utils → deltacat-2.0.0b2/deltacat/storage/rivulet/writer}/__init__.py +0 -0
  335. {deltacat-1.1.36/deltacat/types → deltacat-2.0.0b2/deltacat/storage/util}/__init__.py +0 -0
  336. {deltacat-1.1.36/deltacat/utils → deltacat-2.0.0b2/deltacat/tests}/__init__.py +0 -0
  337. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/tests/_io}/test_cloudpickle_bug_fix.py +0 -0
  338. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/tests/_io}/test_file_object_store.py +0 -0
  339. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/tests/_io}/test_memcached_object_store.py +0 -0
  340. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/tests/_io}/test_ray_plasma_object_store.py +0 -0
  341. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/tests/_io}/test_redis_object_store.py +0 -0
  342. {deltacat-1.1.36/deltacat/tests/io → deltacat-2.0.0b2/deltacat/tests/_io}/test_s3_object_store.py +0 -0
  343. {deltacat-1.1.36/deltacat/utils/ray_utils → deltacat-2.0.0b2/deltacat/tests/aws}/__init__.py +0 -0
  344. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/aws/test_clients.py +0 -0
  345. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/aws/test_s3u.py +0 -0
  346. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
  347. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  348. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
  349. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/compute/test_util_constant.py +0 -0
  350. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
  351. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/test_exceptions.py +0 -0
  352. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/test_logs.py +0 -0
  353. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/test_utils/constants.py +0 -0
  354. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/test_utils/utils.py +0 -0
  355. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
  356. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
  357. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  358. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_metrics.py +0 -0
  359. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_placement.py +0 -0
  360. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  361. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/tests/utils/test_resources.py +0 -0
  362. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/types/media.py +0 -0
  363. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/arguments.py +0 -0
  364. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/cloudpickle.py +0 -0
  365. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/common.py +0 -0
  366. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/daft.py +0 -0
  367. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/metrics.py +0 -0
  368. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/numpy.py +0 -0
  369. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/pandas.py +0 -0
  370. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/performance.py +0 -0
  371. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/placement.py +0 -0
  372. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/ray_utils/collections.py +0 -0
  373. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/ray_utils/concurrency.py +0 -0
  374. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/ray_utils/dataset.py +0 -0
  375. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/ray_utils/performance.py +0 -0
  376. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/ray_utils/runtime.py +0 -0
  377. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/resources.py +0 -0
  378. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/s3fs.py +0 -0
  379. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat/utils/schema.py +0 -0
  380. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat.egg-info/dependency_links.txt +0 -0
  381. {deltacat-1.1.36 → deltacat-2.0.0b2}/deltacat.egg-info/top_level.txt +0 -0
  382. {deltacat-1.1.36 → deltacat-2.0.0b2}/setup.cfg +0 -0
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.1
2
+ Name: deltacat
3
+ Version: 2.0.0b2
4
+ Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
5
+ Home-page: https://github.com/ray-project/deltacat
6
+ Author: Ray Team
7
+ License: UNKNOWN
8
+ Platform: UNKNOWN
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Operating System :: OS Independent
15
+ Requires-Python: >=3.9
16
+ Description-Content-Type: text/markdown
17
+ Provides-Extra: iceberg
18
+ License-File: LICENSE
19
+
20
+ <p align="center">
21
+ <img src="media/deltacat-logo-alpha-750.png" alt="DeltaCAT Logo" style="width:55%; height:auto; text-align: center;">
22
+ </p>
23
+
24
+ DeltaCAT is a portable Pythonic Data Lakehouse powered by [Ray](https://github.com/ray-project/ray). It lets you define and manage
25
+ fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
26
+ data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
27
+
28
+ It uses the Ray distributed compute framework together with [Apache Arrow](https://github.com/apache/arrow) and
29
+ [Daft](https://github.com/Eventual-Inc/Daft) to efficiently scale common table management tasks, like petabyte-scale
30
+ merge-on-read and copy-on-write operations.
31
+
32
+ DeltaCAT provides four high-level components:
33
+ 1. **Catalog**: High-level APIs to create, discover, organize, share, and manage datasets.
34
+ 2. **Compute**: Distributed data management procedures to read, write, and optimize datasets.
35
+ 3. **Storage**: In-memory and on-disk multimodal dataset formats.
36
+ 4. **Sync**: Synchronize DeltaCAT datasets to data warehouses and other table formats.
37
+
38
+
39
+ ## Getting Started
40
+
41
+ DeltaCAT is rapidly evolving. Usage instructions will be posted here soon!
42
+
43
+ For now, feel free to peruse some of our examples:
44
+ * https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/rivulet
45
+ * https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/iceberg
46
+
47
+
@@ -0,0 +1,26 @@
1
+ <p align="center">
2
+ <img src="media/deltacat-logo-alpha-750.png" alt="DeltaCAT Logo" style="width:55%; height:auto; text-align: center;">
3
+ </p>
4
+
5
+ DeltaCAT is a portable Pythonic Data Lakehouse powered by [Ray](https://github.com/ray-project/ray). It lets you define and manage
6
+ fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
7
+ data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
8
+
9
+ It uses the Ray distributed compute framework together with [Apache Arrow](https://github.com/apache/arrow) and
10
+ [Daft](https://github.com/Eventual-Inc/Daft) to efficiently scale common table management tasks, like petabyte-scale
11
+ merge-on-read and copy-on-write operations.
12
+
13
+ DeltaCAT provides four high-level components:
14
+ 1. **Catalog**: High-level APIs to create, discover, organize, share, and manage datasets.
15
+ 2. **Compute**: Distributed data management procedures to read, write, and optimize datasets.
16
+ 3. **Storage**: In-memory and on-disk multimodal dataset formats.
17
+ 4. **Sync**: Synchronize DeltaCAT datasets to data warehouses and other table formats.
18
+
19
+
20
+ ## Getting Started
21
+
22
+ DeltaCAT is rapidly evolving. Usage instructions will be posted here soon!
23
+
24
+ For now, feel free to peruse some of our examples:
25
+ * https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/rivulet
26
+ * https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/iceberg
@@ -1,6 +1,12 @@
1
+ import importlib
1
2
  import logging
2
3
 
3
4
  import deltacat.logs # noqa: F401
5
+ from deltacat.api import (
6
+ copy,
7
+ get,
8
+ put,
9
+ )
4
10
  from deltacat.catalog.delegate import (
5
11
  alter_namespace,
6
12
  alter_table,
@@ -24,32 +30,51 @@ from deltacat.catalog.delegate import (
24
30
  from deltacat.catalog.model.catalog import ( # noqa: F401
25
31
  Catalog,
26
32
  Catalogs,
27
- all_catalogs,
33
+ is_initialized,
28
34
  init,
35
+ get_catalog,
36
+ put_catalog,
29
37
  )
30
38
  from deltacat.catalog.model.table_definition import TableDefinition
31
39
  from deltacat.storage import (
32
40
  DistributedDataset,
41
+ Field,
33
42
  LifecycleState,
34
43
  ListResult,
35
44
  LocalDataset,
36
45
  LocalTable,
37
46
  Namespace,
47
+ PartitionKey,
48
+ PartitionScheme,
49
+ Schema,
38
50
  SchemaConsistencyType,
39
51
  SortKey,
40
52
  SortOrder,
53
+ SortScheme,
54
+ NullOrder,
41
55
  )
56
+ from deltacat.storage.rivulet import Dataset, Datatype
42
57
  from deltacat.types.media import ContentEncoding, ContentType, TableType
43
58
  from deltacat.types.tables import TableWriteMode
44
59
 
60
+ __iceberg__ = []
61
+ if importlib.util.find_spec("pyiceberg") is not None:
62
+ from deltacat.catalog.iceberg import impl as IcebergCatalog
63
+
64
+ __iceberg__ = [
65
+ "IcebergCatalog",
66
+ ]
67
+
45
68
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
69
 
47
- __version__ = "1.1.36"
70
+ __version__ = "2.0.0b2"
48
71
 
49
72
 
50
73
  __all__ = [
51
74
  "__version__",
52
- "all_catalogs",
75
+ "copy",
76
+ "get",
77
+ "put",
53
78
  "alter_table",
54
79
  "create_table",
55
80
  "drop_table",
@@ -68,20 +93,34 @@ __all__ = [
68
93
  "default_namespace",
69
94
  "write_to_table",
70
95
  "read_table",
96
+ "get_catalog",
97
+ "put_catalog",
98
+ "is_initialized",
71
99
  "init",
72
100
  "Catalog",
73
101
  "ContentType",
74
102
  "ContentEncoding",
75
103
  "DistributedDataset",
104
+ "Dataset",
105
+ "Datatype",
106
+ "Field",
107
+ "IcebergCatalog",
76
108
  "LifecycleState",
77
109
  "ListResult",
78
110
  "LocalDataset",
79
111
  "LocalTable",
80
112
  "Namespace",
113
+ "NullOrder",
114
+ "PartitionKey",
115
+ "PartitionScheme",
116
+ "Schema",
81
117
  "SchemaConsistencyType",
82
118
  "SortKey",
83
119
  "SortOrder",
120
+ "SortScheme",
84
121
  "TableDefinition",
85
122
  "TableType",
86
123
  "TableWriteMode",
87
124
  ]
125
+
126
+ __all__ += __iceberg__
@@ -0,0 +1,36 @@
1
+ def ExperimentalAPI(obj):
2
+ """Decorator for documenting experimental APIs.
3
+
4
+ Experimental APIs are classes and methods that are in development and may
5
+ change at any time in their development process. You should not expect
6
+ these APIs to be stable until their tag is changed to `DeveloperAPI` or
7
+ `PublicAPI`.
8
+
9
+ Subclasses that inherit from a ``@ExperimentalAPI`` base class can be
10
+ assumed experimental as well.
11
+
12
+ This decorator has no effect on runtime behavior
13
+ """
14
+ return obj
15
+
16
+
17
+ def DeveloperAPI(obj):
18
+ """Decorator for documenting developer APIs.
19
+
20
+ Developer APIs are classes and methods explicitly exposed to developers
21
+ for low level integrations with DeltaCAT (e.g.: compute engines, other catalogs).
22
+ You can generally expect these APIs to be stable sans minor changes (but less stable than public APIs).
23
+
24
+ This decorator has no effect on runtime behavior
25
+ """
26
+ return obj
27
+
28
+
29
+ def PublicAPI(obj):
30
+ """Decorator for documenting public APIs.
31
+
32
+ Public APIs are classes and methods exposed to end users which are expected to remain stable across releases.
33
+
34
+ This decorator has no effect on runtime behavior
35
+ """
36
+ return obj
@@ -0,0 +1,168 @@
1
+ from typing import Any
2
+
3
+
4
+ import deltacat as dc
5
+ from deltacat.catalog import Catalog
6
+
7
+
8
+ def copy(source, destination):
9
+ src_parts = source.split("/")
10
+ src_parts = [part for part in src_parts if part]
11
+ dst_parts = destination.split("/")
12
+ dst_parts = [part for part in dst_parts if part]
13
+ if not dc.is_initialized():
14
+ raise ValueError("Catalog not initialized.")
15
+ if len(src_parts) != len(dst_parts) and len(src_parts) != len(dst_parts) + 1:
16
+ # TODO(pdames): Better error message.
17
+ raise ValueError(
18
+ f"Cannot copy {source} to {destination}. "
19
+ f"Source and destination must share the same type."
20
+ )
21
+ src_obj = get(source)
22
+ if len(src_parts) == 1:
23
+ # copy the given catalog
24
+ raise NotImplementedError
25
+ elif len(src_parts) == 2:
26
+ # TODO(pdames): Make catalog specification optional if there is only
27
+ # one catalog (e.g., auto-retrieve src_parts[0]/dst_parts[0])
28
+ # copy the given namespace
29
+ src_namespace_name = src_parts[1]
30
+ dst_catalog_name = dst_parts[0]
31
+ dst_namespace_name = dst_parts[1] if len(dst_parts) >= 2 else src_namespace_name
32
+ new_namespace = dc.create_namespace(
33
+ namespace=dst_namespace_name,
34
+ properties=src_obj.properties,
35
+ catalog=dst_catalog_name,
36
+ )
37
+ return new_namespace
38
+ elif len(src_parts) == 3:
39
+ # copy the given table
40
+ raise NotImplementedError
41
+ elif len(src_parts) == 4:
42
+ # copy the given table version
43
+ raise NotImplementedError
44
+ elif len(src_parts) == 5:
45
+ # copy the given stream
46
+ raise NotImplementedError
47
+ elif len(src_parts) == 6:
48
+ # copy the given partition
49
+ raise NotImplementedError
50
+ elif len(src_parts) == 7:
51
+ # copy the given partition delta
52
+ raise NotImplementedError
53
+ raise ValueError(f"Invalid path: {src_parts}")
54
+
55
+
56
+ def concat(source, destination):
57
+ raise NotImplementedError
58
+
59
+
60
+ def delete(source):
61
+ raise NotImplementedError
62
+
63
+
64
+ def move(source, destination):
65
+ raise NotImplementedError
66
+
67
+
68
+ def list(path):
69
+ raise NotImplementedError
70
+
71
+
72
+ def get(path) -> Any:
73
+ parts = path.split("/")
74
+ parts = [part for part in parts if part]
75
+ if not dc.is_initialized():
76
+ # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
77
+ # last session.
78
+ raise ValueError("Catalog not initialized.")
79
+ if len(parts) == 1:
80
+ # TODO(pdames): Save all catalogs registered from the last session on
81
+ # disk so that users don't need to re-initialize them every time.
82
+ # get the given catalog
83
+ catalog_name = parts[0]
84
+ return dc.get_catalog(catalog_name)
85
+ elif len(parts) == 2:
86
+ # get the given namespace
87
+ catalog_name = parts[0]
88
+ namespace_name = parts[1]
89
+ return dc.get_namespace(
90
+ namespace=namespace_name,
91
+ catalog=catalog_name,
92
+ )
93
+ elif len(parts) == 3:
94
+ # get the given table
95
+ raise NotImplementedError
96
+ elif len(parts) == 4:
97
+ # get the given table version
98
+ raise NotImplementedError
99
+ elif len(parts) == 5:
100
+ # get the given stream
101
+ raise NotImplementedError
102
+ elif len(parts) == 6:
103
+ # get the given partition
104
+ raise NotImplementedError
105
+ elif len(parts) == 7:
106
+ # get the given partition delta
107
+ raise NotImplementedError
108
+ raise ValueError(f"Invalid path: {path}")
109
+
110
+
111
+ def put(path, *args, **kwargs) -> Any:
112
+ parts = path.split("/")
113
+ parts = [part for part in parts if part]
114
+ if len(parts) == 1:
115
+ # TODO(pdames): Save all catalogs registered from the last session on
116
+ # disk so that users don't need to re-initialize them every time.
117
+ # register the given catalog
118
+ catalog_name = parts[0]
119
+ # Initialize default catalog using kwargs
120
+ catalog = Catalog(**kwargs)
121
+ return dc.put_catalog(catalog_name, catalog)
122
+ elif len(parts) == 2:
123
+ # register the given namespace
124
+ catalog_name = parts[0]
125
+ namespace_name = parts[1]
126
+ if not dc.is_initialized():
127
+ # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
128
+ # last session.
129
+ raise ValueError("Catalog not initialized.")
130
+ new_namespace = dc.create_namespace(
131
+ namespace=namespace_name,
132
+ catalog=catalog_name,
133
+ *args,
134
+ **kwargs,
135
+ )
136
+ return new_namespace
137
+ elif len(parts) == 3:
138
+ # register the given table
139
+ raise NotImplementedError
140
+ elif len(parts) == 4:
141
+ # register the given table version
142
+ raise NotImplementedError
143
+ elif len(parts) == 5:
144
+ # register the given stream
145
+ raise NotImplementedError
146
+ elif len(parts) == 6:
147
+ # register the given partition
148
+ raise NotImplementedError
149
+ elif len(parts) == 7:
150
+ # register the given partition delta
151
+ raise NotImplementedError
152
+ raise ValueError(f"Invalid path: {path}")
153
+
154
+
155
+ def exists(path):
156
+ raise NotImplementedError
157
+
158
+
159
+ def query(path, expression):
160
+ raise NotImplementedError
161
+
162
+
163
+ def tail(path):
164
+ raise NotImplementedError
165
+
166
+
167
+ def head(path):
168
+ raise NotImplementedError
@@ -14,7 +14,7 @@ from deltacat.aws.constants import (
14
14
  DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
15
15
  )
16
16
 
17
- import pyarrow as pa
17
+ import pyarrow.fs
18
18
  import ray
19
19
  import s3fs
20
20
  from boto3.resources.base import ServiceResource
@@ -134,7 +134,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
134
134
  self,
135
135
  base_path: str,
136
136
  *,
137
- filesystem: Optional[pa.filesystem.FileSystem] = None,
137
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
138
138
  dataset_uuid: Optional[str] = None,
139
139
  block: Optional[ObjectRef[Block]] = None,
140
140
  block_index: Optional[int] = None,
@@ -150,7 +150,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
150
150
  self,
151
151
  base_path: str,
152
152
  *,
153
- filesystem: Optional[pa.filesystem.FileSystem] = None,
153
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
154
154
  dataset_uuid: Optional[str] = None,
155
155
  block: Optional[ObjectRef[Block]] = None,
156
156
  block_index: Optional[int] = None,
@@ -364,7 +364,7 @@ def upload_table(
364
364
  **s3_client_kwargs,
365
365
  ) -> ManifestEntryList:
366
366
  """
367
- Writes the given table to 1 or more S3 files and return Redshift
367
+ Writes the given table to 1 or more S3 files and returns
368
368
  manifest entries describing the uploaded files.
369
369
  """
370
370
  if s3_table_writer_kwargs is None:
@@ -0,0 +1,82 @@
1
+ import sys
2
+ import time
3
+ from contextlib import contextmanager
4
+ from typing import Generator, Tuple
5
+
6
+ from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
7
+ from deltacat.storage.rivulet.dataset import Dataset
8
+ from deltacat.storage.rivulet.reader.query_expression import QueryExpression
9
+
10
+
11
+ @contextmanager
12
+ def timed_step(description: str) -> Generator[BenchmarkStep, None, None]:
13
+ """Convenience for computing elapsed time of a block of code as a metric.
14
+
15
+ :param description: description of the step
16
+ :return: a benchmark operation populated with the elapsed time
17
+ """
18
+ metric = BenchmarkStep(description)
19
+ start_time = time.time()
20
+ yield metric
21
+ end_time = time.time()
22
+ metric.add(BenchmarkMetric("elapsed_time", 1000 * (end_time - start_time), "ms"))
23
+
24
+
25
+ class BenchmarkEngine:
26
+ def __init__(self, dataset: Dataset):
27
+ self.dataset = dataset
28
+
29
+ def load_and_commit(
30
+ self, schema_name, generator, count
31
+ ) -> Tuple[str, BenchmarkStep]:
32
+ """Load count number of rows from the generator and commit.
33
+
34
+ :param generator: row generator
35
+ :param count: the number of rows to load into the dataset
36
+ :return: tuple of the manifest URI and an operation measurement
37
+ """
38
+ desc = f"load {count} from {generator}"
39
+ writer = self.dataset.writer(schema_name)
40
+ with timed_step(desc) as step:
41
+ rows = [generator.generate() for _ in range(count)]
42
+ writer.write(rows)
43
+ result = writer.flush()
44
+ step.add(BenchmarkMetric("loaded", count))
45
+ return result, step
46
+
47
+ def scan(self) -> Tuple[set[any], BenchmarkStep]:
48
+ """
49
+ Scans the rows in the dataset and records some basic statistics (rows read, size in MB) as metrics
50
+
51
+ :return: Tuple[set[any], BenchmarkStep] - a tuple containing a set of merge keys and a benchmark step with metrics
52
+ """
53
+ keys = set()
54
+ object_count = 0
55
+ size_b = 0
56
+ # Note that we expect single-column merge keys so we can return a key set;
57
+ # this will fail with a validation error if the dataset has multiple merge keys
58
+ merge_key_name = self.dataset.schemas["all"].get_merge_key()
59
+ with timed_step("full scan") as step:
60
+ for row in self.dataset.scan(QueryExpression()).to_pydict():
61
+ object_count += 1
62
+ size_b += sum([sys.getsizeof(x) for x in row.values()])
63
+ keys.add(row.get(merge_key_name))
64
+ # TODO replace with the actual metrics we want to measure
65
+ step.add(BenchmarkMetric("rows read", object_count))
66
+ step.add(BenchmarkMetric("size", size_b / (1024 * 1024), "MB"))
67
+ return keys, step
68
+
69
+ def run_queries(
70
+ self, description, manifest_uri, queries: list[QueryExpression]
71
+ ) -> BenchmarkStep:
72
+ object_count = 0
73
+ size_b = 0
74
+ with timed_step(description) as step:
75
+ for query in queries:
76
+ for row in self.dataset.scan(query).to_pydict():
77
+ object_count += 1
78
+ size_b += sum([sys.getsizeof(x) for x in row.values()])
79
+ # TODO replace with the actual metrics we want to measure
80
+ step.add(BenchmarkMetric("rows read", object_count))
81
+ step.add(BenchmarkMetric("size", size_b / (1024 * 1024), "MB"))
82
+ return step
@@ -0,0 +1,86 @@
1
+ from dataclasses import dataclass
2
+ from tabulate import tabulate
3
+ from typing import Union, Optional
4
+
5
+
6
+ @dataclass
7
+ class BenchmarkMetric:
8
+ name: str
9
+ value: Union[float, int]
10
+ unit: Optional[str] = None
11
+
12
+
13
+ class BenchmarkStep:
14
+ """Captures measurements from a given operation"""
15
+
16
+ def __init__(self, description):
17
+ self.description: str = description
18
+ """Description of the operation"""
19
+ self._metrics: dict[str, BenchmarkMetric] = {}
20
+ """Metrics recorded for the operation, keyed by metric name"""
21
+
22
+ def add(self, metric: BenchmarkMetric):
23
+ self._metrics[metric.name] = metric
24
+
25
+ def list_metrics(self):
26
+ """List the metrics (sorted by name)"""
27
+ return sorted(self._metrics.values(), key=lambda x: x.name)
28
+
29
+
30
+ class BenchmarkRun:
31
+ """Class for capturing measurements for a given test suite for comparison."""
32
+
33
+ def __init__(self, suite: str, description: str):
34
+ self.suite = suite
35
+ """The test suite associated with this report."""
36
+ self.description = description
37
+ """Description of the report"""
38
+ self.steps: list[BenchmarkStep] = []
39
+ """List of steps and their metrics"""
40
+
41
+ def add(self, operation):
42
+ self.steps.append(operation)
43
+
44
+
45
+ class BenchmarkReport:
46
+ def __init__(self, name):
47
+ self.name = name
48
+ self.runs: list[BenchmarkRun] = []
49
+
50
+ def add(self, run):
51
+ self.runs.append(run)
52
+
53
+ def __str__(self):
54
+ """Pretty-print a table that compares the metrics across each run.
55
+
56
+ We want to transpose these such that each run gets its own column and each metric gets its own row
57
+ (ideally grouped by operation).
58
+ """
59
+ if not self.runs:
60
+ print("No runs to compare!")
61
+ return
62
+ suites = set(r.suite for r in self.runs)
63
+ if len(suites) > 1:
64
+ print("Found more than one type of suite")
65
+ return
66
+ suite = self.runs[0].suite
67
+
68
+ headers = [
69
+ f"{suite} Operation",
70
+ "Metric",
71
+ "Unit",
72
+ *[r.description for r in self.runs],
73
+ ]
74
+ rows = []
75
+ for step_tranche in zip(*[r.steps for r in self.runs]):
76
+ # TODO zip by metric name instead of assuming all metrics are being measured
77
+ step_name = step_tranche[0].description
78
+ for metric_tuple in zip(*[x.list_metrics() for x in step_tranche]):
79
+ row = [
80
+ step_name,
81
+ metric_tuple[0].name,
82
+ metric_tuple[0].unit,
83
+ *[p.value for p in metric_tuple],
84
+ ]
85
+ rows.append(row)
86
+ return tabulate(rows, headers=headers, tablefmt="fancy_outline")
@@ -0,0 +1,11 @@
1
+ from typing import Protocol
2
+
3
+ from deltacat.benchmarking.benchmark_report import BenchmarkRun
4
+
5
+
6
+ class BenchmarkSuite(Protocol):
7
+ def run(self) -> BenchmarkRun:
8
+ """Run the benchmark suite and produce a report.
9
+
10
+ Each report should be comparable against other reports by the same suite"""
11
+ ...
@@ -4,7 +4,9 @@ import pyarrow as pa
4
4
  import pyarrow.fs as pafs
5
5
  import pyarrow.parquet as papq
6
6
  import pytest
7
+ from _pytest.terminal import TerminalReporter
7
8
 
9
+ from deltacat.benchmarking.benchmark_report import BenchmarkReport
8
10
  from deltacat.utils.pyarrow import s3_file_to_table
9
11
  from deltacat.types.media import (
10
12
  ContentEncoding,
@@ -12,6 +14,25 @@ from deltacat.types.media import (
12
14
  )
13
15
 
14
16
 
17
+ @pytest.fixture(autouse=True, scope="function")
18
+ def report(request):
19
+ report = BenchmarkReport(request.node.name)
20
+
21
+ def final_callback():
22
+ terminal_reporter: TerminalReporter = request.config.pluginmanager.get_plugin(
23
+ "terminalreporter"
24
+ )
25
+ capture_manager = request.config.pluginmanager.get_plugin("capturemanager")
26
+ with capture_manager.global_and_fixture_disabled():
27
+ terminal_reporter.ensure_newline()
28
+ terminal_reporter.section(request.node.name, sep="-", blue=True, bold=True)
29
+ terminal_reporter.write(str(report))
30
+ terminal_reporter.ensure_newline()
31
+
32
+ request.addfinalizer(final_callback)
33
+ return report
34
+
35
+
15
36
  def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
16
37
  assert path.startswith(
17
38
  "s3://"