deltacat 2.0.0.post1__tar.gz → 2.0.0.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (454) hide show
  1. {deltacat-2.0.0.post1/deltacat.egg-info → deltacat-2.0.0.post3}/PKG-INFO +409 -94
  2. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/README.md +408 -93
  3. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/__init__.py +10 -3
  4. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/api.py +127 -22
  5. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/__init__.py +6 -0
  6. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/delegate.py +170 -3
  7. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/interface.py +35 -2
  8. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/main/impl.py +159 -207
  9. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/model/catalog.py +150 -35
  10. deltacat-2.0.0.post3/deltacat/catalog/model/properties.py +333 -0
  11. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/private/compaction_utils.py +8 -2
  12. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/steps/merge.py +9 -7
  13. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/converter_session.py +15 -10
  14. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +7 -5
  15. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/io.py +22 -3
  16. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/janitor.py +38 -15
  17. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/constants.py +11 -0
  18. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/bootstrap.py +3 -1
  19. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/explorer.py +0 -1
  20. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/utils/common.py +0 -1
  21. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +0 -1
  22. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/hello_world.py +10 -4
  23. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/indexer.py +3 -0
  24. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/job_runner.py +6 -1
  25. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/exceptions.py +15 -0
  26. deltacat-2.0.0.post3/deltacat/experimental/compatibility/backfill_transaction_partitions.py +513 -0
  27. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/table_monitor.py +2 -3
  28. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/daft/daft_catalog.py +1 -0
  29. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +7 -2
  30. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/interface.py +6 -7
  31. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/main/impl.py +209 -121
  32. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/delta.py +22 -8
  33. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/manifest.py +81 -9
  34. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/metafile.py +113 -30
  35. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/namespace.py +11 -3
  36. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/partition.py +19 -3
  37. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/schema.py +17 -4
  38. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/stream.py +10 -3
  39. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/table.py +10 -3
  40. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/table_version.py +10 -3
  41. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/transaction.py +259 -108
  42. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/types.py +1 -0
  43. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/aws/test_s3u.py +9 -1
  44. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +57 -6
  45. deltacat-2.0.0.post3/deltacat/tests/catalog/model/test_properties_transaction_migration.py +232 -0
  46. deltacat-2.0.0.post3/deltacat/tests/catalog/test_catalogs.py +651 -0
  47. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/test_default_catalog_impl.py +1382 -46
  48. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -18
  49. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/test_convert_session.py +2 -2
  50. deltacat-2.0.0.post3/deltacat/tests/compute/converter/test_converter_commit_conflict_resolution.py +626 -0
  51. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_janitor.py +60 -38
  52. deltacat-2.0.0.post3/deltacat/tests/conftest.py +56 -0
  53. deltacat-2.0.0.post3/deltacat/tests/experimental/compatibility/test_backfill_transaction_partitions.py +477 -0
  54. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/main/test_main_storage.py +17 -8
  55. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_metafile_io.py +142 -18
  56. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_transaction_history.py +128 -68
  57. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_deltacat_api.py +334 -25
  58. deltacat-2.0.0.post3/deltacat/tests/utils/test_filesystem.py +3319 -0
  59. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/media.py +278 -0
  60. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/tables.py +116 -124
  61. deltacat-2.0.0.post3/deltacat/utils/filesystem.py +1590 -0
  62. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/pandas.py +11 -3
  63. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/polars.py +3 -1
  64. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/pyarrow.py +7 -3
  65. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/url.py +111 -18
  66. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3/deltacat.egg-info}/PKG-INFO +409 -94
  67. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/SOURCES.txt +5 -2
  68. deltacat-2.0.0.post1/deltacat/catalog/model/properties.py +0 -155
  69. deltacat-2.0.0.post1/deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +0 -201
  70. deltacat-2.0.0.post1/deltacat/tests/catalog/test_catalogs.py +0 -321
  71. deltacat-2.0.0.post1/deltacat/tests/conftest.py +0 -25
  72. deltacat-2.0.0.post1/deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +0 -582
  73. deltacat-2.0.0.post1/deltacat/utils/filesystem.py +0 -450
  74. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/LICENSE +0 -0
  75. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/MANIFEST.in +0 -0
  76. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/annotations.py +0 -0
  77. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/__init__.py +0 -0
  78. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/clients.py +0 -0
  79. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/constants.py +0 -0
  80. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/aws/s3u.py +0 -0
  81. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/__init__.py +0 -0
  82. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_engine.py +0 -0
  83. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
  84. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_report.py +0 -0
  85. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/benchmark_suite.py +0 -0
  86. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/conftest.py +0 -0
  87. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/data/__init__.py +0 -0
  88. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/data/random_row_generator.py +0 -0
  89. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/data/row_generator.py +0 -0
  90. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/benchmarking/test_benchmark_pipeline.py +0 -0
  91. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/main/__init__.py +0 -0
  92. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/model/__init__.py +0 -0
  93. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/catalog/model/table_definition.py +0 -0
  94. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/__init__.py +0 -0
  95. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/__init__.py +0 -0
  96. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/compaction_session.py +0 -0
  97. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/__init__.py +0 -0
  98. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  99. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  100. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/compactor_version.py +0 -0
  101. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  102. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  103. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  104. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  105. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  106. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  107. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  108. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  109. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  110. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  111. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/model/table_object_store.py +0 -0
  112. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/repartition_session.py +0 -0
  113. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/__init__.py +0 -0
  114. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/dedupe.py +0 -0
  115. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
  116. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/materialize.py +0 -0
  117. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/steps/repartition.py +0 -0
  118. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/__init__.py +0 -0
  119. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/io.py +0 -0
  120. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
  121. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/round_completion_reader.py +0 -0
  122. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/sort_key.py +0 -0
  123. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  124. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/__init__.py +0 -0
  125. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
  126. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/constants.py +0 -0
  127. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
  128. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
  129. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
  130. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
  131. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
  132. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
  133. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
  134. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
  135. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
  136. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
  137. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
  138. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
  139. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
  140. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
  141. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  142. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
  143. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  144. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
  145. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
  146. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
  147. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/io.py +0 -0
  148. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
  149. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
  150. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
  151. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/__init__.py +0 -0
  152. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/constants.py +0 -0
  153. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/__init__.py +0 -0
  154. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/convert_input.py +0 -0
  155. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/convert_input_files.py +0 -0
  156. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/convert_result.py +0 -0
  157. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/model/converter_session_params.py +0 -0
  158. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  159. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/catalog.py +0 -0
  160. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/pyiceberg/overrides.py +0 -0
  161. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/steps/__init__.py +0 -0
  162. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/steps/convert.py +0 -0
  163. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/steps/dedupe.py +0 -0
  164. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/__init__.py +0 -0
  165. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/convert_task_options.py +0 -0
  166. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/converter_session_utils.py +0 -0
  167. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/iceberg_columns.py +0 -0
  168. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/converter/utils/s3u.py +0 -0
  169. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/jobs/__init__.py +0 -0
  170. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/jobs/client.py +0 -0
  171. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/__init__.py +0 -0
  172. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/delta.py +0 -0
  173. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/manifest.py +0 -0
  174. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/model.py +0 -0
  175. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/resource_estimation/parquet.py +0 -0
  176. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/__init__.py +0 -0
  177. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/__init__.py +0 -0
  178. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  179. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/delta_stats.py +0 -0
  180. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  181. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  182. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/models/stats_result.py +0 -0
  183. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/compute/stats/types.py +0 -0
  184. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/__init__.py +0 -0
  185. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/__init__.py +0 -0
  186. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/__init__.py +0 -0
  187. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  188. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/inference/generate_type_mappings.py +0 -0
  189. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +0 -0
  190. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/env.py +0 -0
  191. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/__init__.py +0 -0
  192. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/basic_logging.py +0 -0
  193. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/__init__.py +0 -0
  194. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/aws/__init__.py +0 -0
  195. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/compactor.py +0 -0
  196. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/gcp/__init__.py +0 -0
  197. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/job_runner.py +0 -0
  198. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/compactor/utils/__init__.py +0 -0
  199. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/__init__.py +0 -0
  200. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/__init__.py +0 -0
  201. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  202. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  203. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/app.py +0 -0
  204. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/main.py +0 -0
  205. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +0 -0
  206. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +0 -0
  207. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +0 -0
  208. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +0 -0
  209. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/experimental/iceberg/iceberg_reader.py +0 -0
  210. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/__init__.py +0 -0
  211. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/aws/__init__.py +0 -0
  212. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/examples/indexer/gcp/__init__.py +0 -0
  213. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/__init__.py +0 -0
  214. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/__init__.py +0 -0
  215. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/__init__.py +0 -0
  216. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +0 -0
  217. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/impl.py +0 -0
  218. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/catalog/iceberg/overrides.py +0 -0
  219. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/compatibility/__init__.py +0 -0
  220. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/__init__.py +0 -0
  221. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  222. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/converter_agent/beam/managed.py +0 -0
  223. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/daft/__init__.py +0 -0
  224. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/__init__.py +0 -0
  225. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/__init__.py +0 -0
  226. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +0 -0
  227. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/impl.py +0 -0
  228. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/model.py +0 -0
  229. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/iceberg/visitor.py +0 -0
  230. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/__init__.py +0 -0
  231. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  232. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/arrow/serializer.py +0 -0
  233. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/dataset.py +0 -0
  234. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/dataset_executor.py +0 -0
  235. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/feather/__init__.py +0 -0
  236. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/feather/file_reader.py +0 -0
  237. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/feather/serializer.py +0 -0
  238. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  239. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/file_provider.py +0 -0
  240. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/file_store.py +0 -0
  241. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/input_file.py +0 -0
  242. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/fs/output_file.py +0 -0
  243. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/logical_plan.py +0 -0
  244. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  245. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/delta.py +0 -0
  246. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/json_sst.py +0 -0
  247. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/sst.py +0 -0
  248. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +0 -0
  249. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/mvp/Table.py +0 -0
  250. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/mvp/__init__.py +0 -0
  251. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/__init__.py +0 -0
  252. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  253. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/file_reader.py +0 -0
  254. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/parquet/serializer.py +0 -0
  255. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  256. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/block_scanner.py +0 -0
  257. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/data_reader.py +0 -0
  258. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/data_scan.py +0 -0
  259. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/dataset_reader.py +0 -0
  260. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +0 -0
  261. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/query_expression.py +0 -0
  262. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +0 -0
  263. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  264. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/schema/datatype.py +0 -0
  265. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/schema/schema.py +0 -0
  266. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/serializer.py +0 -0
  267. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/serializer_factory.py +0 -0
  268. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  269. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/shard/range_shard.py +0 -0
  270. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  271. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/writer/dataset_writer.py +0 -0
  272. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +0 -0
  273. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/__init__.py +0 -0
  274. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/dataset/__init__.py +0 -0
  275. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/dataset/deltacat_dataset.py +0 -0
  276. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasink/__init__.py +0 -0
  277. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasink/deltacat_datasink.py +0 -0
  278. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasource/__init__.py +0 -0
  279. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/datasource/deltacat_datasource.py +0 -0
  280. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/file_object_store.py +0 -0
  281. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/memcached_object_store.py +0 -0
  282. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/object_store.py +0 -0
  283. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/ray_plasma_object_store.py +0 -0
  284. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/reader/__init__.py +0 -0
  285. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/reader/deltacat_read_api.py +0 -0
  286. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/redis_object_store.py +0 -0
  287. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/io/s3_object_store.py +0 -0
  288. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/logs.py +0 -0
  289. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/__init__.py +0 -0
  290. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/main/__init__.py +0 -0
  291. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/__init__.py +0 -0
  292. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/expression/__init__.py +0 -0
  293. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/expression/expression.py +0 -0
  294. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/expression/visitor.py +0 -0
  295. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/interop.py +0 -0
  296. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/list_result.py +0 -0
  297. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/locator.py +0 -0
  298. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/__init__.py +0 -0
  299. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/push_down.py +0 -0
  300. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/scan_plan.py +0 -0
  301. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/scan/scan_task.py +0 -0
  302. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/shard.py +0 -0
  303. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/sort_key.py +0 -0
  304. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/model/transform.py +0 -0
  305. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/util/__init__.py +0 -0
  306. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/storage/util/scan_planner.py +0 -0
  307. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/__init__.py +0 -0
  308. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/__init__.py +0 -0
  309. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/reader/__init__.py +0 -0
  310. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  311. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_cloudpickle_bug_fix.py +0 -0
  312. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_file_object_store.py +0 -0
  313. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_memcached_object_store.py +0 -0
  314. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_ray_plasma_object_store.py +0 -0
  315. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_redis_object_store.py +0 -0
  316. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/_io/test_s3_object_store.py +0 -0
  317. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/aws/__init__.py +0 -0
  318. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/aws/test_clients.py +0 -0
  319. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/__init__.py +0 -0
  320. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/data/__init__.py +0 -0
  321. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/main/__init__.py +0 -0
  322. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +0 -0
  323. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/model/__init__.py +0 -0
  324. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/catalog/model/test_table_definition.py +0 -0
  325. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/__init__.py +0 -0
  326. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
  327. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
  328. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
  329. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
  330. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/__init__.py +0 -0
  331. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  332. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
  333. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  334. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
  335. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +0 -0
  336. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  337. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
  338. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
  339. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -0
  340. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -0
  341. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
  342. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/conftest.py +0 -0
  343. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/__init__.py +0 -0
  344. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/conftest.py +0 -0
  345. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/converter/utils.py +0 -0
  346. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
  347. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
  348. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
  349. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
  350. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_incremental.py +0 -0
  351. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +0 -0
  352. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
  353. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_rebase.py +0 -0
  354. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +0 -0
  355. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_util_common.py +0 -0
  356. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/compute/test_util_constant.py +0 -0
  357. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/daft/__init__.py +0 -0
  358. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/daft/test_model.py +0 -0
  359. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/__init__.py +0 -0
  360. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/catalog/__init__.py +0 -0
  361. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  362. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +0 -0
  363. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/compatibility/__init__.py +0 -0
  364. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/daft/__init__.py +0 -0
  365. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +0 -0
  366. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/__init__.py +0 -0
  367. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  368. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/conftest.py +0 -0
  369. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  370. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +0 -0
  371. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  372. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +0 -0
  373. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +0 -0
  374. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +0 -0
  375. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  376. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +0 -0
  377. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  378. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +0 -0
  379. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_dataset.py +0 -0
  380. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_manifest.py +0 -0
  381. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +0 -0
  382. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/test_utils.py +0 -0
  383. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  384. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +0 -0
  385. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +0 -0
  386. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +0 -0
  387. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/__init__.py +0 -0
  388. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/main/__init__.py +0 -0
  389. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/__init__.py +0 -0
  390. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_delete_parameters.py +0 -0
  391. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_expression.py +0 -0
  392. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_manifest.py +0 -0
  393. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_partition_scheme.py +0 -0
  394. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_schema.py +0 -0
  395. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_schema_update.py +0 -0
  396. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_shard.py +0 -0
  397. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_sort_scheme.py +0 -0
  398. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_table_version.py +0 -0
  399. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/storage/model/test_transaction.py +0 -0
  400. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_exceptions.py +0 -0
  401. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_logs.py +0 -0
  402. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/__init__.py +0 -0
  403. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/constants.py +0 -0
  404. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/filesystem.py +0 -0
  405. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/message_pack_utils.py +0 -0
  406. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/pyarrow.py +0 -0
  407. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/storage.py +0 -0
  408. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/test_utils/utils.py +0 -0
  409. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/types/__init__.py +0 -0
  410. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/types/test_tables.py +0 -0
  411. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/__init__.py +0 -0
  412. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/data/__init__.py +0 -0
  413. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/exceptions.py +0 -0
  414. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/main_deltacat_storage_mock.py +0 -0
  415. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
  416. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
  417. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
  418. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_cloudpickle.py +0 -0
  419. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_daft.py +0 -0
  420. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_metrics.py +0 -0
  421. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_numpy.py +0 -0
  422. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_pandas.py +0 -0
  423. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_placement.py +0 -0
  424. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_polars.py +0 -0
  425. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_pyarrow.py +0 -0
  426. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  427. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/tests/utils/test_resources.py +0 -0
  428. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/__init__.py +0 -0
  429. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/types/partial_download.py +0 -0
  430. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/__init__.py +0 -0
  431. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/arguments.py +0 -0
  432. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/cloudpickle.py +0 -0
  433. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/common.py +0 -0
  434. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/daft.py +0 -0
  435. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/export.py +0 -0
  436. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/metafile_locator.py +0 -0
  437. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/metrics.py +0 -0
  438. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/numpy.py +0 -0
  439. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/performance.py +0 -0
  440. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/placement.py +0 -0
  441. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/__init__.py +0 -0
  442. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/collections.py +0 -0
  443. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/concurrency.py +0 -0
  444. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/dataset.py +0 -0
  445. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/performance.py +0 -0
  446. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/ray_utils/runtime.py +0 -0
  447. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/reader_compatibility_mapping.py +0 -0
  448. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/resources.py +0 -0
  449. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat/utils/schema.py +0 -0
  450. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/dependency_links.txt +0 -0
  451. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/requires.txt +0 -0
  452. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/deltacat.egg-info/top_level.txt +0 -0
  453. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/setup.cfg +0 -0
  454. {deltacat-2.0.0.post1 → deltacat-2.0.0.post3}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deltacat
3
- Version: 2.0.0.post1
3
+ Version: 2.0.0.post3
4
4
  Summary: DeltaCAT is a portable Pythonic Data Lakehouse powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -53,22 +53,20 @@ Dynamic: summary
53
53
  <img src="https://github.com/ray-project/deltacat/raw/2.0/media/deltacat-logo-alpha-750.png" alt="deltacat logo" style="width:55%; height:auto; text-align: center;">
54
54
  </p>
55
55
 
56
- DeltaCAT is a portable Pythonic Data Lakehouse powered by [Ray](https://github.com/ray-project/ray). It lets you define and manage
57
- fast, scalable, ACID-compliant multimodal data lakes, and has been used to [successfully manage exabyte-scale enterprise
58
- data lakes](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
56
+ DeltaCAT is a portable Multimodal Lakehouse powered by [Ray](https://github.com/ray-project/ray), [Apache Arrow](https://github.com/apache/arrow), and [Daft](https://github.com/Eventual-Inc/Daft). It lets you create ACID-compliant multimodal data lakes [that efficiently scale to manage exabytes of production data](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/).
59
57
 
60
- It uses the Ray distributed compute framework together with [Apache Arrow](https://github.com/apache/arrow) and
61
- [Daft](https://github.com/Eventual-Inc/Daft) to efficiently scale common table management tasks, like petabyte-scale
62
- merge-on-read and copy-on-write operations.
58
+ It provides data lake level transactions & time travel, zero-copy schema evolution, zero-copy multimodal file processing (image, audio, video, text, etc.), and transparent dataset optimization. It runs locally for rapid development or in the cloud for production workloads. It runs on any filesystem for easy setup and sharing - no external catalog services, lock managers, or key value stores required.
59
+
60
+
61
+ ## Overview
63
62
 
64
63
  DeltaCAT provides the following high-level components:
65
- 1. [**Catalog**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/catalog/interface.py): High-level APIs to create, discover, organize, share, and manage datasets.
66
- 2. [**Compute**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/compute/): Distributed data management procedures to read, write, and optimize datasets.
67
- 3. [**Storage**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/storage/): In-memory and on-disk multimodal dataset formats.
64
+ 1. [**Catalog**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/catalog/interface.py): Pythonic APIs to discover, read, write, and manage datasets.
65
+ 2. [**Compute**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/compute/): Distributed data management procedures that automatically optimize your datasets.
66
+ 3. [**Storage**](https://github.com/ray-project/deltacat/tree/2.0/deltacat/storage/): A portable multimodal data lake format useable with any filesystem.
68
67
  4. **Sync** (in development): Synchronize DeltaCAT datasets to data warehouses and other table formats.
69
68
 
70
- ## Overview
71
- DeltaCAT's **Catalog**, **Compute**, and **Storage** layers work together to bring ACID-compliant data management to any Ray application. These components automate data indexing, change management, dataset read/write optimization, schema evolution, and other common data management tasks across any set of data files readable by Ray Data, Daft, Pandas, Polars, PyArrow, or NumPy.
69
+ DeltaCAT's **Catalog**, **Compute**, and **Storage** layers work together to bring ACID-compliant data management to any Ray application. These components automate data indexing, change management, dataset read/write optimization, schema evolution, and other common data management tasks across any set of data files readable by [Pandas](https://github.com/pandas-dev/pandas), [NumPy](https://github.com/numpy/numpy), [Polars](https://github.com/pola-rs/polars), [PyArrow](https://arrow.apache.org/docs/python/index.html), [Ray Data](https://docs.ray.io/en/latest/data/data.html), and [Daft](https://docs.daft.ai/en/stable/api/dataframe/).
72
70
 
73
71
  <p align="center">
74
72
  <img src="https://github.com/ray-project/deltacat/raw/2.0/media/deltacat-tech-overview.png" alt="deltacat tech overview" style="width:100%; height:auto; text-align: center;">
@@ -79,10 +77,15 @@ Data consumers that prefer to stay within the ecosystem of Pythonic data managem
79
77
  ## Getting Started
80
78
  DeltaCAT applications run anywhere that Ray runs, including your local laptop, cloud computing cluster, or on-premise cluster.
81
79
 
82
- DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of one or more data files. A **Catalog** provides a root location (e.g., a local file path or S3 Bucket) to store table information, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
80
+ DeltaCAT lets you manage **Tables** across one or more **Catalogs**. A **Table** can be thought of as a named collection of data files. A **Catalog** can be thought of as a named data lake that contains a set of **Tables**. A **Catalog** provides a root location (e.g., a local file path or S3 Bucket) to store information about all your **Tables**, and can be rooted in any [PyArrow-compatible Filesystem](https://arrow.apache.org/docs/python/filesystems.html). **Tables** can be created, read, and written using the `dc.write` and `dc.read` APIs.
81
+
83
82
 
84
83
  ### Quick Start
85
84
 
85
+ Install DeltaCAT with: `pip install deltacat`
86
+
87
+ Then run this script to create and read your first table:
88
+
86
89
  ```python
87
90
  import deltacat as dc
88
91
  import pandas as pd
@@ -108,8 +111,8 @@ dc.write(data, "users")
108
111
  daft_df = dc.read("users") # Returns Daft DataFrame (default)
109
112
  daft_df.show() # Materialize and print the DataFrame
110
113
 
111
- # Append more data and add a new column.
112
- # Compaction and schema evolution are handled automatically.
114
+ # Add more data and add a new column.
115
+ # Compaction and zero-copy schema evolution are handled automatically.
113
116
  data = pd.DataFrame({
114
117
  "id": [4, 5, 6],
115
118
  "name": ["Tom", "Simpkin", "Delta"],
@@ -125,13 +128,13 @@ daft_df.select("name", "age", "city").show()
125
128
  ```
126
129
 
127
130
  ### Core Concepts
128
- DeltaCAT can do much more than just append data to tables and read it back again. Expand the sections below to see examples of other core DeltaCAT concepts and APIs.
131
+ DeltaCAT can do much more than just add data to tables and read it back again. Expand the sections below to see examples of other core DeltaCAT concepts and APIs.
129
132
 
130
133
  <details>
131
134
 
132
- <summary><span style="font-size: 1.25em; font-weight: bold;">Replacing and Dropping Tables</span></summary>
135
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Idempotent Writes</span></summary>
133
136
 
134
- If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **appends** table data by default. One way to prevent this perpetual table growth and make the example idempotent is to use the **REPLACE** write mode if the table already exists:
137
+ If you run the quick start example repeatedly from the same working directory, you'll notice that the table it writes to just keeps growing larger. This is because DeltaCAT always **adds** table data by default. One way to prevent this perpetual table growth and make the example idempotent is to use the **REPLACE** write mode if the table already exists:
135
138
 
136
139
  ```python
137
140
  import deltacat as dc
@@ -165,7 +168,7 @@ dc.write(data, "users", mode=write_mode)
165
168
  daft_df = dc.read("users") # Returns Daft DataFrame (default)
166
169
  daft_df.show() # Materialize and print the DataFrame
167
170
 
168
- # Explicitly append more data and add a new column.
171
+ # Explicitly add more data and add a new column.
169
172
  # Compaction and schema evolution are handled automatically.
170
173
  data = pd.DataFrame({
171
174
  "id": [4, 5, 6],
@@ -173,7 +176,7 @@ data = pd.DataFrame({
173
176
  "age": [2, 12, 4],
174
177
  "city": ["Hollywood", "Gloucester", "San Francisco"]
175
178
  })
176
- dc.write(data, "users", mode=dc.TableWriteMode.APPEND)
179
+ dc.write(data, "users", mode=dc.TableWriteMode.ADD)
177
180
 
178
181
  # Read the full table back into a Daft DataFrame.
179
182
  daft_df = dc.read("users")
@@ -217,7 +220,7 @@ dc.write(data, "users", mode=dc.TableWriteMode.CREATE)
217
220
  daft_df = dc.read("users") # Returns Daft DataFrame (default)
218
221
  daft_df.show() # Materialize and print the DataFrame
219
222
 
220
- # Explicitly append more data and add a new column.
223
+ # Explicitly add more data and add a new column.
221
224
  # Compaction and schema evolution are handled automatically.
222
225
  data = pd.DataFrame({
223
226
  "id": [4, 5, 6],
@@ -225,7 +228,7 @@ data = pd.DataFrame({
225
228
  "age": [2, 12, 4],
226
229
  "city": ["Hollywood", "Gloucester", "San Francisco"]
227
230
  })
228
- dc.write(data, "users", mode=dc.TableWriteMode.APPEND)
231
+ dc.write(data, "users", mode=dc.TableWriteMode.ADD)
229
232
 
230
233
  # Read the full table back into a Daft DataFrame.
231
234
  daft_df = dc.read("users")
@@ -237,9 +240,117 @@ assert dc.dataset_length(daft_df) == 6
237
240
 
238
241
  </details>
239
242
 
243
+
240
244
  <details>
241
245
 
242
- <summary><span style="font-size: 1.25em; font-weight: bold;">Supported Dataset and File Formats</span></summary>
246
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Ordered Writes</span></summary>
247
+ DeltaCAT writes are unordered by default, which means that the order of data written to the table isn't guaranteed to match the order in which it is read back. While this is useful for preventing conflicts between concurrent writers, you can also use the **APPEND** write mode to preserve write order and raise explicit concurrency conflicts between parallel writers:
248
+
249
+ ```python
250
+ import deltacat as dc
251
+ import pandas as pd
252
+
253
+ # Initialize DeltaCAT with a default local catalog.
254
+ # Ray will be initialized automatically.
255
+ # Catalog files will be stored in .deltacat/ in the current working directory.
256
+ dc.init_local()
257
+
258
+ # Create data to write.
259
+ data = pd.DataFrame({
260
+ "id": [1, 2],
261
+ "name": ["Cheshire", "Dinah"],
262
+ "age": [3, 7]
263
+ })
264
+
265
+ # Derive a DeltaCAT schema for the data.
266
+ schema = dc.Schema.of(dc.dataset_schema(data))
267
+
268
+ # Create an empty table to hold ordered user data.
269
+ if not dc.table_exists("users_ordered"):
270
+ dc.create_table("users_ordered", schema=schema)
271
+
272
+ # Write the first ordered delta to the table.
273
+ dc.write(data, "users_ordered", mode=dc.TableWriteMode.APPEND)
274
+
275
+ # Write the second ordered delta to the table.
276
+ data = pd.DataFrame({
277
+ "id": [3, 4],
278
+ "name": ["Felix", "Tom"],
279
+ "age": [2, 12],
280
+ "city": ["Hollywood", "Gloucester"]
281
+ })
282
+ dc.write(data, "users_ordered", mode=dc.TableWriteMode.APPEND)
283
+
284
+ # Write the third ordered delta to the table.
285
+ data = pd.DataFrame({
286
+ "id": [5, 6],
287
+ "name": ["Simpkin", "Delta"],
288
+ "age": [12, 4],
289
+ "city": ["San Francisco", "San Francisco"]
290
+ })
291
+ dc.write(data, "users_ordered", mode=dc.TableWriteMode.APPEND)
292
+
293
+ # Read the data back as a Pandas DataFrame, and ensure that the
294
+ # order of the records returned matches the order they were written.
295
+ pandas_df = dc.read("users_ordered", read_as=dc.DatasetType.PANDAS)
296
+ print(pandas_df)
297
+ ```
298
+
299
+ </details>
300
+
301
+ <details>
302
+
303
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Schemaless Tables</span></summary>
304
+ Tables created automatically via `dc.write` have a schema inferred from the data written by default. However, if you create an empty table without providing a schema, it defaults to schemaless. Writes to schemaless tables are more efficient and flexible, since they simply track the location and basic metadata associated with the data files written to the table. However, if you know that a unified schema can be derived for your schemaless data, then you can still read it back as a structured dataset:
305
+
306
+ ```python
307
+ import deltacat as dc
308
+ import pandas as pd
309
+
310
+ # Initialize DeltaCAT with a default local catalog.
311
+ # Ray will be initialized automatically.
312
+ # Catalog files will be stored in .deltacat/ in the current working directory.
313
+ dc.init_local()
314
+
315
+ # Create data to write.
316
+ data = pd.DataFrame({
317
+ "id": [1, 2],
318
+ "name": ["Cheshire", "Dinah"],
319
+ "age": [3, 7]
320
+ })
321
+
322
+ # Create an empty schemaless table to hold ordered user data.
323
+ if not dc.table_exists("users_schemaless"):
324
+ dc.create_table("users_schemaless")
325
+
326
+ # Write the first ordered delta to the table.
327
+ dc.write(data, "users_schemaless", mode=dc.TableWriteMode.APPEND)
328
+
329
+ # Write the second ordered delta to the table.
330
+ data = pd.DataFrame({
331
+ "id": [3, 4],
332
+ "name": ["Felix", "Tom"],
333
+ "age": [2, 12],
334
+ "city": ["Hollywood", "Gloucester"]
335
+ })
336
+ dc.write(data, "users_schemaless", mode=dc.TableWriteMode.APPEND)
337
+
338
+ # Read back the file manifest of the schemaless table.
339
+ # Notice that file paths, sizes, etc. are returned instead of the dataframes written.
340
+ manifest_df = dc.read("users_schemaless", read_as=dc.DatasetType.PANDAS)
341
+ print(manifest_df)
342
+
343
+ # Use from_manifest_table to convert the manifest table to a structured dataset.
344
+ structured_daft_df = dc.from_manifest_table(manifest_df)
345
+ structured_daft_df.show()
346
+ ```
347
+
348
+ </details>
349
+
350
+
351
+ <details>
352
+
353
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Working Across Dataset and File Types</span></summary>
243
354
 
244
355
  DeltaCAT natively supports a variety of open dataset and file formats already integrated with Ray and Arrow. You can use `dc.read` to read tables back as a Daft DataFrame, Ray Dataset, Pandas DataFrame, PyArrow Table, Polars DataFrame, NumPy Array, or list of PyArrow ParquetFile objects:
245
356
 
@@ -329,7 +440,7 @@ print("\n=== NumPy Table ===")
329
440
  dc.read("my_numpy_table").show()
330
441
  ```
331
442
 
332
- Or write to different table file formats:
443
+ DeltaCAT tables also support persisting data in heterogeneous table file formats like Avro, ORC, or Feather:
333
444
 
334
445
  ```python
335
446
  data = pd.DataFrame({"id": [1], "name": ["Cheshire"], "age": [3]})
@@ -372,9 +483,9 @@ print(pandas_df)
372
483
 
373
484
  <details>
374
485
 
375
- <summary><span style="font-size: 1.25em; font-weight: bold;">Merging and Deleting Data</span></summary>
486
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Live Feature Enrichment</span></summary>
376
487
 
377
- DeltaCAT can automatically merge and delete data by defining a table schema with one or more merge keys:
488
+ DeltaCAT can update your datasets on-the-fly to keep up with a continuous stream of new insights, and support common ML use-cases like feature enrichment. Just define a table schema with one or more merge keys to start updating and deleting existing records:
378
489
 
379
490
  ```python
380
491
  import deltacat as dc
@@ -385,53 +496,50 @@ import tempfile
385
496
  # Initialize DeltaCAT with a fresh temporary catalog
386
497
  dc.init_local(tempfile.mkdtemp())
387
498
 
388
- # Define a schema with user_id as a merge key.
389
- schema = dc.Schema.of([
499
+ # Start with minimal schema - just user_id as merge key and name
500
+ initial_schema = dc.Schema.of([
390
501
  dc.Field.of(pa.field("user_id", pa.int64()), is_merge_key=True),
391
502
  dc.Field.of(pa.field("name", pa.string())),
392
- dc.Field.of(pa.field("age", pa.int32())),
393
- dc.Field.of(pa.field("status", pa.string())),
394
503
  ])
395
504
 
396
- # Initial user data
505
+ # Initial user data - just basic info
397
506
  initial_users = pd.DataFrame({
398
507
  "user_id": [1, 2, 3],
399
- "name": ["Cheshire", "Dinah", "Felix"],
400
- "age": [3, 7, 2],
401
- "status": ["active", "active", "inactive"]
508
+ "name": ["Jim", "Dinah", "Bob"],
402
509
  })
403
510
 
404
- # Write initial data with the merge key schema
405
- dc.write(initial_users, "users", schema=schema)
511
+ # Write initial data with minimal schema
512
+ dc.write(initial_users, "users", schema=initial_schema)
406
513
 
407
- # Read the data back as a Pandas DataFrame.
514
+ # Read the data back as a Pandas DataFrame
408
515
  df = dc.read("users", read_as=dc.DatasetType.PANDAS)
409
- print("=== Initial Users ===")
516
+ print("=== Initial Users (Basic Info) ===")
410
517
  print(df.sort_values("user_id"))
411
518
 
412
- # Update data for existing users + add new users
413
- updated_users = pd.DataFrame({
414
- "user_id": [2, 3, 4, 5, 6],
415
- "name": ["Dinah", "Felix", "Tom", "Simpkin", "Delta"],
416
- "age": [7, 2, 5, 12, 4],
417
- "status": ["premium", "active", "active", "active", "active"]
519
+ # Later, enrich with new insights: add age/job features + new users
520
+ enriched_data = pd.DataFrame({
521
+ "user_id": [1, 3, 4, 5, 6],
522
+ "name": ["Cheshire", "Felix", "Tom", "Simpkin", "Delta"],
523
+ "age": [3, 2, 5, 12, 4],
524
+ "job": ["Tour Guide", "Drifter", "Housekeeper", "Mouser", "Engineer"]
418
525
  })
419
526
 
420
- # Write automatically detects that the schema has a merge key and:
421
- # 1. Updates existing records with matching user IDs.
422
- # 2. Inserts new records with new user IDs.
423
- dc.write(updated_users, "users", schema=schema)
527
+ # DeltaCAT automatically evolves the schema and merges by user_id:
528
+ # 1. Enriches existing users (Jim -> Cheshire age=3, job="Tour Guide"; Bob -> Felix)
529
+ # 2. Adds new age/job columns with automatic schema evolution
530
+ # 3. Inserts new users (Tom, Simpkin, Delta) with full feature set
531
+ dc.write(enriched_data, "users")
424
532
 
425
- # Read back to see merged results
533
+ # Read back to see live feature enrichment results
426
534
  df = dc.read("users", read_as=dc.DatasetType.PANDAS)
427
- print("\n=== After Merge ===")
535
+ print("\n=== Enriched Users (Age & Job) ===")
428
536
  print(df.sort_values("user_id"))
429
537
 
430
- # - Cheshire (user_id=1) remains unchanged
431
- # - Dinah (user_id=2) status updated to "premium"
432
- # - Felix (user_id=3) updated to "active"
433
- # - New users (4,5,6), (Tom, Simpkin, Delta) added
434
- # - No duplicate user_id values exist
538
+ # - Cheshire (user_id=1) name updated from Jim, gets age=3, job="Tour Guide"
539
+ # - Dinah (user_id=2) keeps original name, gets null age/job (missing features)
540
+ # - Felix (user_id=3) name updated from Bob, gets age=2, job="Drifter"
541
+ # - New users (4,5,6) added with complete feature set
542
+ # - Schema automatically evolved to include age/job columns
435
543
 
436
544
  # Specify the users to delete.
437
545
  # We only need to specify matching merge key values.
@@ -440,7 +548,7 @@ users_to_delete = pd.DataFrame({
440
548
  })
441
549
 
442
550
  # Delete the records that match our merge keys.
443
- dc.write(users_to_delete, "users", schema=schema, mode=dc.TableWriteMode.DELETE)
551
+ dc.write(users_to_delete, "users", mode=dc.TableWriteMode.DELETE)
444
552
 
445
553
  # Read the table back to confirm target users have been deleted.
446
554
  df = dc.read("users", read_as=dc.DatasetType.PANDAS)
@@ -456,6 +564,117 @@ print(df.sort_values("user_id"))
456
564
 
457
565
  <details>
458
566
 
567
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Zero-Copy Multimodal URL Processing</span></summary>
568
+
569
+ DeltaCAT can register and process existing multimodal datasets from local or remote URLs. This enables zero-copy distributed processing of images, audio, text, and other file formats:
570
+
571
+ ```python
572
+ import deltacat as dc
573
+ import pandas as pd
574
+ import pyarrow as pa
575
+ import tempfile
576
+ import ray
577
+
578
+ # Initialize DeltaCAT with a fresh temporary catalog
579
+ dc.init_local(tempfile.mkdtemp())
580
+
581
+ # Create dataset with DeltaCAT URLs pointing to existing files
582
+ urls_df = pd.DataFrame({
583
+ "file_id": [1, 2, 3, 4, 5, 6],
584
+ "url": [
585
+ # URLs with common file extensions will have their content type inferred.
586
+ "https://picsum.photos/id/237/400/300.jpg",
587
+ "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
588
+ "https://raw.githubusercontent.com/SergLam/Audio-Sample-files/master/sample.mp3",
589
+ "https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
590
+ "https://raw.githubusercontent.com/microsoft/vscode/main/package.json",
591
+ # URLs without common file extensions will be read as binary by default.
592
+ "https://picsum.photos/200"
593
+ ]
594
+ })
595
+
596
+ # Create empty table with merge key to efficiently add insights about each file
597
+ dc.create_table(
598
+ "multimodal_files",
599
+ schema=dc.Schema.of([
600
+ dc.Field.of(pa.field("file_id", pa.int64()), is_merge_key=True),
601
+ dc.Field.of(pa.field("url", pa.string()))
602
+ ])
603
+ )
604
+
605
+ # Write URLs to DeltaCAT table
606
+ dc.write(urls_df, "multimodal_files")
607
+
608
+ # UDF to process each file in parallel using Ray Dataset map method
609
+ def analyze_file(row):
610
+ file_id = row["file_id"]
611
+ url = row["url"]
612
+
613
+ # DeltaCAT automatically infers the right Ray Data reader for the URL
614
+ dataset = dc.get(url)
615
+ records = dataset.take_all()
616
+ url_type = dc.DatastoreType.from_url(url)
617
+
618
+ # Extract standard Ray Dataset fields for each file type
619
+ if url_type == dc.DatastoreType.IMAGES:
620
+ image = records[0]["image"]
621
+ analysis = f"Image {image.shape[1]}x{image.shape[0]} pixels"
622
+ elif url_type == dc.DatastoreType.CSV:
623
+ analysis = f"CSV with {len(records)} rows, {len(records[0].keys())} columns"
624
+ elif url_type == dc.DatastoreType.AUDIO:
625
+ sample_rate = records[0]["sample_rate"]
626
+ duration = len(records[0]["amplitude"][0]) / sample_rate
627
+ analysis = f"Audio {duration:.1f}s, {sample_rate}Hz"
628
+ elif url_type == dc.DatastoreType.JSON:
629
+ analysis = f"JSON with {len(records[0].keys())} fields"
630
+ elif url_type == dc.DatastoreType.TEXT:
631
+ analysis = f"Text with {len(records)} records"
632
+ else:
633
+ analysis = f"Binary with {len(records[0]['bytes'])} bytes"
634
+
635
+ return {"file_id": file_id, "analysis": analysis}
636
+
637
+ # Read the multimodal_files table as a Ray Dataset
638
+ ray_dataset = dc.read("multimodal_files", read_as=dc.DatasetType.RAY_DATASET)
639
+ # Download and analyze each URL in parallel using map
640
+ results_dataset = ray_dataset.map(analyze_file)
641
+
642
+ # Write results back to the multimodal_files table
643
+ dc.write(results_dataset, "multimodal_files", mode=dc.TableWriteMode.MERGE)
644
+
645
+ # Read final results and compare to initial dataset
646
+ print("\n=== Initial Dataset ===")
647
+ print(dc.to_pandas(ray_dataset))
648
+
649
+ print("\n=== Final Results with Analysis ===")
650
+ print(dc.read("multimodal_files", read_as=dc.DatasetType.PANDAS))
651
+ ```
652
+
653
+ The default dataset type used by `dc.get` is a Ray Dataset but, similar to `dc.read`, `dc.get` can also read URLs into other dataset types like Daft:
654
+
655
+ ```python
656
+ import deltacat as dc
657
+
658
+ # Create dataset with DeltaCAT URLs pointing to existing files
659
+ urls = [
660
+ # URLs with common file extensions will have their content type inferred.
661
+ "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
662
+ "https://raw.githubusercontent.com/burningtree/awesome-json/master/README.md",
663
+ # URLs without common file extensions will be read as binary by default.
664
+ "https://picsum.photos/200"
665
+ ]
666
+
667
+ # Download each URL into a Daft DataFrame serially
668
+ for url in urls:
669
+ dataset = dc.get(url, read_as=dc.DatasetType.DAFT)
670
+ print(f"\n=== {url} ===")
671
+ print(dataset.show())
672
+ ```
673
+
674
+ </details>
675
+
676
+ <details>
677
+
459
678
  <summary><span style="font-size: 1.25em; font-weight: bold;">Organizing Tables with Namespaces</span></summary>
460
679
 
461
680
  In DeltaCAT, table **Namespaces** are optional but useful for organizing related tables within a catalog:
@@ -486,6 +705,10 @@ order_data = pd.DataFrame({
486
705
  "product_id": [101, 102, 103],
487
706
  "quantity": [2, 1, 2]
488
707
  })
708
+ # Create identity, inventory, and sales namespaces
709
+ dc.create_namespace("identity")
710
+ dc.create_namespace("inventory")
711
+ dc.create_namespace("sales")
489
712
 
490
713
  # Write tables to different namespaces to organize them by domain
491
714
  dc.write(user_data, "users", namespace="identity")
@@ -511,7 +734,10 @@ finance_users = pd.DataFrame({
511
734
  "preferred_payment_method": ["credit", "cash", "paypal"]
512
735
  })
513
736
 
737
+ dc.create_namespace("marketing")
514
738
  dc.write(marketing_users, "users", namespace="marketing")
739
+
740
+ dc.create_namespace("finance")
515
741
  dc.write(finance_users, "users", namespace="finance")
516
742
 
517
743
  # Each namespace maintains its own "users" table with different schemas
@@ -534,9 +760,9 @@ print(finance_df)
534
760
 
535
761
  <details>
536
762
 
537
- <summary><span style="font-size: 1.25em; font-weight: bold;">Multi-Table Transactions</span></summary>
763
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Transactions</span></summary>
538
764
 
539
- DeltaCAT transactions can span multiple tables and namespaces. Since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
765
+ DeltaCAT transactions can span multiple tables and namespaces. Because transaction history is maintained at the catalog level, every transaction operates against a consistent snapshot of every object in your data lake. And since all operations within a transaction either succeed or fail together, this simplifies keeping related datasets in sync across your entire catalog.
540
766
 
541
767
  Consider the previous example that organized tables with namespaces. One table tracked customer orders, and another table tracked the lifetime payments of each customer. If one table was updated but not the other, then it would result in an accounting discrepancy. This edge case can be eliminated by using multi-table transactions:
542
768
 
@@ -557,6 +783,7 @@ product_data = pd.DataFrame({
557
783
  })
558
784
 
559
785
  # The product catalog can be created independently.
786
+ dc.create_namespace("inventory")
560
787
  dc.write(product_data, "catalog", namespace="inventory")
561
788
 
562
789
  print(f"\n=== Initial Product Data ===")
@@ -583,7 +810,9 @@ finance_schema = dc.Schema.of([
583
810
  # Create user identities and user finance data within a single transaction.
584
811
  # Since transactions are atomic, this prevents accounting discrepancies.
585
812
  with dc.transaction():
813
+ dc.create_namespace("identity")
586
814
  dc.write(user_data, "users", namespace="identity")
815
+ dc.create_namespace("finance")
587
816
  dc.write(initial_finance, "users", namespace="finance", schema=finance_schema)
588
817
 
589
818
  print(f"\n=== Initial User Data ===")
@@ -602,6 +831,7 @@ new_orders = pd.DataFrame({
602
831
  # Process new orders and update lifetime payment totals within a single transaction.
603
832
  with dc.transaction():
604
833
  # Step 1: Write the new orders
834
+ dc.create_namespace("sales")
605
835
  dc.write(new_orders, "transactions", namespace="sales")
606
836
 
607
837
  # Step 2: Read back transactions and products to compute actual totals
@@ -617,6 +847,7 @@ with dc.transaction():
617
847
  finance_updates.columns = ["user_id", "lifetime_payments"]
618
848
 
619
849
  # Step 4: Write the computed totals
850
+ dc.create_namespace("finance")
620
851
  dc.write(finance_updates, "users", namespace="finance", mode=dc.TableWriteMode.MERGE)
621
852
 
622
853
  # Verify that orders and lifetime payments are kept in sync.
@@ -630,7 +861,7 @@ print(dc.read("users", namespace="finance", read_as=dc.DatasetType.PANDAS))
630
861
 
631
862
  <details>
632
863
 
633
- <summary><span style="font-size: 1.25em; font-weight: bold;">Working with Multiple Catalogs</span></summary>
864
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Managing Multiple Data Lakes</span></summary>
634
865
 
635
866
  DeltaCAT lets you work with multiple catalogs in a single application. All catalogs registered with DeltaCAT are tracked by a Ray Actor to make them available to all workers in your Ray application.
636
867
 
@@ -646,16 +877,14 @@ import tempfile
646
877
  from decimal import Decimal
647
878
 
648
879
  # Initialize catalogs with separate names and catalog roots.
649
- dc.init(catalogs={
650
- "staging": dc.Catalog(config=dc.CatalogProperties(
651
- root=tempfile.mkdtemp(), # Use temporary directory for staging
652
- filesystem=pa.fs.LocalFileSystem()
653
- )),
654
- "prod": dc.Catalog(config=dc.CatalogProperties(
655
- root=tempfile.mkdtemp(), # Use temporary directory for prod
656
- filesystem=pa.fs.LocalFileSystem()
657
- ))
658
- })
880
+ dc.init(
881
+ catalogs={
882
+ # Use temporary directory for staging
883
+ "staging": dc.Catalog(dc.CatalogProperties(tempfile.mkdtemp())),
884
+ # Use S3 for prod
885
+ "prod": dc.Catalog(dc.CatalogProperties("s3://example/deltacat"))
886
+ }
887
+ )
659
888
 
660
889
  # Create a PyArrow table with decimal256 data
661
890
  decimal_table = pa.table({
@@ -705,9 +934,95 @@ print(dc.read("financial_data", catalog="prod", read_as=dc.DatasetType.PANDAS))
705
934
 
706
935
  <details>
707
936
 
708
- <summary><span style="font-size: 1.25em; font-weight: bold;">Transaction History & Time Travel</span></summary>
937
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Sharing & Portability</span></summary>
938
+
939
+ DeltaCAT catalogs are self-contained directories on a filesystem, so you can easily share your data lake with others. A local catalog on your laptop can be compressed and sent anywhere. A cloud catalog in S3, GCS, or Azure Blob Storage can be shared via URL. The read/write permissions of your catalog are the read/write permissions of your filesystem.
940
+
941
+ For example, you can zip up your local catalog and upload it to S3 via:
942
+ ```bash
943
+ # zip a local catalog
944
+ zip -r catalog.zip .deltacat/
945
+
946
+ # copy the catalog to a cloud bucket
947
+ aws s3 cp catalog.zip s3://my-bucket/catalog.zip
948
+ ```
949
+
950
+ The person you shared it with can retrieve and decompress it via:
951
+ ```bash
952
+ # copy the cloud catalog to local disk
953
+ aws s3 cp s3://my-bucket/catalog.zip .
954
+
955
+ # unzip the catalog to a local directory
956
+ unzip catalog.zip -d .deltacat_copy/
957
+ ```
958
+
959
+ And then initialize it together with any other catalogs they're working with:
960
+ ```python
961
+ import deltacat as dc
962
+
963
+ # Initialize catalogs with separate names and catalog roots.
964
+ dc.init(
965
+ catalogs={
966
+ "original": dc.Catalog(dc.CatalogProperties(".deltacat")),
967
+ "copy": dc.Catalog(dc.CatalogProperties(".deltacat_copy")),
968
+ "prod_aws": dc.Catalog(dc.CatalogProperties("s3://prod/deltacat")),
969
+ "prod_gcp": dc.Catalog(dc.CatalogProperties("gs://prod/deltacat")),
970
+ "prod_azure": dc.Catalog(dc.CatalogProperties("az://prod/deltacat")),
971
+ }
972
+ )
973
+
974
+ # List all namespaces in the original catalog
975
+ namespaces = dc.list("dc://original")
976
+ print([namespace.name for namespace in namespaces])
977
+
978
+ # List all namespaces in the copy catalog
979
+ namespaces = dc.list("dc://copy")
980
+ print([namespace.name for namespace in namespaces])
981
+
982
+ # List all tables in the default namespace of the original catalog
983
+ tables = dc.list("dc://original/default")
984
+ print([table.name for table in tables])
985
+
986
+ # List all tables in the default namespace of the copy catalog
987
+ tables = dc.list("dc://copy/default")
988
+ print([table.name for table in tables])
989
+ ```
990
+
991
+ `dc.copy` can also be used to copy namespaces and tables between catalogs:
992
+ ```python
993
+ # Copy the "default" namespace from the original local catalog over to the "myspace" namespace in the copy catalog
994
+ dc.copy("dc://original/default", "dc://copy/default/myspace")
995
+
996
+ # By default, no tables are copied from the source namespace to the destination
997
+ tables = dc.list("dc://copy/myspace")
998
+ print(f"{len(tables)} tables in myspace.")
999
+
1000
+ # Copy the "users" table from the original local catalog over to "local_users" in the prod_aws catalog
1001
+ dc.copy("dc://original/default/users", "dc://prod_aws/default/local_users")
1002
+
1003
+ # Read the copied table back
1004
+ df = dc.read("local_users", catalog="prod_aws")
1005
+ df.show()
1006
+
1007
+ # We can also copy all tables in the default namespace using **
1008
+ dc.copy("dc://original/default/**", "dc://copy/default/myspace")
1009
+ tables = dc.list("dc://copy/myspace")
1010
+ print(f"{len(tables)} tables in myspace.")
1011
+
1012
+ # Or we can copy all namespaces from the original catalog using *
1013
+ dc.copy("dc://original/*", "dc://copy")
1014
+ namespaces = dc.list("dc://copy")
1015
+ print([namespace.name for namespace in namespaces])
1016
+ ```
1017
+
1018
+ </details>
1019
+
1020
+
1021
+ <details>
1022
+
1023
+ <summary><span style="font-size: 1.25em; font-weight: bold;">Data Lake Level Time Travel</span></summary>
709
1024
 
710
- DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with multi-table transactions, this enables consistent point-in-time views across your entire data catalog.
1025
+ DeltaCAT supports time travel queries that let you read all tables in a catalog as they existed at any point in the past. Combined with catalog-level transactions, this enables consistent point-in-time views across your entire data lake.
711
1026
 
712
1027
  ```python
713
1028
  import deltacat as dc
@@ -744,10 +1059,10 @@ initial_finance = pd.DataFrame({
744
1059
 
745
1060
  # Write initial state atomically with a commit message
746
1061
  with dc.transaction(commit_message="Initial data load: users, products, orders, and finance"):
747
- dc.write(initial_users, "users", namespace="identity")
748
- dc.write(initial_products, "catalog", namespace="inventory")
749
- dc.write(initial_orders, "transactions", namespace="sales")
750
- dc.write(initial_finance, "users", namespace="finance")
1062
+ dc.write(initial_users, "users", namespace="identity", auto_create_namespace=True)
1063
+ dc.write(initial_products, "catalog", namespace="inventory", auto_create_namespace=True)
1064
+ dc.write(initial_orders, "transactions", namespace="sales", auto_create_namespace=True)
1065
+ dc.write(initial_finance, "users", namespace="finance", auto_create_namespace=True)
751
1066
 
752
1067
  # Sleep briefly to ensure transaction timestamp separation
753
1068
  time.sleep(0.1)
@@ -847,7 +1162,7 @@ print("\nTime travel validation successful!")
847
1162
 
848
1163
  <summary><span style="font-size: 1.25em; font-weight: bold;">Multimodal Batch Inference</span></summary>
849
1164
 
850
- DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed precitions predictions for each image:
1165
+ DeltaCAT's support for merging new fields into existing records and multimodal datasets can be used to build a multimodal batch inference pipeline. For example, the following code indexes images of cats, then merges in new fields with breed predictions for each image:
851
1166
 
852
1167
  > **Requirements**: This example requires PyTorch ≥ 2.8.0 and torchvision ≥ 0.23.0. Install via: `pip install "torch>=2.8.0" "torchvision>=0.23.0"`
853
1168
 
@@ -938,7 +1253,7 @@ final_df.show()
938
1253
 
939
1254
  <summary><span style="font-size: 1.25em; font-weight: bold;">LLM Batch Inference</span></summary>
940
1255
 
941
- DeltaCAT multi-table transactions, time travel queries, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
1256
+ DeltaCAT multi-table transactions, data lake time travel, and automatic schema evolution can be used to create auditable LLM batch inference pipelines. For example, the following code tries different approaches to analyze the overall tone of customer feedback, then generates customer service responses based on the analysis:
942
1257
 
943
1258
  ```python
944
1259
  import deltacat as dc
@@ -963,7 +1278,7 @@ daft_docs = daft_docs.with_column("content", daft_docs["path"].url.download().de
963
1278
  # Capture basic feedback sentiment analysis in a parallel multi-table transaction
964
1279
  with dc.transaction():
965
1280
  # Write the full customer feedback to a new "documents" table.
966
- dc.write(daft_docs, "documents", namespace="analysis")
1281
+ dc.write(daft_docs, "documents")
967
1282
 
968
1283
  # Define a UDF to analyze customer feedback sentiment.
969
1284
  @daft.udf(return_dtype=daft.DataType.struct({
@@ -1000,14 +1315,14 @@ with dc.transaction():
1000
1315
  dc.Field.of(pa.field("confidence", pa.float64())),
1001
1316
  dc.Field.of(pa.field("model_version", pa.large_string())),
1002
1317
  ])
1003
- dc.write(daft_results, "insights", namespace="analysis", schema=initial_schema)
1318
+ dc.write(daft_results, "insights", schema=initial_schema)
1004
1319
 
1005
1320
  # Write to a new audit trail table.
1006
1321
  audit_df = pd.DataFrame([{
1007
1322
  "version": "v1.0",
1008
1323
  "docs_processed": dc.dataset_length(daft_docs),
1009
1324
  }])
1010
- dc.write(audit_df, "audit", namespace="analysis")
1325
+ dc.write(audit_df, "audit")
1011
1326
 
1012
1327
  print("=== V1.0: Customer feedback sentiment analysis processing complete! ===")
1013
1328
 
@@ -1048,9 +1363,9 @@ with dc.transaction():
1048
1363
  )
1049
1364
 
1050
1365
  # Merge new V2.0 insights into the existing V1.0 insights table.
1051
- dc.write(daft_emotions, "insights", namespace="analysis")
1366
+ dc.write(daft_emotions, "insights")
1052
1367
  audit_df = pd.DataFrame([{"version": "v2.0", "docs_processed": dc.dataset_length(daft_docs)}])
1053
- dc.write(audit_df, "audit", namespace="analysis")
1368
+ dc.write(audit_df, "audit")
1054
1369
 
1055
1370
  print("=== V2.0: Customer feedback emotion analysis processing complete! ===")
1056
1371
 
@@ -1062,7 +1377,7 @@ time.sleep(0.1)
1062
1377
  # Generate customer service responses based on emotion analysis results.
1063
1378
  with dc.transaction():
1064
1379
  # First, read the current insights table with emotion analysis
1065
- current_insights = dc.read("insights", namespace="analysis")
1380
+ current_insights = dc.read("insights")
1066
1381
 
1067
1382
  # Define a UDF to generate customer service responses based on analysis results.
1068
1383
  @daft.udf(return_dtype=daft.DataType.struct({
@@ -1109,39 +1424,39 @@ with dc.transaction():
1109
1424
  )
1110
1425
  # Merge new V3.0 responses into the existing V2.0 insights table.
1111
1426
  # The new response columns are automatically joined by document ID.
1112
- dc.write(daft_responses, "insights", namespace="analysis")
1427
+ dc.write(daft_responses, "insights")
1113
1428
  audit_df = pd.DataFrame([{"version": "v3.0", "docs_processed": dc.dataset_length(current_insights)}])
1114
- dc.write(audit_df, "audit", namespace="analysis")
1429
+ dc.write(audit_df, "audit")
1115
1430
 
1116
1431
  print("=== V3.0: Customer service response generation processing complete! ===")
1117
1432
 
1118
1433
  print("\n=== Time Travel Comparison of all Versions ===")
1119
1434
  with dc.transaction(as_of=checkpoint_v1):
1120
1435
  print(f"== V1.0 Insights (sentiment) ==")
1121
- print(dc.read("insights", namespace="analysis").show())
1436
+ print(dc.read("insights").show())
1122
1437
  print(f"== V1.0 Audit ==")
1123
- print(dc.read("audit", namespace="analysis").show())
1438
+ print(dc.read("audit").show())
1124
1439
 
1125
1440
  with dc.transaction(as_of=checkpoint_v2):
1126
1441
  print(f"== V2.0 Insights (emotion) ==")
1127
- print(dc.read("insights", namespace="analysis").show())
1442
+ print(dc.read("insights").show())
1128
1443
  print(f"== V2.0 Audit ==")
1129
- print(dc.read("audit", namespace="analysis").show())
1444
+ print(dc.read("audit").show())
1130
1445
 
1131
- v3_results = dc.read("insights", namespace="analysis")
1446
+ v3_results = dc.read("insights")
1132
1447
  print(f"== V3.0 Insights (customer service response) ==")
1133
- print(dc.read("insights", namespace="analysis").show())
1448
+ print(dc.read("insights").show())
1134
1449
  print(f"== V3.0 Audit ==")
1135
- print(dc.read("audit", namespace="analysis").show())
1450
+ print(dc.read("audit").show())
1136
1451
  ```
1137
1452
 
1138
1453
  </details>
1139
1454
 
1140
1455
  ## Runtime Environment Requirements
1141
1456
 
1142
- DeltaCAT's transaction system assumes that the host machine provides strong system clock accuracy guarantees, and that the filesystem hosting the catalog root directory offers strong consistency.
1457
+ DeltaCAT's transaction system assumes that the host machine provides strong system clock accuracy guarantees, and that the filesystem hosting the catalog root directory offers strong read-after-write consistency.
1143
1458
 
1144
- Taken together, these requirements make DeltaCAT suitable for production use on most major cloud computing hosts (e.g., EC2, GCE, Azure VMs) and storage systems (e.g., S3, GCS, Azure Blob Storage), but local laptops should typically be limited to testing/experimental purposes.
1459
+ Taken together, these requirements make DeltaCAT suitable for production use on most major cloud computing hosts (e.g., EC2, GCE, Azure VMs) and storage systems (e.g., S3, GCS, Azure Blob Storage), but local laptops should typically be limited to testing/experimental purposes (e.g., due to potential system clock drift).
1145
1460
 
1146
1461
  ## Additional Resources
1147
1462
  ### Table Documentation