deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,38 +1,42 @@
1
- from typing import Any, Callable, Dict, List, Optional, Set, Union
2
-
3
- import pyarrow as pa
1
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
4
2
 
5
3
  from deltacat.storage import (
6
- DeleteParameters,
4
+ EntryParams,
5
+ EntryType,
7
6
  Delta,
8
7
  DeltaLocator,
8
+ DeltaProperties,
9
9
  DeltaType,
10
10
  DistributedDataset,
11
11
  LifecycleState,
12
12
  ListResult,
13
13
  LocalDataset,
14
14
  LocalTable,
15
- Manifest,
16
15
  ManifestAuthor,
17
16
  Namespace,
17
+ NamespaceProperties,
18
18
  Partition,
19
- SchemaConsistencyType,
19
+ PartitionLocator,
20
+ PartitionScheme,
21
+ PartitionValues,
22
+ Schema,
23
+ SortScheme,
20
24
  Stream,
25
+ StreamFormat,
21
26
  StreamLocator,
22
27
  Table,
28
+ TableProperties,
23
29
  TableVersion,
24
- SortKey,
25
- PartitionLocator,
26
- PartitionFilter,
27
- PartitionValues,
28
- DeltaPartitionSpec,
29
- StreamPartitionSpec,
30
+ TableVersionLocator,
31
+ TableVersionProperties,
30
32
  )
33
+ from deltacat.storage.model.manifest import Manifest
34
+ from deltacat.storage.model.partition import UNKNOWN_PARTITION_ID
31
35
  from deltacat.types.media import (
32
36
  ContentType,
33
- StorageType,
34
- TableType,
35
37
  DistributedDatasetType,
38
+ StorageType,
39
+ DatasetType,
36
40
  )
37
41
  from deltacat.utils.common import ReadKwargsProvider
38
42
 
@@ -64,12 +68,26 @@ def list_table_versions(
64
68
  raise NotImplementedError("list_table_versions not implemented")
65
69
 
66
70
 
71
+ def list_streams(
72
+ namespace: str,
73
+ table_name: str,
74
+ table_version: str,
75
+ *args,
76
+ **kwargs,
77
+ ) -> ListResult[Stream]:
78
+ """
79
+ Lists a page of streams for the given table version.
80
+ Raises an error if the table version does not exist.
81
+ """
82
+ raise NotImplementedError("list_streams not implemented")
83
+
84
+
67
85
  def list_partitions(
68
86
  namespace: str,
69
87
  table_name: str,
70
88
  table_version: Optional[str] = None,
71
89
  *args,
72
- **kwargs
90
+ **kwargs,
73
91
  ) -> ListResult[Partition]:
74
92
  """
75
93
  Lists a page of partitions for the given table version. Partitions are
@@ -96,9 +114,9 @@ def list_deltas(
96
114
  last_stream_position: Optional[int] = None,
97
115
  ascending_order: Optional[bool] = None,
98
116
  include_manifest: bool = False,
99
- partition_filter: Optional[PartitionFilter] = None,
117
+ partition_scheme_id: Optional[str] = None,
100
118
  *args,
101
- **kwargs
119
+ **kwargs,
102
120
  ) -> ListResult[Delta]:
103
121
  """
104
122
  Lists a page of deltas for the given table version and committed partition.
@@ -106,15 +124,13 @@ def list_deltas(
106
124
  limited to inclusive first and last stream positions. Deltas are returned by
107
125
  descending stream position by default. Table version resolves to the latest
108
126
  active table version if not specified. Partition values should not be
109
- specified for unpartitioned tables. Raises an error if the given table
110
- version or partition does not exist.
127
+ specified for unpartitioned tables. Partition scheme ID resolves to the
128
+ table version's current partition scheme by default. Raises an error if the
129
+ given table version or partition does not exist.
111
130
 
112
131
  To conserve memory, the deltas returned do not include manifests by
113
132
  default. The manifests can either be optionally retrieved as part of this
114
133
  call or lazily loaded via subsequent calls to `get_delta_manifest`.
115
-
116
- Note: partition_values is deprecated and will be removed in future releases.
117
- Use partition_filter instead.
118
134
  """
119
135
  raise NotImplementedError("list_deltas not implemented")
120
136
 
@@ -126,7 +142,7 @@ def list_partition_deltas(
126
142
  ascending_order: bool = False,
127
143
  include_manifest: bool = False,
128
144
  *args,
129
- **kwargs
145
+ **kwargs,
130
146
  ) -> ListResult[Delta]:
131
147
  """
132
148
  Lists a page of deltas committed to the given partition.
@@ -145,22 +161,21 @@ def get_delta(
145
161
  partition_values: Optional[PartitionValues] = None,
146
162
  table_version: Optional[str] = None,
147
163
  include_manifest: bool = False,
148
- partition_filter: Optional[PartitionFilter] = None,
164
+ partition_scheme_id: Optional[str] = None,
149
165
  *args,
150
- **kwargs
166
+ **kwargs,
151
167
  ) -> Optional[Delta]:
152
168
  """
153
169
  Gets the delta for the given table version, partition, and stream position.
154
170
  Table version resolves to the latest active table version if not specified.
155
- Partition values should not be specified for unpartitioned tables. Raises
156
- an error if the given table version or partition does not exist.
171
+ Partition values should not be specified for unpartitioned tables. Partition
172
+ scheme ID resolves to the table version's current partition scheme by
173
+ default. Raises an error if the given table version or partition does not
174
+ exist.
157
175
 
158
176
  To conserve memory, the delta returned does not include a manifest by
159
177
  default. The manifest can either be optionally retrieved as part of this
160
178
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
161
-
162
- Note: partition_values is deprecated and will be removed in future releases.
163
- Use partition_filter instead.
164
179
  """
165
180
  raise NotImplementedError("get_delta not implemented")
166
181
 
@@ -171,50 +186,43 @@ def get_latest_delta(
171
186
  partition_values: Optional[PartitionValues] = None,
172
187
  table_version: Optional[str] = None,
173
188
  include_manifest: bool = False,
174
- partition_filter: Optional[PartitionFilter] = None,
189
+ partition_scheme_id: Optional[str] = None,
175
190
  *args,
176
- **kwargs
191
+ **kwargs,
177
192
  ) -> Optional[Delta]:
178
193
  """
179
194
  Gets the latest delta (i.e. the delta with the greatest stream position) for
180
195
  the given table version and partition. Table version resolves to the latest
181
196
  active table version if not specified. Partition values should not be
182
- specified for unpartitioned tables. Raises an error if the given table
183
- version or partition does not exist.
197
+ specified for unpartitioned tables. Partition scheme ID resolves to the
198
+ table version's current partition scheme by default. Raises an error if the
199
+ given table version or partition does not exist.
184
200
 
185
201
  To conserve memory, the delta returned does not include a manifest by
186
202
  default. The manifest can either be optionally retrieved as part of this
187
203
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
188
-
189
- Note: partition_values is deprecated and will be removed in future releases.
190
- Use partition_filter instead.
191
204
  """
192
205
  raise NotImplementedError("get_latest_delta not implemented")
193
206
 
194
207
 
195
208
  def download_delta(
196
209
  delta_like: Union[Delta, DeltaLocator],
197
- table_type: TableType = TableType.PYARROW,
210
+ table_type: DatasetType = DatasetType.PYARROW,
198
211
  storage_type: StorageType = StorageType.DISTRIBUTED,
199
212
  max_parallelism: Optional[int] = None,
200
213
  columns: Optional[List[str]] = None,
201
214
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
202
215
  ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
203
216
  distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
204
- partition_filter: Optional[PartitionFilter] = None,
205
217
  *args,
206
- **kwargs
218
+ **kwargs,
207
219
  ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
208
220
  """
209
- Download the given delta or delta locator into either a list of
221
+ Reads the given delta or delta locator into either a list of
210
222
  tables resident in the local node's memory, or into a dataset distributed
211
223
  across this Ray cluster's object store memory. Ordered table N of a local
212
224
  table list, or ordered block N of a distributed dataset, always contain
213
225
  the contents of ordered delta manifest entry N.
214
-
215
- partition_filter is an optional parameter which determines which files to
216
- download from the delta manifest. A delta manifest contains all the data files
217
- for a given delta.
218
226
  """
219
227
  raise NotImplementedError("download_delta not implemented")
220
228
 
@@ -222,19 +230,19 @@ def download_delta(
222
230
  def download_delta_manifest_entry(
223
231
  delta_like: Union[Delta, DeltaLocator],
224
232
  entry_index: int,
225
- table_type: TableType = TableType.PYARROW,
233
+ table_type: DatasetType = DatasetType.PYARROW,
226
234
  columns: Optional[List[str]] = None,
227
235
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
228
236
  *args,
229
- **kwargs
237
+ **kwargs,
230
238
  ) -> LocalTable:
231
239
  """
232
- Downloads a single manifest entry into the specified table type for the
240
+ Reads a single manifest entry into the specified table type for the
233
241
  given delta or delta locator. If a delta is provided with a non-empty
234
- manifest, then the entry is downloaded from this manifest. Otherwise, the
235
- manifest is first retrieved then the given entry index downloaded.
242
+ manifest, then the entry is read from this manifest. Otherwise, the
243
+ manifest is first retrieved then the given entry index read.
236
244
 
237
- NOTE: The entry will be downloaded in the current node's memory.
245
+ NOTE: The entry will be read in the current node's memory.
238
246
  """
239
247
  raise NotImplementedError("download_delta_manifest_entry not implemented")
240
248
 
@@ -244,17 +252,21 @@ def get_delta_manifest(
244
252
  ) -> Manifest:
245
253
  """
246
254
  Get the manifest associated with the given delta or delta locator. This
247
- always retrieves the authoritative remote copy of the delta manifest, and
248
- never the local manifest defined for any input delta.
255
+ always retrieves the authoritative durable copy of the delta manifest, and
256
+ never the local manifest defined for any input delta. Raises an error if
257
+ the delta can't be found, or if it doesn't contain a manifest.
249
258
  """
250
259
  raise NotImplementedError("get_delta_manifest not implemented")
251
260
 
252
261
 
253
262
  def create_namespace(
254
- namespace: str, permissions: Dict[str, Any], *args, **kwargs
263
+ namespace: str,
264
+ properties: Optional[NamespaceProperties] = None,
265
+ *args,
266
+ **kwargs,
255
267
  ) -> Namespace:
256
268
  """
257
- Creates a table namespace with the given name and permissions. Returns
269
+ Creates a table namespace with the given name and properties. Returns
258
270
  the created namespace.
259
271
  """
260
272
  raise NotImplementedError("create_namespace not implemented")
@@ -262,13 +274,13 @@ def create_namespace(
262
274
 
263
275
  def update_namespace(
264
276
  namespace: str,
265
- permissions: Optional[Dict[str, Any]] = None,
277
+ properties: Optional[NamespaceProperties] = None,
266
278
  new_namespace: Optional[str] = None,
267
279
  *args,
268
- **kwargs
280
+ **kwargs,
269
281
  ) -> None:
270
282
  """
271
- Updates a table namespace's name and/or permissions. Raises an error if the
283
+ Updates a table namespace's name and/or properties. Raises an error if the
272
284
  given namespace does not exist.
273
285
  """
274
286
  raise NotImplementedError("update_namespace not implemented")
@@ -278,71 +290,60 @@ def create_table_version(
278
290
  namespace: str,
279
291
  table_name: str,
280
292
  table_version: Optional[str] = None,
281
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
282
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
283
- partition_keys: Optional[List[Dict[str, Any]]] = None,
284
- primary_key_column_names: Optional[Set[str]] = None,
285
- sort_keys: Optional[List[SortKey]] = None,
293
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
294
+ schema: Optional[Schema] = None,
295
+ partition_scheme: Optional[PartitionScheme] = None,
296
+ sort_keys: Optional[SortScheme] = None,
286
297
  table_version_description: Optional[str] = None,
287
- table_version_properties: Optional[Dict[str, str]] = None,
288
- table_permissions: Optional[Dict[str, Any]] = None,
298
+ table_version_properties: Optional[TableVersionProperties] = None,
289
299
  table_description: Optional[str] = None,
290
- table_properties: Optional[Dict[str, str]] = None,
300
+ table_properties: Optional[TableProperties] = None,
291
301
  supported_content_types: Optional[List[ContentType]] = None,
292
- partition_spec: Optional[StreamPartitionSpec] = None,
293
302
  *args,
294
- **kwargs
295
- ) -> Stream:
303
+ **kwargs,
304
+ ) -> Tuple[Table, TableVersion, Stream]:
296
305
  """
297
- Create a table version with an unreleased lifecycle state and an empty delta
298
- stream. Table versions may be schemaless and unpartitioned, or partitioned
299
- according to a list of partition key names and types. Note that partition
300
- keys are not required to exist in the table's schema, and can thus still be
301
- used with schemaless tables. This can be useful for creating logical shards
302
- of a delta stream where partition keys are known but not projected onto each
303
- row of the table (e.g. all rows of a customer orders table are known to
304
- correspond to a given order day, even if this column doesn't exist in the
305
- table). Primary and sort keys must exist within the table's schema.
306
- Permissions specified at the table level override any conflicting
307
- permissions specified at the table namespace level. Returns the stream
308
- for the created table version. Raises an error if the given namespace does
309
- not exist.
310
-
311
- Schemas are optional for DeltaCAT tables and can be used to inform the data
312
- consistency checks run for each field. If a schema is present, it can be
313
- used to enforce the following column-level data consistency policies at
314
- table load time:
306
+ Create a table version with the given or CREATED lifecycle state and an empty delta
307
+ stream. Table versions may be schemaless and unpartitioned to improve write
308
+ performance, or have their writes governed by a schema and partition scheme
309
+ to improve data consistency and read performance.
315
310
 
316
- None: No consistency checks are run. May be mixed with the below two
317
- policies by specifying column names to pass through together with
318
- column names to coerce/validate.
311
+ Returns a tuple containing the created/updated table, table version, and
312
+ stream (respectively).
319
313
 
320
- Coerce: Coerce fields to fit the schema whenever possible. An explicit
321
- subset of column names to coerce may optionally be specified.
314
+ Raises an error if the given namespace does not exist.
315
+ """
316
+ raise NotImplementedError("create_table_version not implemented")
322
317
 
323
- Validate: Raise an error for any fields that don't fit the schema. An
324
- explicit subset of column names to validate may optionally be specified.
325
318
 
326
- Either partition_keys or partition_spec must be specified but not both.
319
+ def create_table(
320
+ namespace: str,
321
+ table_name: str,
322
+ description: Optional[str] = None,
323
+ properties: Optional[TableProperties] = None,
324
+ *args,
325
+ **kwargs,
326
+ ) -> Table:
327
327
  """
328
- raise NotImplementedError("create_table_version not implemented")
328
+ Create a new table. Raises an error if the given table already exists.
329
+ """
330
+ raise NotImplementedError("create_table not implemented")
329
331
 
330
332
 
331
333
  def update_table(
332
334
  namespace: str,
333
335
  table_name: str,
334
- permissions: Optional[Dict[str, Any]] = None,
335
336
  description: Optional[str] = None,
336
- properties: Optional[Dict[str, str]] = None,
337
+ properties: Optional[TableProperties] = None,
337
338
  new_table_name: Optional[str] = None,
338
339
  *args,
339
- **kwargs
340
- ) -> None:
340
+ **kwargs,
341
+ ) -> Table:
341
342
  """
342
343
  Update table metadata describing the table versions it contains. By default,
343
- a table's properties are empty, and its description and permissions are
344
- equal to those given when its first table version was created. Raises an
345
- error if the given table does not exist.
344
+ a table's properties are empty, and its description is equal to that given
345
+ when its first table version was created. Raises an error if the given
346
+ table does not exist.
346
347
  """
347
348
  raise NotImplementedError("update_table not implemented")
348
349
 
@@ -352,13 +353,15 @@ def update_table_version(
352
353
  table_name: str,
353
354
  table_version: str,
354
355
  lifecycle_state: Optional[LifecycleState] = None,
355
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
356
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
356
+ schema: Optional[Schema] = None,
357
357
  description: Optional[str] = None,
358
- properties: Optional[Dict[str, str]] = None,
358
+ properties: Optional[TableVersionProperties] = None,
359
+ partition_scheme: Optional[PartitionScheme] = None,
360
+ # TODO(pdames): rename to `sort_scheme`
361
+ sort_keys: Optional[SortScheme] = None,
359
362
  *args,
360
- **kwargs
361
- ) -> None:
363
+ **kwargs,
364
+ ) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
362
365
  """
363
366
  Update a table version. Notably, updating an unreleased table version's
364
367
  lifecycle state to 'active' telegraphs that it is ready for external
@@ -375,18 +378,27 @@ def stage_stream(
375
378
  namespace: str,
376
379
  table_name: str,
377
380
  table_version: Optional[str] = None,
381
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
378
382
  *args,
379
- **kwargs
383
+ **kwargs,
380
384
  ) -> Stream:
381
385
  """
382
386
  Stages a new delta stream for the given table version. Resolves to the
383
- latest active table version if no table version is given. Returns the
384
- staged stream. Raises an error if the table version does not exist.
387
+ latest active table version if no table version is given. Resolves to the
388
+ DeltaCAT stream format if no stream format is given. If this stream
389
+ will replace another stream with the same format and scheme, then it will
390
+ have its previous stream ID set to the ID of the stream being replaced.
391
+ Returns the staged stream. Raises an error if the table version does not
392
+ exist.
385
393
  """
386
394
  raise NotImplementedError("stage_stream not implemented")
387
395
 
388
396
 
389
- def commit_stream(stream: Stream, *args, **kwargs) -> Stream:
397
+ def commit_stream(
398
+ stream: Stream,
399
+ *args,
400
+ **kwargs,
401
+ ) -> Stream:
390
402
  """
391
403
  Registers a delta stream with a target table version, replacing any
392
404
  previous stream registered for the same table version. Returns the
@@ -399,43 +411,111 @@ def delete_stream(
399
411
  namespace: str,
400
412
  table_name: str,
401
413
  table_version: Optional[str] = None,
414
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
402
415
  *args,
403
- **kwargs
416
+ **kwargs,
404
417
  ) -> None:
405
418
  """
406
419
  Deletes the delta stream currently registered with the given table version.
407
420
  Resolves to the latest active table version if no table version is given.
408
- Raises an error if the table version does not exist.
421
+ Resolves to the DeltaCAT stream format if no stream format is given.
422
+ Raises an error if the stream does not exist.
409
423
  """
410
424
  raise NotImplementedError("delete_stream not implemented")
411
425
 
412
426
 
427
+ def delete_table(
428
+ namespace: str,
429
+ table_name: str,
430
+ purge: bool = False,
431
+ *args,
432
+ **kwargs,
433
+ ) -> None:
434
+ """
435
+ Drops the given table from the catalog. If purge is True, also removes
436
+ all data files associated with the table. Raises an error if the given table
437
+ does not exist.
438
+ """
439
+ raise NotImplementedError("delete_table not implemented")
440
+
441
+
442
+ def delete_namespace(
443
+ namespace: str,
444
+ purge: bool = False,
445
+ *args,
446
+ **kwargs,
447
+ ) -> None:
448
+ """
449
+ Drops the given namespace from the catalog. If purge is True, also removes
450
+ all data files associated with the namespace. Raises an error if the given
451
+ namespace does not exist.
452
+ """
453
+ raise NotImplementedError("drop_namespace not implemented")
454
+
455
+
456
+ def get_stream_by_id(
457
+ table_version_locator: TableVersionLocator,
458
+ stream_id: str,
459
+ *args,
460
+ **kwargs,
461
+ ) -> Optional[Partition]:
462
+ """
463
+ Gets the stream for the given table version locator and stream ID.
464
+ Returns None if the stream does not exist. Raises an error if the given
465
+ table version locator does not exist.
466
+ """
467
+ raise NotImplementedError("get_stream_by_id not implemented")
468
+
469
+
413
470
  def get_stream(
414
471
  namespace: str,
415
472
  table_name: str,
416
473
  table_version: Optional[str] = None,
474
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
417
475
  *args,
418
- **kwargs
476
+ **kwargs,
419
477
  ) -> Optional[Stream]:
420
478
  """
421
- Gets the most recently committed stream for the given table version and
422
- partition key values. Resolves to the latest active table version if no
423
- table version is given. Returns None if the table version does not exist.
479
+ Gets the most recently committed stream for the given table version.
480
+ Resolves to the latest active table version if no table version is given.
481
+ Resolves to the DeltaCAT stream format if no stream format is given.
482
+ Returns None if the table version or stream format does not exist.
424
483
  """
425
484
  raise NotImplementedError("get_stream not implemented")
426
485
 
427
486
 
487
+ def stream_exists(
488
+ namespace: str,
489
+ table_name: str,
490
+ table_version: Optional[str] = None,
491
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
492
+ *args,
493
+ **kwargs,
494
+ ) -> bool:
495
+ """
496
+ Returns True if the given Stream exists, False if not.
497
+ Resolves to the latest active table version if no table version is given.
498
+ Resolves to the DeltaCAT stream format if no stream format is given.
499
+ Returns False if the table version or stream format does not exist.
500
+ """
501
+ raise NotImplementedError("stream_exists not implemented")
502
+
503
+
428
504
  def stage_partition(
429
- stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
505
+ stream: Stream,
506
+ partition_values: Optional[PartitionValues] = None,
507
+ partition_scheme_id: Optional[str] = None,
508
+ *args,
509
+ **kwargs,
430
510
  ) -> Partition:
431
511
  """
432
512
  Stages a new partition for the given stream and partition values. Returns
433
513
  the staged partition. If this partition will replace another partition
434
- with the same partition values, then it will have its previous partition ID
435
- set to the ID of the partition being replaced. Partition keys should not be
436
- specified for unpartitioned tables.
514
+ with the same partition values and scheme, then it will have its previous
515
+ partition ID set to the ID of the partition being replaced. Partition values
516
+ should not be specified for unpartitioned tables.
437
517
 
438
- The partition_values must represents the results of transforms in a partition
518
+ The partition_values must represent the results of transforms in a partition
439
519
  spec specified in the stream.
440
520
  """
441
521
  raise NotImplementedError("stage_partition not implemented")
@@ -444,14 +524,20 @@ def stage_partition(
444
524
  def commit_partition(
445
525
  partition: Partition,
446
526
  previous_partition: Optional[Partition] = None,
527
+ expected_previous_partition_id: Optional[str] = UNKNOWN_PARTITION_ID,
447
528
  *args,
448
- **kwargs
529
+ **kwargs,
449
530
  ) -> Partition:
450
531
  """
451
- Commits the given partition to its associated table version stream,
452
- replacing any previous partition (i.e., "partition being replaced") registered for the same stream and
532
+ Commits the staged partition to its associated table version stream,
533
+ replacing any previous partition registered for the same stream and
453
534
  partition values.
454
- If the previous_partition is passed as an argument, the specified previous_partition will be the partition being replaced, otherwise it will be retrieved.
535
+
536
+ If previous partition is given then it will be replaced with its deltas
537
+ prepended to the new partition being committed. Otherwise the latest
538
+ committed partition with the same keys and partition scheme ID will be
539
+ retrieved.
540
+
455
541
  Returns the registered partition. If the partition's
456
542
  previous delta stream position is specified, then the commit will
457
543
  be rejected if it does not match the actual previous stream position of
@@ -463,33 +549,48 @@ def commit_partition(
463
549
 
464
550
 
465
551
  def delete_partition(
466
- namespace: str,
467
- table_name: str,
468
- table_version: Optional[str] = None,
552
+ stream_locator: StreamLocator,
469
553
  partition_values: Optional[PartitionValues] = None,
554
+ partition_scheme_id: Optional[str] = None,
470
555
  *args,
471
- **kwargs
556
+ **kwargs,
472
557
  ) -> None:
473
558
  """
474
- Deletes the given partition from the specified table version. Resolves to
475
- the latest active table version if no table version is given. Partition
559
+ Deletes the given partition from the specified stream. Partition
476
560
  values should not be specified for unpartitioned tables. Raises an error
477
- if the table version or partition does not exist.
561
+ if the partition does not exist.
478
562
  """
479
563
  raise NotImplementedError("delete_partition not implemented")
480
564
 
481
565
 
566
+ def get_partition_by_id(
567
+ stream_locator: StreamLocator,
568
+ partition_id: str,
569
+ *args,
570
+ **kwargs,
571
+ ) -> Optional[Partition]:
572
+ """
573
+ Gets the partition for the given stream locator and partition ID.
574
+ Returns None if the partition does not exist. Raises an error if the
575
+ given stream locator does not exist.
576
+ """
577
+ raise NotImplementedError("get_partition_by_id not implemented")
578
+
579
+
482
580
  def get_partition(
483
581
  stream_locator: StreamLocator,
484
582
  partition_values: Optional[PartitionValues] = None,
583
+ partition_scheme_id: Optional[str] = None,
485
584
  *args,
486
- **kwargs
585
+ **kwargs,
487
586
  ) -> Optional[Partition]:
488
587
  """
489
588
  Gets the most recently committed partition for the given stream locator and
490
589
  partition key values. Returns None if no partition has been committed for
491
590
  the given table version and/or partition key values. Partition values
492
- should not be specified for unpartitioned tables.
591
+ should not be specified for unpartitioned tables. Partition scheme ID
592
+ resolves to the table version's current partition scheme by default.
593
+ Raises an error if the given stream locator does not exist.
493
594
  """
494
595
  raise NotImplementedError("get_partition not implemented")
495
596
 
@@ -500,26 +601,20 @@ def stage_delta(
500
601
  delta_type: DeltaType = DeltaType.UPSERT,
501
602
  max_records_per_entry: Optional[int] = None,
502
603
  author: Optional[ManifestAuthor] = None,
503
- properties: Optional[Dict[str, str]] = None,
504
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
604
+ properties: Optional[DeltaProperties] = None,
605
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
505
606
  content_type: ContentType = ContentType.PARQUET,
506
- delete_parameters: Optional[DeleteParameters] = None,
507
- partition_spec: Optional[DeltaPartitionSpec] = None,
508
- partition_values: Optional[PartitionValues] = None,
607
+ entry_params: Optional[EntryParams] = None,
608
+ entry_type: Optional[EntryType] = EntryType.DATA,
609
+ schema: Optional[Schema] = None,
610
+ sort_scheme_id: Optional[str] = None,
509
611
  *args,
510
- **kwargs
612
+ **kwargs,
511
613
  ) -> Delta:
512
614
  """
513
- Writes the given table to 1 or more S3 files. Returns an unregistered
615
+ Writes the given dataset to 1 or more files. Returns an unregistered
514
616
  delta whose manifest entries point to the uploaded files. Applies any
515
617
  schema consistency policies configured for the parent table version.
516
-
517
- The partition spec will be used to split the input table into
518
- multiple files. Optionally, partition_values can be provided to avoid
519
- this method to recompute partition_values from the provided data.
520
-
521
- Raises an error if the provided data does not conform to a unique ordered
522
- list of partition_values
523
618
  """
524
619
  raise NotImplementedError("stage_delta not implemented")
525
620
 
@@ -601,7 +696,7 @@ def get_table_version_column_names(
601
696
  table_name: str,
602
697
  table_version: Optional[str] = None,
603
698
  *args,
604
- **kwargs
699
+ **kwargs,
605
700
  ) -> Optional[List[str]]:
606
701
  """
607
702
  Gets a list of column names for the specified table version, or for the
@@ -619,8 +714,8 @@ def get_table_version_schema(
619
714
  table_name: str,
620
715
  table_version: Optional[str] = None,
621
716
  *args,
622
- **kwargs
623
- ) -> Optional[Union[pa.Schema, str, bytes]]:
717
+ **kwargs,
718
+ ) -> Optional[Schema]:
624
719
  """
625
720
  Gets the schema for the specified table version, or for the latest active
626
721
  table version if none is specified. Returns None if the table version is
@@ -640,13 +735,23 @@ def table_version_exists(
640
735
 
641
736
  def can_categorize(e: BaseException, *args, **kwargs) -> bool:
642
737
  """
643
- Return whether input error is from storage implementation layer.
738
+ True if the input error originated from the storage
739
+ implementation layer and can be categorized under an
740
+ existing DeltaCatError. The "categorize_errors" decorator
741
+ uses this to determine if an unknown error from the storage
742
+ implementation can be categorized prior to casting it to
743
+ the equivalent DeltaCatError via `raise_categorized_error`.
644
744
  """
645
745
  raise NotImplementedError
646
746
 
647
747
 
648
748
  def raise_categorized_error(e: BaseException, *args, **kwargs):
649
749
  """
650
- Raise and handle storage implementation layer specific errors.
750
+ Casts a categorizable error that originated from the storage
751
+ implementation layer to its equivalent DeltaCatError
752
+ for uniform handling (e.g., determining whether an error
753
+ is retryable or not) via the "categorize_errors" decorator.
754
+ Raises an UnclassifiedDeltaCatError from the input exception
755
+ if the error cannot be categorized.
651
756
  """
652
757
  raise NotImplementedError