deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
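One notable addition in this release is the transactional metafile model in deltacat/storage/model/metafile.py, whose diff follows. As a rough orientation only (not code shipped in the wheel), a hypothetical round trip through the Metafile read/write API shown below might look like this sketch; the Namespace and NamespaceLocator constructors, the local path, and the file name are illustrative assumptions:

    # Illustrative sketch based on the Metafile signatures shown in the diff below;
    # not part of the packaged deltacat 2.0.0 sources.
    import pyarrow.fs
    from deltacat.storage.model.namespace import Namespace, NamespaceLocator

    # Namespace is one of the Metafile subclasses dispatched by Metafile.get_class().
    namespace = Namespace.of(NamespaceLocator.of("my_namespace"))

    # write() serializes the dict-based model (msgpack by default, per the class
    # docstring) and creates the parent directory before writing.
    fs = pyarrow.fs.LocalFileSystem()
    namespace.write("/tmp/catalog/example_namespace_metafile", filesystem=fs)

    # read() deserializes the bytes and instantiates the matching Metafile subclass.
    restored = Namespace.read("/tmp/catalog/example_namespace_metafile", filesystem=fs)
    assert restored.equivalent_to(namespace)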
@@ -0,0 +1,1421 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
+
+ import copy
+
+ from typing import Optional, Tuple, List, Union, Set
+
+ import base64
+ import json
+ import msgpack
+ import pyarrow.fs
+ import posixpath
+ import uuid
+ import deltacat
+
+ from deltacat.constants import (
+     METAFILE_FORMAT,
+     REVISION_DIR_NAME,
+     METAFILE_EXT,
+     SUPPORTED_METAFILE_FORMATS,
+     TXN_DIR_NAME,
+     TXN_PART_SEPARATOR,
+     SUCCESS_TXN_DIR_NAME,
+ )
+ from deltacat.exceptions import (
+     ObjectNotFoundError,
+     ObjectDeletedError,
+     ObjectAlreadyExistsError,
+     ConcurrentModificationError,
+ )
+ from deltacat.storage.model.list_result import ListResult
+ from deltacat.storage.model.locator import Locator
+ from deltacat.storage.model.types import TransactionOperationType
+ from deltacat.utils.filesystem import (
+     resolve_path_and_filesystem,
+     list_directory,
+     get_file_info,
+ )
+
+
+ class MetafileRevisionInfo(dict):
+     """
+     Base class for DeltaCAT metafile revision info.
+     """
+
+     @staticmethod
+     def undefined() -> MetafileRevisionInfo:
+         mri = MetafileRevisionInfo()
+         mri.revision = 0
+         mri.txn_id = None
+         mri.txn_op_type = None
+         mri.dir_path = None
+         return mri
+
+     @staticmethod
+     def parse(revision_file_path: str) -> MetafileRevisionInfo:
+         dir_path = posixpath.dirname(revision_file_path)
+         metafile_name = posixpath.basename(revision_file_path)
+         metafile_and_ext = posixpath.splitext(metafile_name)
+         metafile_ext = metafile_and_ext[1] if len(metafile_and_ext) > 1 else None
+         metafile_rev_and_txn_info = metafile_and_ext[0]
+         txn_info_parts = metafile_rev_and_txn_info.split(TXN_PART_SEPARATOR)
+
+         mri = MetafileRevisionInfo()
+         mri.dir_path = dir_path
+         mri.extension = metafile_ext
+         mri.revision = int(txn_info_parts[0])
+         mri.txn_op_type = txn_info_parts[1]
+         mri.txn_id = f"{txn_info_parts[2]}{TXN_PART_SEPARATOR}{txn_info_parts[3]}"
+         return mri
+
+     @staticmethod
+     def list_revisions(
+         revision_dir_path: str,
+         filesystem: pyarrow.fs.FileSystem,
+         success_txn_log_dir: str,
+         current_txn_start_time: Optional[int] = None,
+         current_txn_id: Optional[str] = None,
+         limit: Optional[int] = None,
+     ) -> List[MetafileRevisionInfo]:
+         if not success_txn_log_dir:
+             err_msg = f"No transaction log found for: {revision_dir_path}."
+             raise ObjectNotFoundError(err_msg)
+         # find the latest committed revision of the target metafile
+         sorted_metafile_paths = MetafileRevisionInfo._sorted_file_paths(
+             revision_dir_path=revision_dir_path,
+             filesystem=filesystem,
+             ignore_missing_revision=True,
+         )
+         revisions = []
+         while sorted_metafile_paths:
+             latest_metafile_path = sorted_metafile_paths.pop()
+             mri = MetafileRevisionInfo.parse(latest_metafile_path)
+             if not current_txn_id or mri.txn_id == current_txn_id:
+                 # consider the current transaction (if any) to be committed
+                 revisions.append(mri)
+             elif current_txn_start_time is not None:
+                 # the current transaction can only build on top of the snapshot
+                 # of commits from transactions that completed before it started
+                 txn_end_time = (
+                     deltacat.storage.model.transaction.Transaction.read_end_time(
+                         path=posixpath.join(success_txn_log_dir, mri.txn_id),
+                         filesystem=filesystem,
+                     )
+                 )
+                 if txn_end_time is not None and txn_end_time < current_txn_start_time:
+                     revisions.append(mri)
+             else:
+                 raise ValueError(
+                     f"Current transaction ID `{current_txn_id}` provided "
+                     f"without a transaction start time."
+                 )
+             if limit and limit <= len(revisions):
+                 break
+         return revisions
+
+     @staticmethod
+     def latest_revision(
+         revision_dir_path: str,
+         filesystem: pyarrow.fs.FileSystem,
+         success_txn_log_dir: str,
+         current_txn_start_time: Optional[int] = None,
+         current_txn_id: Optional[str] = None,
+         ignore_missing_revision: bool = False,
+     ) -> MetafileRevisionInfo:
+         """
+         Fetch the latest revision of a metafile, or return
+         MetafileRevisionInfo.undefined() if no revisions exist and
+         ignore_missing_revision is True.
+         :param revision_dir_path: root path of directory for metafile
+         :param ignore_missing_revision: if True, will return
+         MetafileRevisionInfo.undefined() on no revisions
+         :raises ObjectNotFoundError: if no revisions are found AND
+         ignore_missing_revision=False
+         """
+         revisions = MetafileRevisionInfo.list_revisions(
+             revision_dir_path=revision_dir_path,
+             filesystem=filesystem,
+             success_txn_log_dir=success_txn_log_dir,
+             current_txn_start_time=current_txn_start_time,
+             current_txn_id=current_txn_id,
+             limit=1,
+         )
+         if not revisions and not ignore_missing_revision:
+             err_msg = f"No committed revision found at {revision_dir_path}."
+             raise ObjectNotFoundError(err_msg)
+         return revisions[0] if revisions else MetafileRevisionInfo.undefined()
+
+     @staticmethod
+     def new_revision(
+         revision_dir_path: str,
+         current_txn_op_type: deltacat.storage.model.transaction.TransactionOperationType,
+         current_txn_start_time: int,
+         current_txn_id: str,
+         filesystem: pyarrow.fs.FileSystem,
+         extension: Optional[str] = METAFILE_EXT,
+         success_txn_log_dir: Optional[str] = None,
+     ) -> MetafileRevisionInfo:
+         """
+         Creates and returns a new MetafileRevisionInfo object for the next
+         revision of the metafile.
+
+         This method determines the next revision information based on the
+         latest existing revision in the specified directory path and the
+         current transaction details.
+
+         Args:
+             revision_dir_path (str): Metafile revision directory path to
+                 generate the next metafile revision info for.
+             current_txn_op_type (TransactionOperationType): The current
+                 transaction's operation type.
+             current_txn_start_time (int): The current transaction's start time.
+             current_txn_id (str): The current transaction's ID.
+             filesystem (pyarrow.fs.FileSystem): The filesystem interface to
+                 use for file operations.
+             extension (str, optional): The file extension for metafiles.
+                 Defaults to METAFILE_EXT.
+             success_txn_log_dir (Optional[str], optional): Directory path for
+                 successful transaction logs. Will be automatically discovered by
+                 traversing revision directory parent paths if not specified.
+
+         Returns:
+             MetafileRevisionInfo: A new revision info object containing
+                 metadata for the next revision.
+
+         Notes:
+             - For CREATE operations, the method will ignore missing previous
+               revisions.
+             - The method validates the transaction operation type before
+               creating the new revision.
+             - Uses the pyarrow filesystem interface for file operations.
+         """
+         is_create_txn = current_txn_op_type == TransactionOperationType.CREATE
+         mri = MetafileRevisionInfo.latest_revision(
+             revision_dir_path=revision_dir_path,
+             filesystem=filesystem,
+             success_txn_log_dir=success_txn_log_dir,
+             current_txn_start_time=current_txn_start_time,
+             current_txn_id=current_txn_id,
+             ignore_missing_revision=is_create_txn,
+         )
+         # validate the transaction operation type
+         if mri.exists():
+             # update/delete fails if the last metafile was deleted
+             if mri.txn_op_type == TransactionOperationType.DELETE:
+                 if current_txn_op_type != TransactionOperationType.CREATE:
+                     raise ObjectDeletedError(
+                         f"Metafile {current_txn_op_type.value} failed "
+                         f"for transaction ID {current_txn_id}. "
+                         f"Metafile state at {mri.path} is deleted."
+                     )
+             # create fails unless the last metafile was deleted
+             elif is_create_txn:
+                 raise ObjectAlreadyExistsError(
+                     f"Metafile creation for transaction ID {current_txn_id} "
+                     f"failed. Metafile commit at {mri.path} already exists."
+                 )
+         elif not is_create_txn:
+             # update/delete fails if the last metafile doesn't exist
+             raise ObjectNotFoundError(
+                 f"Metafile {current_txn_op_type.value} failed for "
+                 f"transaction ID {current_txn_id}. Metafile at "
+                 f"{mri.path} does not exist."
+             )
+         mri.revision = mri.revision + 1
+         mri.txn_id = current_txn_id
+         mri.txn_op_type = current_txn_op_type
+         mri.dir_path = revision_dir_path
+         mri.extension = extension
+         return mri
+
+     @staticmethod
+     def check_for_concurrent_txn_conflict(
+         success_txn_log_dir: str,
+         current_txn_revision_file_path: str,
+         filesystem: pyarrow.fs.FileSystem,
+     ) -> None:
+         """
+         Checks for a concurrent modification conflict between a file committed
+         by the current transaction and another parallel transaction. Raises
+         an exception if a concurrent modification conflict is found.
+
+         :param success_txn_log_dir: Path to the log of successful transactions.
+         :param current_txn_revision_file_path: Path to a metafile revision
+         written by the current transaction to check for conflicts against.
+         :param filesystem: Filesystem that can read the metafile revision.
+         :raises ConcurrentModificationError: if a conflict is found with another transaction.
+         """
+         revision_dir_path = posixpath.dirname(current_txn_revision_file_path)
+         cur_txn_mri = MetafileRevisionInfo.parse(current_txn_revision_file_path)
+
+         sorted_metafile_paths = MetafileRevisionInfo._sorted_file_paths(
+             revision_dir_path=revision_dir_path,
+             filesystem=filesystem,
+         )
+         conflict_mris = []
+         while sorted_metafile_paths:
+             next_metafile_path = sorted_metafile_paths.pop()
+             mri = MetafileRevisionInfo.parse(next_metafile_path)
+             if mri.revision < cur_txn_mri.revision:
+                 # no conflict was found
+                 break
+             elif (
+                 mri.revision == cur_txn_mri.revision
+                 and mri.txn_id != cur_txn_mri.txn_id
+             ):
+                 # we've found a conflict between txn_id and current_txn_id
+                 # defer to the transaction with the higher lexicographic order
+                 # (i.e., the transaction that started most recently)
+                 # TODO(pdames): Ensure the conflicting transaction is alive
+                 # (e.g., give each transaction a heartbeat timeout that gives
+                 # it 1-2 seconds per operation, and record known failed
+                 # transaction IDs)
+                 if mri.txn_id > cur_txn_mri.txn_id:
+                     raise ConcurrentModificationError(
+                         f"Aborting transaction {cur_txn_mri.txn_id} due to "
+                         f"concurrent conflict at "
+                         f"{current_txn_revision_file_path} with transaction "
+                         f"{mri.txn_id} at {next_metafile_path}."
+                     )
+                 conflict_mris.append(mri)
+         if conflict_mris:
+             # current txn wins the ordering challenge among all conflicts,
+             # but we still need to ensure that no conflicting transactions
+             # completed before seeing the conflict with this transaction
+             for mri in conflict_mris:
+                 txn_end_time = (
+                     deltacat.storage.model.transaction.Transaction.read_end_time(
+                         path=posixpath.join(success_txn_log_dir, mri.txn_id),
+                         filesystem=filesystem,
+                     )
+                 )
+                 # TODO(pdames): Resolve risk of passing this check if it
+                 # runs before the conflicting transaction marks itself as
+                 # complete in the txn log. Some fixes include enforcing
+                 # serializable isolation of the txn log, eventually
+                 # consistent detection & repair, writing a mutex file
+                 # that tells future transactions to only consider this txn
+                 # complete if the conflicting txn is not complete, etc.
+                 if txn_end_time:
+                     raise ConcurrentModificationError(
+                         f"Aborting transaction {cur_txn_mri.txn_id} due to "
+                         f"concurrent conflict at {revision_dir_path} with "
+                         f"previously completed transaction {mri.txn_id} at "
+                         f"{next_metafile_path}."
+                     )
+
+     @staticmethod
+     def _sorted_file_paths(
+         revision_dir_path: str,
+         filesystem: pyarrow.fs.FileSystem,
+         ignore_missing_revision: bool = False,
+     ) -> List[str]:
+         file_paths_and_sizes = list_directory(
+             path=revision_dir_path,
+             filesystem=filesystem,
+             ignore_missing_path=True,
+         )
+         if not file_paths_and_sizes and not ignore_missing_revision:
+             err_msg = (
+                 f"Expected to find at least 1 Metafile at "
+                 f"{revision_dir_path} but found none."
+             )
+             raise ObjectNotFoundError(err_msg)
+         return list(list(zip(*file_paths_and_sizes))[0]) if file_paths_and_sizes else []
+
+     @property
+     def revision(self) -> int:
+         return self["revision"]
+
+     @revision.setter
+     def revision(self, revision: int):
+         self["revision"] = revision
+
+     @property
+     def txn_id(self) -> Optional[str]:
+         return self["txn_id"]
+
+     @txn_id.setter
+     def txn_id(self, txn_id: str):
+         self["txn_id"] = txn_id
+
+     @property
+     def txn_op_type(self) -> Optional[TransactionOperationType]:
+         op_type = self.get("txn_op_type")
+         return None if op_type is None else TransactionOperationType(op_type)
+
+     @txn_op_type.setter
+     def txn_op_type(self, txn_op_type: TransactionOperationType):
+         self["txn_op_type"] = txn_op_type
+
+     @property
+     def dir_path(self) -> Optional[str]:
+         return self["dir_path"]
+
+     @dir_path.setter
+     def dir_path(self, dir_path: str):
+         self["dir_path"] = dir_path
+
+     @property
+     def extension(self) -> str:
+         return self.get("extension") or METAFILE_EXT
+
+     @extension.setter
+     def extension(self, extension: str):
+         self["extension"] = extension
+
+     @property
+     def file_name(self) -> Optional[str]:
+         return (
+             TXN_PART_SEPARATOR.join(
+                 [
+                     f"{self.revision:020}",
+                     self.txn_op_type,
+                     f"{self.txn_id}{self.extension}",
+                 ]
+             )
+             if self.txn_op_type and self.txn_id
+             else None
+         )
+
+     @property
+     def path(self) -> Optional[str]:
+         file_name = self.file_name
+         return (
+             posixpath.join(
+                 self.dir_path,
+                 file_name,
+             )
+             if self.dir_path and file_name
+             else None
+         )
+
+     def exists(self) -> bool:
+         return bool(self.revision)
+
+
+ class Metafile(dict):
+     """
+     Base class for DeltaCAT metadata files, with read and write methods
+     for dict-based DeltaCAT models. Uses msgpack (https://msgpack.org/) for
+     cross-language-compatible serialization and deserialization.
+     """
+
+     @staticmethod
+     def update_for(other: Optional[Metafile]) -> Optional[Metafile]:
+         """
+         Returns a new metafile that can be used as the destination metafile
+         in an update transaction operation against the input source metafile.
+         The returned metafile starts as an identical deep copy of the input
+         metafile such that, if the output is changed and committed as part of
+         an update transaction operation on the source metafile, then it will
+         update instead of replace the source metafile.
+         :param other: Source metafile for the copy.
+         :return: New copy of the source metafile.
+         """
+         return copy.deepcopy(other) if other is not None else None
+
+     @staticmethod
+     def based_on(
+         other: Optional[Metafile],
+         new_id: Optional[str] = None,
+     ) -> Optional[Metafile]:
+         """
+         Returns a new metafile equivalent to the input metafile, but with a new
+         ID assigned to distinguish it as a separate catalog object. This means
+         that, if the output is simply committed as part of an update transaction
+         operation on the source metafile, then it will replace instead of update
+         the source metafile.
+         :param other: Source metafile that is the basis for the new metafile.
+         :param new_id: New immutable ID to assign to the new metafile. Should
+         not be specified for metafiles with mutable names (e.g., namespaces and
+         tables).
+         :return: A new metafile based on the input metafile with a different ID.
+         """
+         metafile_copy = Metafile.update_for(other)
+         if metafile_copy:
+             # remove the source metafile ID so that this is treated as a
+             # different catalog object with otherwise identical properties
+             if not other.named_immutable_id:
+                 metafile_copy.pop("id", None)
+                 if new_id:
+                     raise ValueError(
+                         f"New ID cannot be specified for metafiles that "
+                         f"don't have a named immutable ID."
+                     )
+             else:
+                 if not new_id:
+                     raise ValueError(
+                         f"New ID must be specified for metafiles that have a "
+                         f"named immutable ID."
+                     )
+                 metafile_copy.named_immutable_id = new_id
+             # remove all ancestors of the original source metafile
+             metafile_copy.pop("ancestor_ids", None)
+         return metafile_copy
+
+     @staticmethod
+     def read_txn(
+         catalog_root_dir: str,
+         success_txn_log_dir: str,
+         current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
+         current_txn_start_time: int,
+         current_txn_id: str,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+     ) -> ListResult[Metafile]:
+         """
+         Read one or more metadata files within the context of a transaction.
+         :param catalog_root_dir: Catalog root dir to read the metafile from.
+         :param success_txn_log_dir: Catalog root successful transaction log
+         directory.
+         :param current_txn_op: Transaction operation for this read.
+         :param current_txn_start_time: Transaction start time for this read.
+         :param current_txn_id: Transaction ID for this read.
+         :param filesystem: File system to use for reading the metadata file. If
+         not given, a default filesystem will be automatically selected based on
+         the catalog root path.
+         :return: ListResult of deserialized metadata files read.
+         """
+         kwargs = {
+             "catalog_root": catalog_root_dir,
+             "success_txn_log_dir": success_txn_log_dir,
+             "current_txn_start_time": current_txn_start_time,
+             "current_txn_id": current_txn_id,
+             "filesystem": filesystem,
+             "limit": current_txn_op.read_limit,
+         }
+         if current_txn_op.type == TransactionOperationType.READ_SIBLINGS:
+             return current_txn_op.dest_metafile.siblings(**kwargs)
+         elif current_txn_op.type == TransactionOperationType.READ_CHILDREN:
+             return current_txn_op.dest_metafile.children(**kwargs)
+         elif current_txn_op.type == TransactionOperationType.READ_LATEST:
+             kwargs["limit"] = 1
+         elif current_txn_op.type == TransactionOperationType.READ_EXISTS:
+             kwargs["limit"] = 1
+             kwargs["materialize_revisions"] = False
+         else:
+             raise ValueError(
+                 f"Unsupported transaction operation type: {current_txn_op.type}"
+             )
+         # return the latest metafile revision for READ_LATEST and READ_EXISTS
+         list_result = current_txn_op.dest_metafile.revisions(**kwargs)
+         revisions = list_result.all_items()
+         metafiles = []
+         if revisions:
+             op_type = revisions[0][0]
+             if op_type != TransactionOperationType.DELETE:
+                 metafiles.append(revisions[0][1])
+             # TODO(pdames): Add Optional[Metafile] to return type and just
+             # return the latest metafile (if any) directly?
+             return ListResult.of(
+                 items=metafiles,
+                 pagination_key=None,
+                 next_page_provider=None,
+             )
+         else:
+             # Could not find any revisions in list operations - return no results
+             return ListResult.empty()
+
+     @staticmethod
+     def get_class(serialized_dict: dict):
+         """
+         Given a serialized dictionary of Metafile data, gets the metafile child
+         class type to instantiate.
+         """
+         # TODO: more robust implementation. Right now this relies on the
+         # assumption that XLocator key will only be present in class X, and
+         # is brittle to renames. On the other hand, this implementation does
+         # not require any marker fields to be persisted, and a regression
+         # will be quickly detected by test_metafile_io or other unit tests
+         if serialized_dict.__contains__("tableLocator"):
+             return deltacat.storage.model.table.Table
+         elif serialized_dict.__contains__("namespaceLocator"):
+             return deltacat.storage.model.namespace.Namespace
+         elif serialized_dict.__contains__("tableVersionLocator"):
+             return deltacat.storage.model.table_version.TableVersion
+         elif serialized_dict.__contains__("partitionLocator"):
+             return deltacat.storage.model.partition.Partition
+         elif serialized_dict.__contains__("streamLocator"):
+             return deltacat.storage.model.stream.Stream
+         elif serialized_dict.__contains__("deltaLocator"):
+             return deltacat.storage.model.delta.Delta
+         else:
+             raise ValueError(
+                 f"Could not find metafile class from serialized form: "
+                 f"{serialized_dict}"
+             )
+
+     @staticmethod
+     def get_type_name(serialized_dict: dict):
+         """
+         Given a serialized dictionary of Metafile data, gets the type name of
+         the metafile class.
+         """
+         return Metafile.get_class(serialized_dict).__name__
+
+     @classmethod
+     def deserialize(
+         cls,
+         serialized: Union[bytes, str],
+         meta_format: Optional[str] = METAFILE_FORMAT,
+     ) -> Metafile:
+         """
+         Deserialize a metadata file from the given bytes or string.
+         :param serialized: Serialized metadata file data.
+         :param meta_format: Format to use for deserializing the metadata file.
+         :return: Deserialized metadata file.
+         """
+         if meta_format not in SUPPORTED_METAFILE_FORMATS:
+             raise ValueError(
+                 f"Unsupported format '{meta_format}'. "
+                 f"Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
+             )
+         reader = {
+             "json": lambda b: json.loads(
+                 b.decode("utf-8"),
+                 object_hook=lambda obj: {
+                     k: base64.b64decode(v)
+                     if isinstance(v, str) and v.startswith("b64:")
+                     else v
+                     for k, v in obj.items()
+                 },
+             ),
+             "msgpack": msgpack.loads,
+         }[meta_format]
+         data = reader(serialized)
+         # cast this Metafile into the appropriate child class type
+         clazz = Metafile.get_class(data)
+         return clazz(**data)
+
+     @classmethod
+     def read(
+         cls,
+         path: str,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         meta_format: Optional[str] = METAFILE_FORMAT,
+     ) -> Metafile:
+         """
+         Read a metadata file and return the deserialized object.
+         :param path: Metadata file path to read.
+         :param filesystem: File system to use for reading the metadata file.
+         :param meta_format: Format to use for deserializing the metadata file.
+         :return: Deserialized object from the metadata file.
+         """
+         if not filesystem:
+             path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_stream(path) as file:
+             serialized = file.readall()
+         metafile = Metafile.deserialize(serialized, meta_format)
+         return metafile.from_serializable(path, filesystem)
+
+     def write_txn(
+         self,
+         catalog_root_dir: str,
+         success_txn_log_dir: str,
+         current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
+         current_txn_start_time: int,
+         current_txn_id: str,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+     ) -> Tuple[List[str], List[str]]:
+         """
+         Serialize and write this object to a metadata file within the context
+         of a transaction.
+         :param catalog_root_dir: Catalog root dir to write the metafile to.
+         :param success_txn_log_dir: Catalog root successful transaction log
+         directory.
+         :param current_txn_op: Transaction operation for this write.
+         :param current_txn_start_time: Transaction start time for this write.
+         :param current_txn_id: Transaction ID for this write.
+         :param filesystem: File system to use for writing the metadata file. If
+         not given, a default filesystem will be automatically selected based on
+         the catalog root path.
+         :return: List of fully qualified paths to the metadata files written.
+         """
+         if not filesystem:
+             catalog_root_dir, filesystem = resolve_path_and_filesystem(
+                 path=catalog_root_dir,
+                 filesystem=filesystem,
+             )
+
+         return self._write_metafile_revisions(
+             catalog_root=catalog_root_dir,
+             success_txn_log_dir=success_txn_log_dir,
+             current_txn_op=current_txn_op,
+             current_txn_start_time=current_txn_start_time,
+             current_txn_id=current_txn_id,
+             filesystem=filesystem,
+         )
+
+     def serialize(
+         self,
+         meta_format: Optional[str] = METAFILE_FORMAT,
+     ) -> Union[bytes, str]:
+         """
+         Serialize this object to the given metafile format.
+         :param meta_format: Format to use for serializing the metadata file.
+         :return: Serialized metadata file bytes or string (format dependent).
+         """
+         if meta_format not in SUPPORTED_METAFILE_FORMATS:
+             raise ValueError(
+                 f"Unsupported format '{meta_format}'. "
+                 f"Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
+             )
+         serializer = {
+             "json": lambda data: json.dumps(
+                 data,
+                 indent=4,
+                 default=lambda b: base64.b64encode(b).decode("utf-8")
+                 if isinstance(b, bytes)
+                 else b,
+             ).encode("utf-8"),
+             "msgpack": msgpack.dumps,
+         }[meta_format]
+         return serializer(self.to_serializable())
+
+     def write(
+         self,
+         path: str,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         meta_format: Optional[str] = METAFILE_FORMAT,
+     ) -> None:
+         """
+         Serialize and write this object to a metadata file.
+         :param path: Metadata file path to write to.
+         :param filesystem: File system to use for writing the metadata file. If
+         not given, a default filesystem will be automatically selected based on
+         the catalog root path.
+         :param meta_format: Format to use for serializing the metadata file.
+         """
+         serialized = self.serialize(meta_format)
+         if not filesystem:
+             path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         revision_dir_path = posixpath.dirname(path)
+         filesystem.create_dir(revision_dir_path, recursive=True)
+         with filesystem.open_output_stream(path) as file:
+             file.write(serialized)
+
+     @staticmethod
+     def _equivalent_minus_exclusions(d1: dict, d2: dict, exclusions: Set[str]) -> bool:
+         if d1.get("streamLocator") and d2.get("streamLocator"):
+             # stream locators should be equivalent minus streamId
+             exclusions.add("streamId")
+             if not Metafile._equivalent_minus_exclusions(
+                 d1["streamLocator"], d2["streamLocator"], exclusions
+             ):
+                 return False
+         if d1.get("partitionLocator") and d2.get("partitionLocator"):
+             # partition locators should be equivalent minus partitionId and parent stream locator streamId
+             exclusions.add("partitionId")
+             if not Metafile._equivalent_minus_exclusions(
+                 d1["partitionLocator"], d2["partitionLocator"], exclusions
+             ):
+                 return False
+         if d1.get("deltaLocator") and d2.get("deltaLocator"):
+             # delta locators should be equivalent minus parent partition/stream locator partitionId and streamId
+             if not Metafile._equivalent_minus_exclusions(
+                 d1["deltaLocator"], d2["deltaLocator"], exclusions
+             ):
+                 return False
+         for k, v in d1.items():
+             if k == "partitionValues" and not d2.get(k):
+                 # consider [] and None equivalent unpartitioned values
+                 v = v or d2.get(k)
+             if k not in exclusions and (k not in d2 or d2[k] != v):
+                 return False
+         for k in d2.keys():
+             if k not in exclusions and k not in d1:
+                 return False
+         return True
+
+     def equivalent_to(self, other: Metafile) -> bool:
+         """
+         True if this Metafile is equivalent to the other Metafile minus its
+         unique ID, ancestor IDs, and other internal system properties.
+
+         :param other: Metafile to compare to.
+         :return: True if the other metafile is equivalent, false if not.
+         """
+         identifiers = {
+             "id",
+             "ancestor_ids",
+             "previousStreamId",
+             "previousPartitionId",
+             "streamLocator",
+             "partitionLocator",
+             "deltaLocator",
+             "compactionRoundCompletionInfo",
+         }
+         return Metafile._equivalent_minus_exclusions(self, other, identifiers)
+
+     @property
+     def named_immutable_id(self) -> Optional[str]:
+         """
+         If this metafile's locator name is immutable (i.e., if the object it
+         refers to can't be renamed) then returns an immutable ID suitable for
+         use in URLs or filesystem paths. Returns None if this locator name is
+         mutable (i.e., if the object it refers to can be renamed).
+         """
+         return self.locator.name.immutable_id
+
+     @named_immutable_id.setter
+     def named_immutable_id(self, immutable_id: Optional[str]) -> None:
+         """
+         If this metafile's locator name is immutable (i.e., if the object it
+         refers to can't be renamed), then sets an immutable ID for this
+         locator name suitable for use in URLs or filesystem paths. Note that
+         the ID is only considered immutable in durable catalog storage, and
+         remains mutable in transient memory (i.e., this setter remains
+         functional regardless of whether an ID is already assigned, but each
+         update will cause it to refer to a different, distinct object in
+         durable storage).
+         :raises NotImplementedError: If this metafile type does not have a
+         named immutable ID (i.e., its immutable ID is auto-generated).
+         """
+         self.locator.name.immutable_id = immutable_id
+
+     @property
+     def id(self) -> str:
+         """
+         Returns an existing immutable ID for this metafile or generates a new
+         one. This ID can be used for equality checks (i.e. 2 metafiles refer
+         to the same catalog object if they have the same ID) and deterministic
+         references (e.g. for generating a root namespace or table path that
+         remains the same regardless of renames).
+         """
+
+         # check if the locator name can be reused as an immutable ID
+         # or if we need to use a generated UUID as an immutable ID
+         _id = self.locator.name.immutable_id or self.get("id")
+         if not _id:
+             _id = self["id"] = str(uuid.uuid4())
+         return _id
+
+     @property
+     def name(self) -> Optional[str]:
+         """
+         Returns the common name of this metafile. Used as a human
+         readable name for this metafile that is unique amongst its
+         siblings (e.g., namespace/table name, table version, stream
+         format, partition values + scheme ID, delta stream position).
+         """
+         return (
+             self.locator_alias.name.join()
+             if self.locator_alias
+             else self.locator.name.join()
+         )
+
+     @property
+     def locator(self) -> Optional[Locator]:
+         """
+         Returns the canonical locator for this metafile, which is typically used
+         to efficiently resolve internal system references to this object.
+         """
+         raise NotImplementedError()
+
+     @property
+     def locator_alias(self) -> Optional[Locator]:
+         """
+         Returns an optional locator alias for this metafile. This is
+         typically used to resolve a unique, human-readable reference to this
+         object (e.g., by using partition values instead of partition ID or
+         stream format name instead of stream ID). Locator aliases are
+         typically used during partition predicate pushdown (e.g., by
+         partition value + partition scheme ID) or to display unique
+         human-readable metafile names.
+         """
+         return None
+
+ def children(
829
+ self,
830
+ catalog_root: str,
831
+ success_txn_log_dir: str,
832
+ current_txn_start_time: Optional[int] = None,
833
+ current_txn_id: Optional[str] = None,
834
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
835
+ limit: Optional[int] = None,
836
+ ) -> ListResult[Metafile]:
837
+ """
838
+ Retrieve all children of this object.
839
+ :return: ListResult containing all children of this object.
840
+ """
841
+ catalog_root, filesystem = resolve_path_and_filesystem(
842
+ catalog_root,
843
+ filesystem,
844
+ )
845
+ metafile_root_dir_path = self.metafile_root_path(
846
+ catalog_root=catalog_root,
847
+ current_txn_start_time=current_txn_start_time,
848
+ current_txn_id=current_txn_id,
849
+ filesystem=filesystem,
850
+ )
851
+ # List metafiles with respect to this metafile's URI as root
852
+ return self._list_metafiles(
853
+ success_txn_log_dir=success_txn_log_dir,
854
+ metafile_root_dir_path=metafile_root_dir_path,
855
+ current_txn_start_time=current_txn_start_time,
856
+ current_txn_id=current_txn_id,
857
+ filesystem=filesystem,
858
+ limit=limit,
859
+ )
860
+
861
+ def siblings(
862
+ self,
863
+ catalog_root: str,
864
+ success_txn_log_dir: str,
865
+ current_txn_start_time: Optional[int] = None,
866
+ current_txn_id: Optional[str] = None,
867
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
868
+ limit: Optional[int] = None,
869
+ ) -> ListResult[Metafile]:
870
+ """
871
+ Retrieve all siblings of this object.
872
+ :return: ListResult containing all siblings of this object.
873
+ """
874
+ catalog_root, filesystem = resolve_path_and_filesystem(
875
+ catalog_root,
876
+ filesystem,
877
+ )
878
+ parent_obj_path = self.parent_root_path(
879
+ catalog_root=catalog_root,
880
+ current_txn_start_time=current_txn_start_time,
881
+ current_txn_id=current_txn_id,
882
+ filesystem=filesystem,
883
+ )
884
+ return self._list_metafiles(
885
+ success_txn_log_dir=success_txn_log_dir,
886
+ metafile_root_dir_path=parent_obj_path,
887
+ current_txn_start_time=current_txn_start_time,
888
+ current_txn_id=current_txn_id,
889
+ filesystem=filesystem,
890
+ limit=limit,
891
+ )
892
+
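The two listing methods above differ only in their listing root: siblings() lists under this metafile's parent directory, while children() lists under this metafile's own root (parent directory plus its immutable ID). A sketch of that layout, assuming a POSIX-style catalog path and using "rev" as an illustrative value for REVISION_DIR_NAME:

import posixpath

catalog_root = "/tmp/deltacat/catalog"      # resolved catalog root (illustrative)
ancestor_ids = ["ns-uuid", "table-uuid"]    # root-to-parent immutable IDs
metafile_id = "table-version-uuid"

parent_root = posixpath.join(catalog_root, *ancestor_ids)   # siblings() lists here
metafile_root = posixpath.join(parent_root, metafile_id)    # children() lists here
revision_dir = posixpath.join(metafile_root, "rev")         # revisions live here

print(parent_root)    # /tmp/deltacat/catalog/ns-uuid/table-uuid
print(metafile_root)  # /tmp/deltacat/catalog/ns-uuid/table-uuid/table-version-uuid
print(revision_dir)   # /tmp/deltacat/catalog/ns-uuid/table-uuid/table-version-uuid/rev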
893
+ def revisions(
894
+ self,
895
+ catalog_root: str,
896
+ success_txn_log_dir: str,
897
+ current_txn_start_time: Optional[int] = None,
898
+ current_txn_id: Optional[str] = None,
899
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
900
+ limit: Optional[int] = None,
901
+ materialize_revisions: bool = True,
902
+ ) -> ListResult[Tuple[TransactionOperationType, Optional[Metafile]]]:
903
+ """
904
+ Retrieve all revisions of this object.
905
+ :return: ListResult containing all revisions of this object.
906
+ """
907
+ catalog_root, filesystem = resolve_path_and_filesystem(
908
+ catalog_root,
909
+ filesystem,
910
+ )
911
+ try:
912
+ parent_root = self.parent_root_path(
913
+ catalog_root=catalog_root,
914
+ current_txn_start_time=current_txn_start_time,
915
+ current_txn_id=current_txn_id,
916
+ filesystem=filesystem,
917
+ )
918
+ except ObjectNotFoundError:
919
+ # one or more ancestors don't exist - return an empty list result
920
+ return ListResult.empty()
921
+ try:
922
+ locator = (
923
+ self.locator
924
+ if self.locator.name.exists()
925
+ else self.locator_alias
926
+ if self.locator_alias and self.locator_alias.name.exists()
927
+ else None
928
+ )
929
+ immutable_id = (
930
+ # TODO(pdames): Refactor id lazy assignment into explicit getter/setter
931
+ self.get("id")
932
+ or Metafile._locator_to_id(
933
+ locator=locator,
934
+ catalog_root=catalog_root,
935
+ metafile_root=parent_root,
936
+ filesystem=filesystem,
937
+ txn_start_time=current_txn_start_time,
938
+ txn_id=current_txn_id,
939
+ )
940
+ if locator
941
+ else None
942
+ )
943
+ except ObjectNotFoundError:
944
+ # the metafile does not exist
945
+ return ListResult.empty()
946
+ if not immutable_id:
947
+ # the metafile has been deleted
948
+ return ListResult.empty()
949
+ revision_dir_path = posixpath.join(
950
+ parent_root,
951
+ immutable_id,
952
+ REVISION_DIR_NAME,
953
+ )
954
+ revisions = MetafileRevisionInfo.list_revisions(
955
+ revision_dir_path=revision_dir_path,
956
+ filesystem=filesystem,
957
+ success_txn_log_dir=success_txn_log_dir,
958
+ current_txn_start_time=current_txn_start_time,
959
+ current_txn_id=current_txn_id,
960
+ limit=limit,
961
+ )
962
+ items = []
963
+ for mri in revisions:
964
+ if mri.exists():
965
+ metafile = (
966
+ {}
967
+ if not materialize_revisions
968
+ else self.read(
969
+ path=mri.path,
970
+ filesystem=filesystem,
971
+ )
972
+ )
973
+ items.append((mri.txn_op_type, metafile))
974
+ # TODO(pdames): Add pagination.
975
+ return ListResult.of(
976
+ items=items,
977
+ pagination_key=None,
978
+ next_page_provider=None,
979
+ )
980
+
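A condensed, self-contained sketch of the collection loop above: each surviving revision is reported as an (operation type, metafile) pair, and the metafile body is left empty when materialization is not requested. The tuple fields below are illustrative stand-ins for MetafileRevisionInfo attributes:

from typing import Dict, List, Tuple


def collect_revisions(
    revisions: List[Tuple[str, bool, Dict]],  # (txn_op_type, exists, body) stand-ins
    materialize: bool = True,
) -> List[Tuple[str, Dict]]:
    items = []
    for op_type, exists, body in revisions:
        if exists:
            # skip reading the revision body when materialization is disabled
            items.append((op_type, body if materialize else {}))
    return items


revs = [("create", True, {"v": 1}), ("update", False, {}), ("update", True, {"v": 2})]
print(collect_revisions(revs, materialize=False))  # [('create', {}), ('update', {})]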
981
+ def to_serializable(self) -> Metafile:
982
+ """
983
+ Prepare the object for serialization by converting any non-serializable
984
+ types to serializable types. May also run any required pre-write
985
+ validations on the serialized or deserialized object.
986
+ :return: a serializable version of the object
987
+ """
988
+ return self
989
+
990
+ def from_serializable(
991
+ self,
992
+ path: str,
993
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
994
+ ) -> Metafile:
995
+ """
996
+ Restore any non-serializable types from a serializable version of this
997
+ object. May also run any required post-read validations on the
998
+ serialized or deserialized object.
999
+ :return: a deserialized version of the object
1000
+ """
1001
+ return self
1002
+
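Subclasses are expected to override the two hooks above to swap non-serializable fields out before a write and back in after a read. A hypothetical example of that round trip (the `_SketchRecord` class and its Decimal field are illustrative, not DeltaCAT metafile types):

from decimal import Decimal


class _SketchRecord(dict):
    """Hypothetical dict-backed record with a non-msgpack-friendly field."""

    def to_serializable(self) -> "_SketchRecord":
        out = _SketchRecord(self)
        out["price"] = str(self["price"])  # Decimal -> str before writing
        return out

    def from_serializable(self, path: str, filesystem=None) -> "_SketchRecord":
        self["price"] = Decimal(self["price"])  # str -> Decimal after reading
        return self


record = _SketchRecord(price=Decimal("1.25"))
restored = _SketchRecord(record.to_serializable()).from_serializable(path="unused")
assert restored["price"] == Decimal("1.25")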
1003
+ def parent_root_path(
1004
+ self,
1005
+ catalog_root: str,
1006
+ current_txn_start_time: Optional[int] = None,
1007
+ current_txn_id: Optional[str] = None,
1008
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1009
+ ) -> str:
1010
+ ancestor_ids = self.ancestor_ids(
1011
+ catalog_root=catalog_root,
1012
+ current_txn_start_time=current_txn_start_time,
1013
+ current_txn_id=current_txn_id,
1014
+ filesystem=filesystem,
1015
+ )
1016
+ return posixpath.join(catalog_root, *ancestor_ids)
1017
+
1018
+ def metafile_root_path(
1019
+ self,
1020
+ catalog_root: str,
1021
+ current_txn_start_time: Optional[int] = None,
1022
+ current_txn_id: Optional[str] = None,
1023
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1024
+ ) -> str:
1025
+ parent_obj_path = self.parent_root_path(
1026
+ catalog_root=catalog_root,
1027
+ current_txn_start_time=current_txn_start_time,
1028
+ current_txn_id=current_txn_id,
1029
+ filesystem=filesystem,
1030
+ )
1031
+ return posixpath.join(
1032
+ parent_obj_path,
1033
+ self.id,
1034
+ )
1035
+
1036
+ def ancestor_ids(
1037
+ self,
1038
+ catalog_root: str,
1039
+ current_txn_start_time: Optional[int] = None,
1040
+ current_txn_id: Optional[str] = None,
1041
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1042
+ ) -> List[str]:
1043
+ """
1044
+ Returns the IDs for this metafile's ancestor metafiles. IDs are
1045
+ listed in order from root to immediate parent.
1046
+ """
1047
+ ancestor_ids = self.get("ancestor_ids") or []
1048
+ if not ancestor_ids:
1049
+ ancestor_ids = Metafile._ancestor_ids(
1050
+ locator=self.locator,
1051
+ catalog_root=catalog_root,
1052
+ current_txn_start_time=current_txn_start_time,
1053
+ current_txn_id=current_txn_id,
1054
+ filesystem=filesystem,
1055
+ )
1056
+ self["ancestor_ids"] = ancestor_ids
1057
+ return ancestor_ids
1058
+
1059
+ @staticmethod
1060
+ def _parent_metafile_rev_dir_path(
1061
+ base_metafile_path: str,
1062
+ parent_number: int,
1063
+ ) -> str:
1064
+ # TODO(pdames): Stop parent traversal at catalog root.
1065
+ current_dir = posixpath.dirname( # base metafile root dir
1066
+ posixpath.dirname( # base metafile revision dir
1067
+ base_metafile_path,
1068
+ )
1069
+ )
1070
+ while parent_number and current_dir != posixpath.sep:
1071
+ current_dir = posixpath.dirname(current_dir)
1072
+ parent_number -= 1
1073
+ return posixpath.join(
1074
+ current_dir,
1075
+ REVISION_DIR_NAME,
1076
+ )
1077
+
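The helper above climbs from a base metafile revision path to an ancestor's revision directory by stripping the revision file and its revision directory, then taking one dirname per requested parent. A worked sketch with an illustrative path, again using "rev" as a stand-in for REVISION_DIR_NAME:

import posixpath

base = "/catalog/ns-id/table-id/rev/00000001_create.mpk"   # illustrative revision path
current = posixpath.dirname(posixpath.dirname(base))        # /catalog/ns-id/table-id
parent_number = 1
while parent_number and current != posixpath.sep:
    current = posixpath.dirname(current)                    # climb one ancestor
    parent_number -= 1
print(posixpath.join(current, "rev"))                        # /catalog/ns-id/rev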
1078
+ @staticmethod
1079
+ def _locator_to_id(
1080
+ locator: Locator,
1081
+ catalog_root: str,
1082
+ metafile_root: str,
1083
+ filesystem: pyarrow.fs.FileSystem,
1084
+ txn_start_time: Optional[int] = None,
1085
+ txn_id: Optional[str] = None,
1086
+ ) -> Optional[str]:
1087
+ """
1088
+ Resolves the immutable metafile ID for the given locator.
1089
+
1090
+ :return: Immutable ID read from the mapping file; None if no mapping exists or the latest mapping revision is a delete.
1091
+ :raises: ObjectNotFoundError if the id is not found.
1092
+ """
1093
+ metafile_id = locator.name.immutable_id
1094
+ if not metafile_id:
1095
+ # the locator name is mutable, so we need to resolve the mapping
1096
+ # from the locator back to its immutable metafile ID
1097
+ locator_path = locator.path(metafile_root)
1098
+ success_txn_log_dir = posixpath.join(
1099
+ catalog_root,
1100
+ TXN_DIR_NAME,
1101
+ SUCCESS_TXN_DIR_NAME,
1102
+ )
1103
+ mri = MetafileRevisionInfo.latest_revision(
1104
+ revision_dir_path=locator_path,
1105
+ filesystem=filesystem,
1106
+ success_txn_log_dir=success_txn_log_dir,
1107
+ current_txn_start_time=txn_start_time,
1108
+ current_txn_id=txn_id,
1109
+ ignore_missing_revision=True,
1110
+ )
1111
+ if not mri.exists():
1112
+ return None
1113
+ if mri.txn_op_type == TransactionOperationType.DELETE:
1114
+ # Return None for DELETE revisions to allow graceful handling
1115
+ # of renamed objects. The from_serializable mechanism can then
1116
+ # restore the correct locator from parent metadata.
1117
+ return None
1118
+ metafile_id = posixpath.splitext(mri.path)[1][1:]
1119
+ return metafile_id
1120
+
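The mapping files resolved above are empty; the target metafile ID is encoded as the mapping revision file's extension, so recovering it only takes a splitext. A small sketch (the file name format shown is illustrative only):

import posixpath

revision_file = "00000003_create.e3b0c442-txn.0a1b2c3d-mapped-id"  # illustrative name
metafile_id = posixpath.splitext(revision_file)[1][1:]  # drop the leading "."
print(metafile_id)  # 0a1b2c3d-mapped-id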
1121
+ @staticmethod
1122
+ def _ancestor_ids(
1123
+ locator: Locator,
1124
+ catalog_root: str,
1125
+ current_txn_start_time: Optional[int] = None,
1126
+ current_txn_id: Optional[str] = None,
1127
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1128
+ ) -> List[str]:
1129
+ ancestor_ids = []
1130
+ catalog_root, filesystem = resolve_path_and_filesystem(
1131
+ path=catalog_root,
1132
+ filesystem=filesystem,
1133
+ )
1134
+ parent_locators = []
1135
+ # TODO(pdames): Correctly resolve missing parents and K of N
1136
+ # specified ancestors by using placeholder IDs for missing
1137
+ # ancestors
1138
+ parent_locator = locator.parent
1139
+ while parent_locator:
1140
+ parent_locators.append(parent_locator)
1141
+ parent_locator = parent_locator.parent
1142
+ metafile_root = catalog_root
1143
+ while parent_locators:
1144
+ parent_locator = parent_locators.pop()
1145
+ ancestor_id = Metafile._locator_to_id(
1146
+ locator=parent_locator,
1147
+ catalog_root=catalog_root,
1148
+ metafile_root=metafile_root,
1149
+ filesystem=filesystem,
1150
+ txn_start_time=current_txn_start_time,
1151
+ txn_id=current_txn_id,
1152
+ )
1153
+ if not ancestor_id:
1154
+ err_msg = f"Ancestor does not exist: {parent_locator}."
1155
+ raise ObjectNotFoundError(err_msg)
1156
+ metafile_root = posixpath.join(
1157
+ metafile_root,
1158
+ ancestor_id,
1159
+ )
1160
+ try:
1161
+ get_file_info(
1162
+ path=metafile_root,
1163
+ filesystem=filesystem,
1164
+ )
1165
+ except FileNotFoundError:
1166
+ raise ObjectNotFoundError(
1167
+ f"Ancestor {parent_locator} does not exist at: " f"{metafile_root}"
1168
+ )
1169
+ ancestor_ids.append(ancestor_id)
1170
+ return ancestor_ids
1171
+
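A self-contained sketch of the walk in _ancestor_ids above: parent locators are gathered leaf-to-root, then popped so that each ancestor's immutable ID is resolved root-first while the metafile root path is extended one level at a time. The id_by_name dict stands in for the locator-to-ID mapping files on disk, and FileNotFoundError stands in for ObjectNotFoundError:

import posixpath
from typing import Dict, List, Tuple


def sketch_ancestor_ids(
    parent_names_leaf_to_root: List[str],
    id_by_name: Dict[str, str],
    catalog_root: str = "/catalog",
) -> Tuple[List[str], str]:
    stack = list(parent_names_leaf_to_root)  # e.g. ["table", "namespace"]
    metafile_root = catalog_root
    ids = []
    while stack:
        name = stack.pop()  # resolve the root-most remaining ancestor first
        ancestor_id = id_by_name.get(name)
        if not ancestor_id:
            raise FileNotFoundError(f"Ancestor does not exist: {name}")
        metafile_root = posixpath.join(metafile_root, ancestor_id)
        ids.append(ancestor_id)
    return ids, metafile_root


print(sketch_ancestor_ids(["table", "namespace"], {"namespace": "ns-1", "table": "tb-9"}))
# (['ns-1', 'tb-9'], '/catalog/ns-1/tb-9')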
1172
+ def _write_locator_to_id_map_file(
1173
+ self,
1174
+ locator: Locator,
1175
+ success_txn_log_dir: str,
1176
+ parent_obj_path: str,
1177
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
1178
+ current_txn_op_type: TransactionOperationType,
1179
+ current_txn_start_time: int,
1180
+ current_txn_id: str,
1181
+ filesystem: pyarrow.fs.FileSystem,
1182
+ ) -> str:
1183
+ name_resolution_dir_path = locator.path(parent_obj_path)
1184
+ # TODO(pdames): Don't write updated revisions with the same mapping as
1185
+ # the latest revision.
1186
+ mri = MetafileRevisionInfo.new_revision(
1187
+ revision_dir_path=name_resolution_dir_path,
1188
+ current_txn_op_type=current_txn_op_type,
1189
+ current_txn_start_time=current_txn_start_time,
1190
+ current_txn_id=current_txn_id,
1191
+ filesystem=filesystem,
1192
+ extension=f".{self.id}",
1193
+ success_txn_log_dir=success_txn_log_dir,
1194
+ )
1195
+ revision_file_path = mri.path
1196
+ filesystem.create_dir(posixpath.dirname(revision_file_path), recursive=True)
1197
+ with filesystem.open_output_stream(revision_file_path):
1198
+ pass # Just create an empty ID file to map to the locator
1199
+ current_txn_op.append_locator_write_path(revision_file_path)
1200
+ return revision_file_path
1201
+
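The mapping write above persists a zero-byte "marker" file whose name alone carries the locator-to-ID mapping. A hedged local sketch of that write using PyArrow's filesystem API (the directory layout and file name below are illustrative):

import posixpath
import tempfile

import pyarrow.fs

fs = pyarrow.fs.LocalFileSystem()
root = tempfile.mkdtemp()  # stands in for the locator's name-resolution directory
marker = posixpath.join(root, "rev", "00000001_create.txn-id.target-metafile-id")
fs.create_dir(posixpath.dirname(marker), recursive=True)
with fs.open_output_stream(marker):
    pass  # zero-byte file; the mapping lives entirely in the file name
print(fs.get_file_info(marker).size)  # 0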
1202
+ def _write_metafile_revision(
1203
+ self,
1204
+ success_txn_log_dir: str,
1205
+ revision_dir_path: str,
1206
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
1207
+ current_txn_op_type: TransactionOperationType,
1208
+ current_txn_start_time: int,
1209
+ current_txn_id: str,
1210
+ filesystem: pyarrow.fs.FileSystem,
1211
+ ) -> str:
1212
+ mri = MetafileRevisionInfo.new_revision(
1213
+ revision_dir_path=revision_dir_path,
1214
+ current_txn_op_type=current_txn_op_type,
1215
+ current_txn_start_time=current_txn_start_time,
1216
+ current_txn_id=current_txn_id,
1217
+ filesystem=filesystem,
1218
+ success_txn_log_dir=success_txn_log_dir,
1219
+ )
1220
+ self.write(
1221
+ path=mri.path,
1222
+ filesystem=filesystem,
1223
+ )
1224
+ current_txn_op.append_metafile_write_path(mri.path)
1225
+ return mri.path
1226
+
1227
+ def _write_metafile_revisions(
1228
+ self,
1229
+ catalog_root: str,
1230
+ success_txn_log_dir: str,
1231
+ current_txn_op: deltacat.storage.model.transaction.TransactionOperation,
1232
+ current_txn_start_time: int,
1233
+ current_txn_id: str,
1234
+ filesystem: pyarrow.fs.FileSystem,
1235
+ ) -> Tuple[List[str], List[str]]:
1236
+ """
1237
+ Generates the fully qualified paths required to write this metafile as
1238
+ part of the given transaction. All paths returned will be based in the
1239
+ given root directory.
1240
+ """
1241
+ metafile_write_paths = []
1242
+ locator_write_paths = []
1243
+ parent_obj_path = self.parent_root_path(
1244
+ catalog_root=catalog_root,
1245
+ current_txn_start_time=current_txn_start_time,
1246
+ current_txn_id=current_txn_id,
1247
+ filesystem=filesystem,
1248
+ )
1249
+ mutable_src_locator = None
1250
+ mutable_dest_locator = None
1251
+ # metafiles without named immutable IDs have mutable name mappings
1252
+ if not self.named_immutable_id:
1253
+ mutable_src_locator = (
1254
+ current_txn_op.src_metafile.locator
1255
+ if current_txn_op.src_metafile
1256
+ else None
1257
+ )
1258
+ mutable_dest_locator = current_txn_op.dest_metafile.locator
1259
+ # metafiles with named immutable IDs may have aliases
1260
+ elif self.locator_alias:
1261
+ mutable_src_locator = (
1262
+ current_txn_op.src_metafile.locator_alias
1263
+ if current_txn_op.src_metafile
1264
+ else None
1265
+ )
1266
+ mutable_dest_locator = current_txn_op.dest_metafile.locator_alias
1267
+ if mutable_dest_locator:
1268
+ # the locator name is mutable, so we need to persist a mapping
1269
+ # from the locator back to its immutable metafile ID
1270
+ if current_txn_op.type == TransactionOperationType.UPDATE:
1271
+ # mutable locator updates are used to either transition
1272
+ # staged streams/partitions (which have no locator alias) to
1273
+ # committed (and create the locator alias) or to rename an
1274
+ # existing mutable locator
1275
+ if mutable_src_locator != mutable_dest_locator:
1276
+ if mutable_src_locator is not None:
1277
+ # this update includes a rename
1278
+ # mark the source metafile mapping as deleted
1279
+ locator_write_path = (
1280
+ current_txn_op.src_metafile._write_locator_to_id_map_file(
1281
+ locator=mutable_src_locator,
1282
+ success_txn_log_dir=success_txn_log_dir,
1283
+ parent_obj_path=parent_obj_path,
1284
+ current_txn_op=current_txn_op,
1285
+ current_txn_op_type=TransactionOperationType.DELETE,
1286
+ current_txn_start_time=current_txn_start_time,
1287
+ current_txn_id=current_txn_id,
1288
+ filesystem=filesystem,
1289
+ )
1290
+ )
1291
+ locator_write_paths.append(locator_write_path)
1292
+ # mark the dest metafile mapping as created
1293
+ locator_write_path = self._write_locator_to_id_map_file(
1294
+ locator=mutable_dest_locator,
1295
+ success_txn_log_dir=success_txn_log_dir,
1296
+ parent_obj_path=parent_obj_path,
1297
+ current_txn_op=current_txn_op,
1298
+ current_txn_op_type=TransactionOperationType.CREATE,
1299
+ current_txn_start_time=current_txn_start_time,
1300
+ current_txn_id=current_txn_id,
1301
+ filesystem=filesystem,
1302
+ )
1303
+ locator_write_paths.append(locator_write_path)
1304
+ # else this is a mutable locator no-op update - do nothing
1305
+ else:
1306
+ # this is either a create/delete operation or a
1307
+ # replace operation that is part of an overwrite/restate
1308
+ # transaction (e.g. committing a staged replacement for a
1309
+ # previously committed stream/partition).
1310
+ locator_write_path = self._write_locator_to_id_map_file(
1311
+ locator=mutable_dest_locator,
1312
+ success_txn_log_dir=success_txn_log_dir,
1313
+ parent_obj_path=parent_obj_path,
1314
+ current_txn_op=current_txn_op,
1315
+ current_txn_op_type=current_txn_op.type,
1316
+ current_txn_start_time=current_txn_start_time,
1317
+ current_txn_id=current_txn_id,
1318
+ filesystem=filesystem,
1319
+ )
1320
+ locator_write_paths.append(locator_write_path)
1321
+ metafile_revision_dir_path = posixpath.join(
1322
+ parent_obj_path,
1323
+ self.id,
1324
+ REVISION_DIR_NAME,
1325
+ )
1326
+ if (
1327
+ current_txn_op.type
1328
+ in [TransactionOperationType.UPDATE, TransactionOperationType.REPLACE]
1329
+ and current_txn_op.src_metafile.id != current_txn_op.dest_metafile.id
1330
+ ):
1331
+ # TODO(pdames): block operations including both a rename & replace?
1332
+ # this update includes a replace
1333
+ # mark the source metafile as deleted
1334
+ src_metafile_revision_dir_path = posixpath.join(
1335
+ parent_obj_path,
1336
+ current_txn_op.src_metafile.id,
1337
+ REVISION_DIR_NAME,
1338
+ )
1339
+ metafile_write_path = self._write_metafile_revision(
1340
+ success_txn_log_dir=success_txn_log_dir,
1341
+ revision_dir_path=src_metafile_revision_dir_path,
1342
+ current_txn_op=current_txn_op,
1343
+ current_txn_op_type=TransactionOperationType.DELETE,
1344
+ current_txn_start_time=current_txn_start_time,
1345
+ current_txn_id=current_txn_id,
1346
+ filesystem=filesystem,
1347
+ )
1348
+ metafile_write_paths.append(metafile_write_path)
1349
+ try:
1350
+ # mark the dest metafile as created
1351
+ metafile_write_path = self._write_metafile_revision(
1352
+ success_txn_log_dir=success_txn_log_dir,
1353
+ revision_dir_path=metafile_revision_dir_path,
1354
+ current_txn_op=current_txn_op,
1355
+ current_txn_op_type=TransactionOperationType.CREATE,
1356
+ current_txn_start_time=current_txn_start_time,
1357
+ current_txn_id=current_txn_id,
1358
+ filesystem=filesystem,
1359
+ )
1360
+ metafile_write_paths.append(metafile_write_path)
1361
+ except ObjectAlreadyExistsError:
1362
+ # src metafile is being replaced by an existing dest metafile
1363
+ pass
1364
+
1365
+ else:
1366
+ metafile_write_path = self._write_metafile_revision(
1367
+ success_txn_log_dir=success_txn_log_dir,
1368
+ revision_dir_path=metafile_revision_dir_path,
1369
+ current_txn_op=current_txn_op,
1370
+ current_txn_op_type=current_txn_op.type,
1371
+ current_txn_start_time=current_txn_start_time,
1372
+ current_txn_id=current_txn_id,
1373
+ filesystem=filesystem,
1374
+ )
1375
+ metafile_write_paths.append(metafile_write_path)
1376
+ return metafile_write_paths, locator_write_paths
1377
+
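Condensed decision logic for the name-mapping bookkeeping above: a mutable-locator UPDATE that changes the name tombstones the old mapping with a DELETE revision and creates a mapping for the new name, a same-name UPDATE writes nothing, and every other operation type simply mirrors itself onto the mapping. A runnable sketch (operation names are plain strings here, not TransactionOperationType members):

from typing import List, Optional, Tuple


def mapping_ops(op_type: str, src_name: Optional[str], dest_name: str) -> List[Tuple[str, str]]:
    if op_type == "update":
        if src_name == dest_name:
            return []  # no-op rename: keep the existing mapping
        ops = [("create", dest_name)]
        if src_name is not None:
            ops.insert(0, ("delete", src_name))  # tombstone the old name first
        return ops
    return [(op_type, dest_name)]  # create/delete/replace map straight through


print(mapping_ops("update", "old_table", "new_table"))
# [('delete', 'old_table'), ('create', 'new_table')]
print(mapping_ops("update", None, "committed_stream"))
# [('create', 'committed_stream')]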
1378
+ def _list_metafiles(
1379
+ self,
1380
+ success_txn_log_dir: str,
1381
+ metafile_root_dir_path: str,
1382
+ current_txn_start_time: Optional[int] = None,
1383
+ current_txn_id: Optional[str] = None,
1384
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
1385
+ limit: Optional[int] = None,
1386
+ ) -> ListResult[Metafile]:
1387
+ file_paths_and_sizes = list_directory(
1388
+ path=metafile_root_dir_path,
1389
+ filesystem=filesystem,
1390
+ ignore_missing_path=True,
1391
+ )
1392
+ # TODO(pdames): Exclude name resolution directories
1393
+ revision_dir_paths = [
1394
+ posixpath.join(file_path_and_size[0], REVISION_DIR_NAME)
1395
+ for file_path_and_size in file_paths_and_sizes
1396
+ if file_path_and_size[0] != success_txn_log_dir
1397
+ ]
1398
+ items = []
1399
+ for path in revision_dir_paths:
1400
+ mri = MetafileRevisionInfo.latest_revision(
1401
+ revision_dir_path=path,
1402
+ filesystem=filesystem,
1403
+ success_txn_log_dir=success_txn_log_dir,
1404
+ current_txn_start_time=current_txn_start_time,
1405
+ current_txn_id=current_txn_id,
1406
+ ignore_missing_revision=True,
1407
+ )
1408
+ if mri.exists() and mri.txn_op_type != TransactionOperationType.DELETE:
1409
+ item = self.read(
1410
+ path=mri.path,
1411
+ filesystem=filesystem,
1412
+ )
1413
+ items.append(item)
1414
+ if limit and limit <= len(items):
1415
+ break
1416
+ # TODO(pdames): Add pagination.
1417
+ return ListResult.of(
1418
+ items=items,
1419
+ pagination_key=None,
1420
+ next_page_provider=None,
1421
+ )
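A final self-contained sketch of the listing filter above: only entries whose latest surviving revision is not a delete are materialized, and iteration stops once the optional limit is reached. Operation types are plain strings standing in for TransactionOperationType values:

from typing import List, Optional, Tuple


def list_live(latest_revisions: List[Tuple[str, str]], limit: Optional[int] = None) -> List[str]:
    items = []
    for metafile_id, op_type in latest_revisions:  # (id, latest txn op type)
        if op_type != "delete":
            items.append(metafile_id)
            if limit and limit <= len(items):
                break
    return items


print(list_live([("a", "create"), ("b", "delete"), ("c", "update")], limit=2))  # ['a', 'c']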