deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/url.py ADDED
@@ -0,0 +1,1325 @@
1
+ import functools
2
+ import json
3
+ from typing import Callable, List, Tuple, Any, Union, Optional
4
+ from urllib.parse import urlparse, urlunparse, parse_qs
5
+
6
+ import ray
7
+ import daft
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ import pyarrow as pa
12
+ import polars as pl
13
+ import deltacat as dc
14
+
15
+ import pyarrow.csv as pacsv
16
+ import pyarrow.json as pajson
17
+
18
+ from deltacat.catalog import CatalogProperties
19
+ from deltacat.constants import DEFAULT_NAMESPACE
20
+ from deltacat.types.media import (
21
+ DatasetType,
22
+ DatastoreType,
23
+ )
24
+ from deltacat.utils import pyarrow as pa_utils
25
+
26
+ from deltacat.storage import (
27
+ metastore,
28
+ Dataset,
29
+ Delta,
30
+ DeltaLocator,
31
+ ListResult,
32
+ Metafile,
33
+ Namespace,
34
+ NamespaceLocator,
35
+ Partition,
36
+ Stream,
37
+ StreamFormat,
38
+ StreamLocator,
39
+ PartitionLocator,
40
+ Table,
41
+ TableLocator,
42
+ TableVersion,
43
+ TableVersionLocator,
44
+ )
45
+
46
+
47
+ def _normalize_partition_values_from_json(partition_values):
48
+ """
49
+ Normalize partition values parsed from JSON URLs.
50
+
51
+ Both None and empty list [] represent unpartitioned data, but they should be
52
+ normalized to None for consistent lookup and validation.
53
+
54
+ Args:
55
+ partition_values: Partition values parsed from JSON
56
+
57
+ Returns:
58
+ None for unpartitioned data (both None and [] inputs),
59
+ original value for partitioned data
60
+ """
61
+ if partition_values is None or (
62
+ isinstance(partition_values, list) and len(partition_values) == 0
63
+ ):
64
+ return None
65
+ return partition_values
66
+
67
+
68
def _ray_reader_from_path(reader_name):
    """
    Create a reader factory for a path-based ``ray.data`` read function.

    The read function is resolved on ``ray.data`` by name only when the
    returned factory is called, not when this module is imported.

    Args:
        reader_name: Attribute name of the ``ray.data`` read function.

    Returns:
        A callable taking a URL object and returning a ``functools.partial``
        that binds the reader to ``url.url_path`` (positional) and
        ``url.query_params`` (keyword arguments).
    """

    def _factory(url):
        return functools.partial(
            getattr(ray.data, reader_name),
            url.url_path,
            **url.query_params,
        )

    return _factory


# Datastore type -> factory. Each factory takes a URL object and returns a
# functools.partial around the corresponding Ray Data reader.
RAY_DATASTORE_TYPE_TO_READER = {
    DatastoreType.AUDIO: _ray_reader_from_path("read_audio"),
    DatastoreType.AVRO: _ray_reader_from_path("read_avro"),
    # BigQuery: project comes from the URL netloc, dataset from the first
    # path element (None when the URL has no path elements).
    DatastoreType.BIGQUERY: lambda url: functools.partial(
        ray.data.read_bigquery,
        project_id=url.parsed.netloc,
        dataset=url.path_elements[0] if url.path_elements else None,
        **url.query_params,
    ),
    DatastoreType.BINARY: _ray_reader_from_path("read_binary_files"),
    DatastoreType.CSV: _ray_reader_from_path("read_csv"),
    # NOTE(review): `table` receives the raw query string of the URL while
    # the parsed query params are also forwarded as kwargs - confirm this is
    # the intended table identifier.
    DatastoreType.CLICKHOUSE: lambda url: functools.partial(
        ray.data.read_clickhouse,
        table=url.parsed.query,
        dsn=url.url,
        **url.query_params,
    ),
    # Databricks: the URL netloc is used as the warehouse id.
    DatastoreType.DATABRICKS_TABLES: lambda url: functools.partial(
        ray.data.read_databricks_tables,
        warehouse_id=url.parsed.netloc,
        **url.query_params,
    ),
    DatastoreType.DELTA_SHARING: _ray_reader_from_path(
        "read_delta_sharing_tables"
    ),
    DatastoreType.HUDI: _ray_reader_from_path("read_hudi"),
    # Iceberg: the URL netloc is used as the table identifier.
    DatastoreType.ICEBERG: lambda url: functools.partial(
        ray.data.read_iceberg,
        table_identifier=url.parsed.netloc,
        **url.query_params,
    ),
    DatastoreType.IMAGES: _ray_reader_from_path("read_images"),
    DatastoreType.JSON: _ray_reader_from_path("read_json"),
    DatastoreType.LANCE: _ray_reader_from_path("read_lance"),
    # Mongo takes the full URL (not just the path) as its first argument.
    DatastoreType.MONGO: lambda url: functools.partial(
        ray.data.read_mongo,
        url.url,
        **url.query_params,
    ),
    DatastoreType.NUMPY: _ray_reader_from_path("read_numpy"),
    DatastoreType.PARQUET: _ray_reader_from_path("read_parquet"),
    DatastoreType.TEXT: _ray_reader_from_path("read_text"),
    DatastoreType.TFRECORDS: _ray_reader_from_path("read_tfrecords"),
    DatastoreType.VIDEOS: _ray_reader_from_path("read_videos"),
    DatastoreType.WEBDATASET: _ray_reader_from_path("read_webdataset"),
}
172
+
173
# Datastore type -> factory. Each factory takes a URL object and returns a
# functools.partial around the corresponding unbound ``ray.data.Dataset``
# write method.
RAY_DATASTORE_TYPE_TO_WRITER = {
    # BigQuery: project comes from the URL netloc, dataset from the first
    # path element (None when the URL has no path elements).
    DatastoreType.BIGQUERY: lambda url: functools.partial(
        ray.data.Dataset.write_bigquery,
        project_id=url.parsed.netloc,
        dataset=url.path_elements[0] if url.path_elements else None,
        **url.query_params,
    ),
    # write_csv is a Dataset method like every other writer in this table;
    # there is no module-level ``ray.data.write_csv``.
    DatastoreType.CSV: lambda url: functools.partial(
        ray.data.Dataset.write_csv,
        url.url_path,
        **url.query_params,
    ),
    # Iceberg: the URL netloc is used as the table identifier.
    DatastoreType.ICEBERG: lambda url: functools.partial(
        ray.data.Dataset.write_iceberg,
        table_identifier=url.parsed.netloc,
        **url.query_params,
    ),
    # "column" defaults to "image" unless the URL query params override it.
    # The merge builds a new dict so the URL's query params are not mutated
    # and "column" is never passed twice.
    DatastoreType.IMAGES: lambda url: functools.partial(
        ray.data.Dataset.write_images,
        path=url.url_path,
        **{"column": "image", **url.query_params},
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        ray.data.Dataset.write_json,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.LANCE: lambda url: functools.partial(
        ray.data.Dataset.write_lance,
        url.url_path,
        **url.query_params,
    ),
    # Mongo takes the full URL (not just the path) as its first argument.
    DatastoreType.MONGO: lambda url: functools.partial(
        ray.data.Dataset.write_mongo,
        url.url,
        **url.query_params,
    ),
    # "column" defaults to "data" unless the URL query params override it;
    # same non-mutating merge as the IMAGES writer.
    DatastoreType.NUMPY: lambda url: functools.partial(
        ray.data.Dataset.write_numpy,
        path=url.url_path,
        **{"column": "data", **url.query_params},
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        ray.data.Dataset.write_parquet,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TFRECORDS: lambda url: functools.partial(
        ray.data.Dataset.write_tfrecords,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.WEBDATASET: lambda url: functools.partial(
        ray.data.Dataset.write_webdataset,
        url.url_path,
        **url.query_params,
    ),
}
233
+
234
def _daft_reader_from_path(reader_name):
    """
    Create a reader factory for a path-based ``daft.io`` read function.

    The read function is resolved on ``daft.io`` by name only when the
    returned factory is called, not when this module is imported.

    Args:
        reader_name: Attribute name of the ``daft.io`` read function.

    Returns:
        A callable taking a URL object and returning a ``functools.partial``
        that binds the reader to ``url.url_path`` (positional) and
        ``url.query_params`` (keyword arguments).
    """

    def _factory(url):
        return functools.partial(
            getattr(daft.io, reader_name),
            url.url_path,
            **url.query_params,
        )

    return _factory


# Datastore type -> factory. Each factory takes a URL object and returns a
# functools.partial around the corresponding Daft reader.
DAFT_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: _daft_reader_from_path("read_csv"),
    DatastoreType.DELTA_LAKE: _daft_reader_from_path("read_deltalake"),
    DatastoreType.HUDI: _daft_reader_from_path("read_hudi"),
    DatastoreType.ICEBERG: _daft_reader_from_path("read_iceberg"),
    DatastoreType.JSON: _daft_reader_from_path("read_json"),
    DatastoreType.PARQUET: _daft_reader_from_path("read_parquet"),
    DatastoreType.WARC: _daft_reader_from_path("read_warc"),
    # Plain text goes through the CSV reader with a single string column
    # named "text" and fixed parsing options. URL query params are not
    # forwarded for this entry.
    DatastoreType.TEXT: lambda url: functools.partial(
        daft.io.read_csv,
        url.url_path,
        infer_schema=False,
        schema={"text": daft.DataType.string()},
        has_headers=False,
        delimiter=chr(25),  # end of medium char
        double_quote=False,
        comment=None,
    ),
}
281
+
282
def _daft_writer_to_path(writer_name):
    """
    Create a writer factory for a path-based ``daft.DataFrame`` write method.

    The method is resolved on ``daft.DataFrame`` by name only when the
    returned factory is called, not when this module is imported.

    Args:
        writer_name: Attribute name of the ``daft.DataFrame`` write method.

    Returns:
        A callable taking a URL object and returning a ``functools.partial``
        that binds the unbound method to ``url.url_path`` (positional) and
        ``url.query_params`` (keyword arguments).
    """

    def _factory(url):
        return functools.partial(
            getattr(daft.DataFrame, writer_name),
            url.url_path,
            **url.query_params,
        )

    return _factory


# Datastore type -> factory. Each factory takes a URL object and returns a
# functools.partial around the corresponding unbound Daft DataFrame writer.
DAFT_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: _daft_writer_to_path("write_csv"),
    DatastoreType.DELTA_LAKE: _daft_writer_to_path("write_deltalake"),
    # Iceberg is the only writer here that does not receive the URL path;
    # only the query params are forwarded.
    DatastoreType.ICEBERG: lambda url: functools.partial(
        daft.DataFrame.write_iceberg,
        **url.query_params,
    ),
    DatastoreType.LANCE: _daft_writer_to_path("write_lance"),
    DatastoreType.PARQUET: _daft_writer_to_path("write_parquet"),
}
308
+
309
def _pyarrow_csv_reader(url):
    """Bind ``pa_utils.read_csv`` to the URL path with threading disabled."""
    return functools.partial(
        pa_utils.read_csv,
        url.url_path,
        read_options=pacsv.ReadOptions(use_threads=False),
        **url.query_params,
    )


def _pyarrow_feather_reader(url):
    """Bind ``pa_utils.read_feather`` to the URL path with threading disabled."""
    return functools.partial(
        pa_utils.read_feather,
        url.url_path,
        use_threads=False,
        **url.query_params,
    )


def _pyarrow_json_reader(url):
    """Bind ``pa_utils.read_json`` to the URL path with threading disabled."""
    # The read options are passed positionally here (unlike the CSV reader,
    # which uses the ``read_options`` keyword).
    return functools.partial(
        pa_utils.read_json,
        url.url_path,
        pajson.ReadOptions(use_threads=False),
        **url.query_params,
    )


def _pyarrow_orc_reader(url):
    """Bind ``pa_utils.read_orc`` to the URL path and query params."""
    return functools.partial(
        pa_utils.read_orc,
        url.url_path,
        **url.query_params,
    )


def _pyarrow_parquet_reader(url):
    """Bind ``pa_utils.read_parquet`` to the URL path with threading disabled."""
    return functools.partial(
        pa_utils.read_parquet,
        url.url_path,
        use_threads=False,
        **url.query_params,
    )


def _pyarrow_text_reader(url):
    """
    Bind ``pa_utils.read_csv`` so a text file is read as one "text" column.

    URL query params are not forwarded for this reader.
    """
    text_read_options = pacsv.ReadOptions(
        use_threads=False,
        column_names=["text"],
    )
    text_parse_options = pacsv.ParseOptions(
        delimiter=chr(25),  # end of medium char
        quote_char=False,
        double_quote=False,
    )
    text_convert_options = pacsv.ConvertOptions(
        check_utf8=False,
        column_types={"text": pa.string()},
    )
    return functools.partial(
        pa_utils.read_csv,
        url.url_path,
        read_options=text_read_options,
        parse_options=text_parse_options,
        convert_options=text_convert_options,
    )


# Datastore type -> factory. Each factory takes a URL object and returns a
# functools.partial around the corresponding deltacat PyArrow reader.
PYARROW_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: _pyarrow_csv_reader,
    DatastoreType.FEATHER: _pyarrow_feather_reader,
    DatastoreType.JSON: _pyarrow_json_reader,
    DatastoreType.ORC: _pyarrow_orc_reader,
    DatastoreType.PARQUET: _pyarrow_parquet_reader,
    DatastoreType.TEXT: _pyarrow_text_reader,
}
357
+
358
def _pyarrow_writer_to_path(writer_name):
    """
    Create a writer factory for a ``deltacat.utils.pyarrow`` write function.

    The write function is resolved on ``pa_utils`` by name only when the
    returned factory is called, not when this module is imported.

    Args:
        writer_name: Attribute name of the ``pa_utils`` write function.

    Returns:
        A callable taking a URL object and returning a ``functools.partial``
        that binds the writer to ``path=url.url_path`` plus
        ``url.query_params`` as keyword arguments.
    """

    def _factory(url):
        return functools.partial(
            getattr(pa_utils, writer_name),
            path=url.url_path,
            **url.query_params,
        )

    return _factory


# Datastore type -> factory. Each factory takes a URL object and returns a
# functools.partial around the corresponding deltacat PyArrow writer.
PYARROW_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: _pyarrow_writer_to_path("write_csv"),
    DatastoreType.FEATHER: _pyarrow_writer_to_path("write_feather"),
    DatastoreType.ORC: _pyarrow_writer_to_path("write_orc"),
    DatastoreType.PARQUET: _pyarrow_writer_to_path("write_parquet"),
}
380
+
381
def _polars_reader_for(read_fn, url):
    """Bind `url`'s path and query parameters to a Polars reader function."""
    return functools.partial(read_fn, url.url_path, **url.query_params)


# Polars reader factories keyed by datastore type. Each factory takes a
# parsed DeltaCAT URL and returns a reader with the URL's path pre-bound.
POLARS_DATASTORE_TYPE_TO_READER = {
    # CSV reads are pinned to a single thread.
    DatastoreType.CSV: lambda url: functools.partial(
        pl.read_csv, url.url_path, n_threads=1, **url.query_params
    ),
    DatastoreType.DELTA_LAKE: lambda url: _polars_reader_for(
        pl.read_delta, url
    ),
    DatastoreType.ICEBERG: lambda url: _polars_reader_for(
        pl.scan_iceberg, url
    ),
    DatastoreType.JSON: lambda url: _polars_reader_for(pl.read_json, url),
    DatastoreType.PARQUET: lambda url: _polars_reader_for(
        pl.read_parquet, url
    ),
    # Plain text goes through the CSV reader as one headerless, unquoted
    # column named "text"; URL query parameters are not forwarded here.
    DatastoreType.TEXT: lambda url: functools.partial(
        pl.read_csv,
        url.url_path,
        new_columns=["text"],
        n_threads=1,
        separator=chr(25),  # end of medium char
        has_header=False,
        quote_char=None,
        infer_schema=False,
    ),
}
419
+
420
def _polars_writer_for(write_fn, path_kwarg, url):
    """
    Bind `url`'s path (under the keyword `path_kwarg`) and its query
    parameters to an unbound Polars DataFrame writer method.
    """
    return functools.partial(
        write_fn,
        **{path_kwarg: url.url_path},
        **url.query_params,
    )


# Polars writer factories keyed by datastore type. The bound callables are
# unbound `pl.DataFrame` methods, so the DataFrame to write is supplied when
# the returned partial is invoked.
POLARS_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.AVRO: lambda url: _polars_writer_for(
        pl.DataFrame.write_avro, "file", url
    ),
    DatastoreType.CSV: lambda url: _polars_writer_for(
        pl.DataFrame.write_csv, "file", url
    ),
    DatastoreType.DELTA_LAKE: lambda url: _polars_writer_for(
        pl.DataFrame.write_delta, "target", url
    ),
    DatastoreType.ICEBERG: lambda url: _polars_writer_for(
        pl.DataFrame.write_iceberg, "target", url
    ),
    # JSON output is written as newline-delimited JSON.
    DatastoreType.JSON: lambda url: _polars_writer_for(
        pl.DataFrame.write_ndjson, "file", url
    ),
    DatastoreType.PARQUET: lambda url: _polars_writer_for(
        pl.DataFrame.write_parquet, "file", url
    ),
}
452
+
453
def _pandas_reader_for(read_fn, url):
    """Bind `url`'s path and query parameters to a Pandas reader function."""
    return functools.partial(read_fn, url.url_path, **url.query_params)


# Pandas reader factories keyed by datastore type. Each factory takes a
# parsed DeltaCAT URL and returns the matching `pd.read_*` function with the
# URL's path and query parameters already bound.
PANDAS_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: lambda url: _pandas_reader_for(pd.read_csv, url),
    DatastoreType.FEATHER: lambda url: _pandas_reader_for(
        pd.read_feather, url
    ),
    DatastoreType.HDF: lambda url: _pandas_reader_for(pd.read_hdf, url),
    DatastoreType.HTML: lambda url: _pandas_reader_for(pd.read_html, url),
    DatastoreType.JSON: lambda url: _pandas_reader_for(pd.read_json, url),
    DatastoreType.ORC: lambda url: _pandas_reader_for(pd.read_orc, url),
    DatastoreType.PARQUET: lambda url: _pandas_reader_for(
        pd.read_parquet, url
    ),
    DatastoreType.XML: lambda url: _pandas_reader_for(pd.read_xml, url),
}
495
+
496
def _pandas_writer_for(write_fn, path_kwarg, url):
    """
    Bind `url`'s path (under the keyword `path_kwarg`) and its query
    parameters to an unbound Pandas DataFrame writer method.
    """
    return functools.partial(
        write_fn,
        **{path_kwarg: url.url_path},
        **url.query_params,
    )


# Pandas writer factories keyed by datastore type. The destination keyword
# differs between `pd.DataFrame.to_*` methods, so it is named per entry. The
# bound callables are unbound methods: the DataFrame to write is supplied
# when the returned partial is invoked.
PANDAS_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: lambda url: _pandas_writer_for(
        pd.DataFrame.to_csv, "path_or_buf", url
    ),
    DatastoreType.FEATHER: lambda url: _pandas_writer_for(
        pd.DataFrame.to_feather, "path", url
    ),
    DatastoreType.HDF: lambda url: _pandas_writer_for(
        pd.DataFrame.to_hdf, "path_or_buf", url
    ),
    DatastoreType.HTML: lambda url: _pandas_writer_for(
        pd.DataFrame.to_html, "buf", url
    ),
    DatastoreType.JSON: lambda url: _pandas_writer_for(
        pd.DataFrame.to_json, "path_or_buf", url
    ),
    DatastoreType.ORC: lambda url: _pandas_writer_for(
        pd.DataFrame.to_orc, "path", url
    ),
    DatastoreType.PARQUET: lambda url: _pandas_writer_for(
        pd.DataFrame.to_parquet, "path", url
    ),
    DatastoreType.XML: lambda url: _pandas_writer_for(
        pd.DataFrame.to_xml, "path_or_buffer", url
    ),
}
538
+
539
# NumPy reader factories keyed by datastore type. Each factory takes a
# parsed DeltaCAT URL and returns a NumPy loader with the URL's path and
# query parameters already bound.
NUMPY_DATASTORE_TYPE_TO_READER = {
    DatastoreType.BINARY: lambda url: functools.partial(
        np.fromfile,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.CSV: lambda url: functools.partial(
        np.genfromtxt,
        url.url_path,
        # `np.genfromtxt` splits on whitespace unless told otherwise, so
        # default to a comma to match the CSV writer. The default is merged
        # underneath the query parameters so that a `delimiter` supplied in
        # the URL still takes precedence instead of raising a duplicate
        # keyword error.
        **{"delimiter": ",", **url.query_params},
    ),
    DatastoreType.NUMPY: lambda url: functools.partial(
        np.load,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        np.loadtxt,
        url.url_path,
        **url.query_params,
    ),
}
561
+
562
def _numpy_writer_for(write_fn, url):
    """Bind `url`'s path and query parameters to a NumPy save function."""
    return functools.partial(write_fn, url.url_path, **url.query_params)


# NumPy writer factories keyed by datastore type. The destination path is
# bound as the first positional argument, so the array(s) to save follow it
# when the returned partial is invoked.
NUMPY_DATASTORE_TYPE_TO_WRITER = {
    # CSV is text output with a comma delimiter.
    DatastoreType.CSV: lambda url: functools.partial(
        np.savetxt, url.url_path, delimiter=",", **url.query_params
    ),
    DatastoreType.NUMPY: lambda url: _numpy_writer_for(
        np.savez_compressed, url
    ),
    DatastoreType.TEXT: lambda url: _numpy_writer_for(np.savetxt, url),
}
580
+
581
# First-level dispatch for reads: maps a dataset type to the table that maps
# datastore types to reader factories for that dataset type.
DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER = dict(
    (
        (DatasetType.RAY_DATASET, RAY_DATASTORE_TYPE_TO_READER),
        (DatasetType.DAFT, DAFT_DATASTORE_TYPE_TO_READER),
        (DatasetType.PANDAS, PANDAS_DATASTORE_TYPE_TO_READER),
        (DatasetType.POLARS, POLARS_DATASTORE_TYPE_TO_READER),
        (DatasetType.PYARROW, PYARROW_DATASTORE_TYPE_TO_READER),
        (DatasetType.NUMPY, NUMPY_DATASTORE_TYPE_TO_READER),
    )
)
589
+
590
# First-level dispatch for writes: maps a dataset type to the table that maps
# datastore types to writer factories for that dataset type.
DATASET_TYPE_TO_DATASTORE_TYPE_WRITER_RESOLVER = dict(
    (
        (DatasetType.RAY_DATASET, RAY_DATASTORE_TYPE_TO_WRITER),
        (DatasetType.DAFT, DAFT_DATASTORE_TYPE_TO_WRITER),
        (DatasetType.PANDAS, PANDAS_DATASTORE_TYPE_TO_WRITER),
        (DatasetType.POLARS, POLARS_DATASTORE_TYPE_TO_WRITER),
        (DatasetType.PYARROW, PYARROW_DATASTORE_TYPE_TO_WRITER),
        (DatasetType.NUMPY, NUMPY_DATASTORE_TYPE_TO_WRITER),
    )
)
598
+
599
+
600
class DeltaCatUrl:
    """
    Class for parsing DeltaCAT URLs, which are used to unambiguously locate
    any internal object(s) already registered in a DeltaCAT catalog, or external
    object(s) that could be registered in a DeltaCAT catalog.

    Valid DeltaCAT URLs that reference internal catalog objects registered in a
    DeltaCAT catalog include:

    dc://<catalog>/[namespace]/[table]/[tableversion]/[stream]/[partition]/[delta]
    namespace://<namespace>/[table]/[tableversion]/[stream]/[partition]/[delta]
    table://<table>/[tableversion]/[stream]/[partition]/[delta]

    Where <arg> is a required part of the URL and [arg] is an optional part of
    the URL.

    Valid DeltaCAT URLs that reference external objects include most types
    readable into any supported DeltaCAT dataset type (e.g., Ray Data, Daft,
    PyArrow, Pandas, Numpy). External object URLs take the form
    <DatastoreType>+<URL> or, to be more explicit,
    <DatastoreType>+<scheme>://<path> where `DatastoreType` is any value
    from :class:`deltacat.types.media.DatastoreType`

    To reference a file on local disk, replace <scheme>:// with "file" or
    "local". To read an absolute local file path, use "file:///" or
    "local:///". To read a local file path relative to the current working
    directory, use "local://".

    audio+<scheme>://<path>?param1=val1&param2=val2&...
    avro+<scheme>://<path>?param1=val1&param2=val2&...
    binary+<scheme>://<path>?param1=val1&param2=val2&...
    csv+<scheme>://<path>?param1=val1&param2=val2&...
    deltasharing+<scheme>://<path>?param1=val1&param2=val2&...
    hudi+<scheme>://<path>?param1=val1&param2=val2&...
    images+<scheme>://<path>?param1=val1&param2=val2&...
    json+<scheme>://<path>?param1=val1&param2=val2&...
    lance+<scheme>://<path>?param1=val1&param2=val2&...
    numpy+<scheme>://<path>?param1=val1&param2=val2&...
    parquet+<scheme>://<path>?param1=val1&param2=val2&...
    text+<scheme>://<path>?param1=val1&param2=val2&...
    tfrecords+<scheme>://<path>?param1=val1&param2=val2&...
    videos+<scheme>://<path>?param1=val1&param2=val2&...
    webdataset+<scheme>://<path>?param1=val1&param2=val2&...

    Some DeltaCAT URLs reference special types of external objects
    locatable via custom URLs that don't conform to the usual
    <DatastoreType>+<URL> convention shown above, like:

    <mongodb_uri>?database=<db_name>&collection=<collection_name>&...
    bigquery://<project_id>/<dataset>?param1=val1&...
    <clickhouse_dsn>?table=<table_name>&param1=val1&...
    databricks://<warehouse_id>?param1=val1&...
    iceberg://<table_identifier>?param1=val1&...

    Note that, for reads, each of the above URLs typically resolves directly
    to the equivalent :class:`deltacat.types.media.DatasetType` reader. For
    example, if Ray Data is the dataset type then the equivalent
    ray.data.read_{} API is used. In this case, a read referencing a URL of the
    form "audio+file:///my/audio.mp4" would resolve to a call to
    ray.data.read_audio("/my/audio.mp4").
    """

    # Auto-resolved DeltaCAT catalog path default identifiers
    DELTACAT_URL_DEFAULT_CATALOG = "default"
    DELTACAT_URL_DEFAULT_NAMESPACE = "default"
    DELTACAT_URL_DEFAULT_TABLE_VERSION = "default"
    DELTACAT_URL_DEFAULT_STREAM = "default"

    # Number of catalog path identifiers below the catalog itself:
    # namespace, table, table version, stream, partition, delta.
    _PATH_IDENTIFIER_COUNT = 6

    def __init__(
        self,
        url: str,
    ):
        """
        Parse `url` and populate the attributes describing what it locates.

        Args:
            url: The DeltaCAT URL to parse.

        Raises:
            ValueError: If the URL scheme holds more than one `+` separator.
                The datastore type and stream format lookups are presumably
                enum conversions that also raise ValueError for unknown
                values.
        """
        # TODO(pdames): Handle wildcard `*` and `**` at end of url.
        self.catalog_name = None
        # Set by `resolve_catalog()`. Defined here so that reading
        # `url.catalog` before resolution yields None instead of raising
        # AttributeError.
        self.catalog = None
        self.parsed = urlparse(url, allow_fragments=False)  # support '#' in path
        self.url = self.parsed.geturl()
        # Keep only the non-empty path elements; leading, trailing, and
        # repeated slashes contribute nothing.
        self.path_elements = [
            element for element in self.parsed.path.strip("/").split("/") if element
        ]
        # Split the scheme into the root DeltaCAT scheme and the path scheme
        self.scheme_elements = self.parsed.scheme.split("+")
        self.datastore_type = DatastoreType(self.scheme_elements[0])
        if len(self.scheme_elements) == 2:
            # Remove the source/sink type from the scheme.
            self.parsed = self.parsed._replace(scheme=self.scheme_elements[1])
            # Save the URL path to read/write w/o the source/sink type.
            self.url_path = urlunparse(self.parsed)
        elif len(self.scheme_elements) > 2:
            raise ValueError(f"Invalid DeltaCAT URL: {url}")
        self.query_params = parse_qs(self.parsed.query) if self.parsed.query else {}
        # The three catalog URL forms differ only in which leading
        # identifiers are carried by the network location or implied by a
        # default, so each is normalized to the same identifier sequence.
        netloc = self.parsed.netloc
        if self.datastore_type == DatastoreType.DELTACAT:
            self.catalog_name = netloc
            self._assign_path_identifiers(self.path_elements)
        elif self.datastore_type == DatastoreType.DELTACAT_NAMESPACE:
            self.catalog_name = DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG
            self._assign_path_identifiers([netloc, *self.path_elements])
        elif self.datastore_type == DatastoreType.DELTACAT_TABLE:
            self.catalog_name = DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG
            self._assign_path_identifiers(
                [
                    DeltaCatUrl.DELTACAT_URL_DEFAULT_NAMESPACE,
                    netloc,
                    *self.path_elements,
                ]
            )

    def _assign_path_identifiers(self, identifiers):
        """
        Store the ordered catalog path identifiers (namespace, table, table
        version, stream, partition, delta) and resolve their defaults.

        Identifiers missing from the end of `identifiers` are stored as None;
        any beyond the sixth are ignored.
        """
        count = DeltaCatUrl._PATH_IDENTIFIER_COUNT
        padded = list(identifiers[:count])
        padded.extend([None] * (count - len(padded)))
        (
            self.unresolved_namespace,
            self.table,
            self.unresolved_table_version,
            self.unresolved_stream,
            self.partition,
            self.delta,
        ) = padded
        self._resolve_deltacat_path_identifiers()

    def is_deltacat_catalog_url(self):
        """Return True if this URL names an object in a DeltaCAT catalog."""
        return bool(self.catalog_name)

    def resolve_catalog(self):
        """
        Look up the catalog named by this URL and store it in `self.catalog`.

        Does nothing when the URL has no catalog name.

        Raises:
            ValueError: If the named catalog's inner object is not a
                `CatalogProperties` instance.
        """
        if self.catalog_name:
            # NOTE(review): the default catalog name is looked up by name
            # like any other. The previous code assigned None for it and then
            # unconditionally overwrote that value with this lookup, so the
            # effective behavior is unchanged.
            self.catalog: CatalogProperties = dc.get_catalog(self.catalog_name).inner
            if not isinstance(self.catalog, CatalogProperties):
                raise ValueError(
                    f"Expected catalog `{self.catalog_name}` to be a DeltaCAT "
                    f"catalog but found: {self.catalog}"
                )

    def _resolve_deltacat_path_identifiers(self):
        """
        Derive `namespace`, `table_version`, and `stream` from their
        unresolved URL forms, mapping the case-insensitive "default"
        placeholder to the corresponding default value.
        """
        dc.raise_if_not_initialized()
        self.namespace = self.table_version = self.stream = None
        if self.unresolved_namespace:
            if (
                self.unresolved_namespace.lower()
                == DeltaCatUrl.DELTACAT_URL_DEFAULT_NAMESPACE
            ):
                self.namespace = DEFAULT_NAMESPACE
            else:
                self.namespace = self.unresolved_namespace
        # A "default" table version is left as None.
        if (
            self.unresolved_table_version
            and self.unresolved_table_version.lower()
            != DeltaCatUrl.DELTACAT_URL_DEFAULT_TABLE_VERSION
        ):
            self.table_version = self.unresolved_table_version
        if self.unresolved_stream:
            if (
                self.unresolved_stream.lower()
                == DeltaCatUrl.DELTACAT_URL_DEFAULT_STREAM
            ):
                self.stream = StreamFormat.DELTACAT
            else:
                self.stream = StreamFormat(self.unresolved_stream)

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.url
785
+
786
+
787
def _list_table_versions(table: Table, catalog: CatalogProperties):
    """List the versions of `table` in `catalog`, locating it by its own
    namespace and table name."""
    table_identity = {
        "namespace": table.namespace,
        "table_name": table.table_name,
    }
    return metastore.list_table_versions(catalog=catalog, **table_identity)
793
+
794
+
795
def _list_streams(table_version: TableVersion, catalog: CatalogProperties):
    """List the streams of `table_version` in `catalog`, locating it by its
    own namespace, table name, and table version."""
    table_version_identity = {
        "namespace": table_version.namespace,
        "table_name": table_version.table_name,
        "table_version": table_version.table_version,
    }
    return metastore.list_streams(catalog=catalog, **table_version_identity)
802
+
803
+
804
class DeltaCatUrlReader:
    """
    Resolves the reader for a DeltaCAT URL.

    For catalog URLs the reader is a metastore getter bound to the object the
    URL names, and `listers` describes how to enumerate the objects beneath
    it. For external URLs the reader is looked up from the dataset type and
    the URL's datastore type, and `listers` is empty.
    """

    def __init__(
        self,
        url: DeltaCatUrl,
        dataset_type: DatasetType = DatasetType.RAY_DATASET,
    ):
        """
        Args:
            url: Parsed URL to read from.
            dataset_type: Dataset type to read external URLs into. Not used
                for catalog URLs.

        Raises:
            ValueError: If no reader can be resolved for the URL.
        """
        self._url = url
        # External URLs have no catalog hierarchy to list; default to an
        # empty list so the `listers` property is always readable.
        self._listers = []
        if url.is_deltacat_catalog_url():
            url.resolve_catalog()
            self._reader = DeltaCatUrlReader.resolve_dc_reader(url)
            self._listers = DeltaCatUrlReader.resolve_dc_listers(url)
        else:
            self._reader = DeltaCatUrlReader.dataset_and_datastore_type_to_reader(
                dataset_type,
                url.datastore_type,
            )

    @property
    def url(self) -> DeltaCatUrl:
        return self._url

    @property
    def listers(
        self,
    ) -> List[
        Tuple[
            Callable[[Any], ListResult[Metafile]],
            Optional[str],
            Optional[Callable[[Metafile], Union[Metafile, str]]],
        ]
    ]:
        """Lister steps from `resolve_dc_listers` (empty for external URLs)."""
        return self._listers

    def read(self, *args, **kwargs) -> Dataset:
        """
        Invoke the resolved reader, forwarding `args` and `kwargs` to it.

        External readers are factories that are first given the URL to bind
        its path and query parameters.
        """
        if self._url.is_deltacat_catalog_url():
            return self._reader(*args, **kwargs)
        else:
            return self._reader(self._url)(*args, **kwargs)

    @staticmethod
    def resolve_dc_reader(url: DeltaCatUrl) -> Callable:
        """
        Return a metastore getter bound to the most specific catalog object
        named by `url` (delta, then partition, stream, table version, table,
        namespace, and finally the catalog itself).

        Raises:
            ValueError: If `url` names no catalog object.
        """
        if url.delta:
            return functools.partial(
                metastore.get_delta,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                partition_values=json.loads(url.partition),
                # URL path elements are strings; convert the stream position
                # to an int as the writer does for the same identifier.
                stream_position=int(url.delta),
                catalog=url.catalog,
            )
        if url.partition:
            return functools.partial(
                metastore.get_partition,
                stream_locator=StreamLocator.at(
                    namespace=url.namespace,
                    table_name=url.table,
                    table_version=url.table_version,
                    stream_id=None,
                    stream_format=url.stream,
                ),
                partition_values=json.loads(url.partition),
                catalog=url.catalog,
            )
        if url.unresolved_stream:
            return functools.partial(
                metastore.get_stream,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_format=url.stream,
                catalog=url.catalog,
            )
        if url.unresolved_table_version:
            return functools.partial(
                metastore.get_table_version,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                catalog=url.catalog,
            )
        if url.table:
            return functools.partial(
                metastore.get_table,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
        if url.unresolved_namespace:
            return functools.partial(
                metastore.get_namespace,
                namespace=url.namespace,
                catalog=url.catalog,
            )
        if url.catalog_name:
            return functools.partial(
                dc.get_catalog,
                name=url.catalog_name,
            )
        raise ValueError("No DeltaCAT object to read.")

    @staticmethod
    def _partition_and_delta_steps(catalog) -> list:
        """Return the generic lister steps that expand each stream into its
        partitions and each partition into its deltas."""
        return [
            (
                functools.partial(metastore.list_stream_partitions, catalog=catalog),
                "stream",
                lambda x: x,
            ),
            (
                functools.partial(metastore.list_partition_deltas, catalog=catalog),
                "partition_like",
                lambda x: x,
            ),
        ]

    @staticmethod
    def resolve_dc_listers(
        url: DeltaCatUrl,
    ) -> List[
        Tuple[
            Callable[[Any], ListResult[Metafile]],
            Optional[str],
            Optional[Callable[[Metafile], Union[Metafile, str]]],
        ]
    ]:
        """
        Return the ordered lister steps that enumerate every object beneath
        the catalog object named by `url`, down to deltas.

        Each step is a `(lister, kwarg_name, transform)` tuple. The first
        step has no keyword name or transform. For later steps, the tuple
        pairs a keyword name with a function of a metafile; presumably the
        caller passes `transform(parent)` to `lister` under `kwarg_name` for
        each object produced by the previous step.

        Raises:
            ValueError: If `url` names no catalog object.
        """
        if url.partition:
            partition_locator = PartitionLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
                partition_values=json.loads(url.partition),
                partition_id=None,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                partition_like=partition_locator,
                catalog=url.catalog,
            )
            return [(delta_lister, None, None)]
        if url.unresolved_stream:
            stream_locator = StreamLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
            )
            stream = Stream.of(
                locator=stream_locator,
                partition_scheme=None,
            )
            partition_lister = functools.partial(
                metastore.list_stream_partitions,
                stream=stream,
                catalog=url.catalog,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                catalog=url.catalog,
            )
            return [
                (partition_lister, None, None),
                (delta_lister, "partition_like", lambda x: x),
            ]
        if url.unresolved_table_version:
            stream_lister = functools.partial(
                metastore.list_streams,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                catalog=url.catalog,
            )
            return [
                (stream_lister, None, None),
                *DeltaCatUrlReader._partition_and_delta_steps(url.catalog),
            ]
        if url.table:
            table_version_lister = functools.partial(
                metastore.list_table_versions,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
            stream_lister = functools.partial(
                metastore.list_streams,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
            return [
                (table_version_lister, None, None),
                (stream_lister, "table_version", lambda x: x.table_version),
                *DeltaCatUrlReader._partition_and_delta_steps(url.catalog),
            ]
        if url.unresolved_namespace:
            table_lister = functools.partial(
                metastore.list_tables,
                namespace=url.namespace,
                catalog=url.catalog,
            )
            table_version_lister = functools.partial(
                metastore.list_table_versions,
                namespace=url.namespace,
                catalog=url.catalog,
            )
            # The URL names no table at this level, so the table name cannot
            # be bound up front. Derive the namespace, table name, and table
            # version from each listed table version instead, as the
            # catalog-level listers do.
            stream_lister = functools.partial(
                _list_streams,
                catalog=url.catalog,
            )
            return [
                (table_lister, None, None),
                (table_version_lister, "table_name", lambda x: x.table_name),
                (stream_lister, "table_version", lambda x: x),
                *DeltaCatUrlReader._partition_and_delta_steps(url.catalog),
            ]
        if url.catalog_name:
            namespace_lister = functools.partial(
                metastore.list_namespaces,
                catalog=url.catalog,
            )
            table_lister = functools.partial(
                metastore.list_tables,
                catalog=url.catalog,
            )
            table_version_lister = functools.partial(
                _list_table_versions,
                catalog=url.catalog,
            )
            stream_lister = functools.partial(
                _list_streams,
                catalog=url.catalog,
            )
            return [
                (namespace_lister, None, None),
                (table_lister, "namespace", lambda x: x.namespace),
                (table_version_lister, "table", lambda x: x),
                (stream_lister, "table_version", lambda x: x),
                *DeltaCatUrlReader._partition_and_delta_steps(url.catalog),
            ]
        raise ValueError("No DeltaCAT objects to list.")

    @staticmethod
    def dataset_and_datastore_type_to_reader(
        dataset_type: DatasetType,
        datastore_type: DatastoreType,
    ):
        """
        Look up the reader factory for a dataset type and datastore type.

        Returns:
            A factory that takes a parsed URL and returns a reader callable.

        Raises:
            ValueError: If the dataset type is unsupported, or if it has no
                reader for the datastore type.
        """
        reader_resolver = DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER.get(
            dataset_type
        )
        if reader_resolver is None:
            raise ValueError(
                f"Unsupported dataset type: {dataset_type}. "
                f"Supported dataset types: {[dt.name for dt in DatasetType]}"
            )
        reader = reader_resolver.get(datastore_type)
        if reader is None:
            raise ValueError(
                f"Dataset type `{dataset_type}` has no reader for "
                f"datastore type: `{datastore_type}`. "
                f"Supported datastore types: {[k.name for k in reader_resolver.keys()]}"
            )
        return reader
1092
+
1093
+
1094
def _stage_and_commit_stream(
    stream: Stream,
    *args,
    **kwargs,
) -> Stream:
    """
    Stage a new stream at the input stream's namespace, table name, table
    version, and stream format, then commit it (e.g., as part of a copy
    operation from another catalog). The committed stream will be assigned a
    different unique ID than the input stream.

    Extra positional and keyword arguments are forwarded unchanged to both
    the stage and the commit call.
    """
    staged_stream = metastore.stage_stream(
        *args,
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        stream_format=StreamFormat(stream.stream_format),
        **kwargs,
    )
    return metastore.commit_stream(*args, stream=staged_stream, **kwargs)
1117
+
1118
+
1119
def _stage_and_commit_partition(
    partition: Partition,
    *args,
    **kwargs,
) -> Partition:
    """
    Stage a new partition with the input partition's values and scheme ID in
    its parent stream, then commit it (e.g., as part of a copy operation from
    another catalog). The committed partition will be assigned a different
    unique ID than the input partition.

    Extra positional and keyword arguments are forwarded unchanged to the
    stream lookup, the stage call, and the commit call.
    """
    # The parent stream is looked up by the partition's own identifiers; a
    # missing stream format falls back to the DeltaCAT format.
    parent_stream = metastore.get_stream(
        *args,
        namespace=partition.namespace,
        table_name=partition.table_name,
        table_version=partition.table_version,
        stream_format=StreamFormat(
            partition.stream_format or StreamFormat.DELTACAT.value
        ),
        **kwargs,
    )
    staged_partition = metastore.stage_partition(
        *args,
        stream=parent_stream,
        partition_values=partition.partition_values,
        partition_scheme_id=partition.partition_scheme_id,
        **kwargs,
    )
    return metastore.commit_partition(
        *args,
        partition=staged_partition,
        **kwargs,
    )
1151
+
1152
+
1153
class DeltaCatUrlWriter:
    """
    Resolves the writer for a DeltaCAT URL.

    For catalog URLs the writer is a metastore (or catalog) call bound to the
    object the URL names, optionally seeded from an existing metafile. For
    external URLs the writer is looked up from the dataset type and the URL's
    datastore type.
    """

    def __init__(
        self,
        url: DeltaCatUrl,
        dataset_type: DatasetType = DatasetType.RAY_DATASET,
        metafile: Optional[Metafile] = None,
    ):
        """
        Args:
            url: Parsed URL to write to.
            dataset_type: Dataset type being written to an external URL. Not
                used for catalog URLs.
            metafile: Optional metafile whose contents seed the catalog
                object being written. Not used for external URLs.

        Raises:
            ValueError: If no writer can be resolved for the URL.
        """
        self._url = url
        self._metafile = metafile

        if url.is_deltacat_catalog_url():
            # Resolve the target catalog unless the catalog itself is the
            # object being written. `namespace://` and `table://` URLs carry
            # their first identifier in the network location rather than the
            # path, so checking the path elements alone would leave their
            # catalog unresolved.
            if url.path_elements or url.unresolved_namespace:
                url.resolve_catalog()
            self._writer = DeltaCatUrlWriter.resolve_dc_writer(url, metafile or {})
        else:
            self._writer = DeltaCatUrlWriter.dataset_and_datastore_type_to_writer(
                dataset_type,
                url.datastore_type,
            )

    @property
    def url(self) -> DeltaCatUrl:
        return self._url

    @property
    def metafile(self) -> Optional[Metafile]:
        return self._metafile

    def write(self, suffix: str = "", *args, **kwargs) -> Union[Metafile, str]:
        """
        Invoke the resolved writer, forwarding `args` and `kwargs` to it.

        Args:
            suffix: Text appended to the URL to form the destination of an
                external write. Not used for catalog URLs.

        Returns:
            The catalog writer's result for catalog URLs; otherwise the
            destination URL path that was written.
        """
        if self._url.is_deltacat_catalog_url():
            return self._writer(*args, **kwargs)
        else:
            dest_url = DeltaCatUrl(f"{self._url.url}{suffix}")
            self._writer(dest_url)(*args, **kwargs)
            return dest_url.url_path

    @staticmethod
    def resolve_dc_writer(
        url: DeltaCatUrl,
        metafile: Metafile,
    ) -> Callable:
        """
        Return a writer bound to the most specific catalog object named by
        `url` (delta, then partition, stream, table version, table,
        namespace, and finally the catalog itself).

        The object written is built from `metafile` with its locator replaced
        by one derived from `url`.

        Raises:
            ValueError: If `url` names no catalog object.
        """
        if url.delta:
            delta: Delta = Delta(
                Metafile.based_on(
                    other=metafile,
                    new_id=url.delta,
                )
            )
            delta.locator = DeltaLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
                partition_values=json.loads(url.partition),
                partition_id=None,
                stream_position=int(url.delta),
            )
            # TODO(pdames): Honor deep vs. shallow copies. Deep copies require
            #  first ensuring that all files in the source delta manifest are
            #  staged to the target catalog before commit. For deltas whose
            #  manifests reference local files, shallow delta copies may be
            #  invalid in the target catalog, and should be blocked or
            #  converted to a deep copy automatically.
            return functools.partial(
                metastore.commit_delta,
                delta=delta,
                catalog=url.catalog,
            )
        if url.partition:
            partition: Partition = Partition(metafile)
            partition.locator = PartitionLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
                partition_values=json.loads(url.partition),
                partition_id=None,
            )
            return functools.partial(
                _stage_and_commit_partition,
                partition=partition,
                catalog=url.catalog,
            )
        if url.unresolved_stream:
            stream: Stream = Stream(metafile)
            stream.locator = StreamLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
            )
            return functools.partial(
                _stage_and_commit_stream,
                stream=stream,
                catalog=url.catalog,
            )
        if url.unresolved_table_version:
            table_version: TableVersion = TableVersion(metafile)
            table_version.locator = TableVersionLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
            )
            return functools.partial(
                metastore.create_table_version,
                namespace=table_version.namespace,
                table_name=table_version.table_name,
                table_version=table_version.table_version,
                lifecycle_state=table_version.state,
                schema=table_version.schema,
                partition_scheme=table_version.partition_scheme,
                sort_keys=table_version.sort_scheme,
                table_version_description=table_version.description,
                table_version_properties=table_version.properties,
                supported_content_types=table_version.content_types,
                catalog=url.catalog,
            )
        if url.table:
            table: Table = Table(metafile)
            table.locator = TableLocator.at(
                namespace=url.namespace,
                table_name=url.table,
            )
            return functools.partial(
                metastore.create_table,
                namespace=table.namespace,
                table_name=table.table_name,
                description=table.description,
                properties=table.properties,
                catalog=url.catalog,
            )
        if url.unresolved_namespace:
            namespace: Namespace = Namespace(metafile)
            namespace.locator = NamespaceLocator.of(
                namespace=url.namespace,
            )
            return functools.partial(
                metastore.create_namespace,
                namespace=url.namespace,
                properties=namespace.properties,
                catalog=url.catalog,
            )
        if url.catalog_name:
            return functools.partial(
                dc.put_catalog,
                name=url.catalog_name,
            )
        raise ValueError("No DeltaCAT object to write.")

    @staticmethod
    def dataset_and_datastore_type_to_writer(
        dataset_type: DatasetType,
        datastore_type: DatastoreType,
    ):
        """
        Look up the writer factory for a dataset type and datastore type.

        Returns:
            A factory that takes a parsed URL and returns a writer callable.

        Raises:
            ValueError: If the dataset type is unsupported, or if it has no
                writer for the datastore type.
        """
        writer_resolver = DATASET_TYPE_TO_DATASTORE_TYPE_WRITER_RESOLVER.get(
            dataset_type
        )
        if writer_resolver is None:
            raise ValueError(
                f"Unsupported dataset type: {dataset_type}. "
                f"Supported dataset types: {[dt.name for dt in DatasetType]}"
            )
        writer = writer_resolver.get(datastore_type)
        if writer is None:
            raise ValueError(
                f"Dataset type `{dataset_type}` has no writer for "
                f"datastore type: `{datastore_type}`. "
                f"Supported datastore types: {[k.name for k in writer_resolver.keys()]}"
            )
        return writer