deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,172 @@
1
+ from typing import List, Optional, Union, Dict, Any
2
+
3
+ from ray.data import Dataset as RayDataset
4
+ from ray.data import read_datasource
5
+
6
+ from deltacat.io.datasource.deltacat_datasource import DeltaCatDatasource
7
+ from deltacat.io.dataset.deltacat_dataset import DeltaCatDataset
8
+ from deltacat.utils.common import ReadKwargsProvider
9
+ from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlReader
10
+ from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
11
+
12
+
13
class EmptyReadKwargsProvider(ReadKwargsProvider):
    """Read kwargs provider that never contributes any keyword arguments.

    Used as the default ``read_kwargs_provider`` of :func:`read_deltacat`.
    """

    def _get_kwargs(
        self,
        datasource_type: str,
        kwargs: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Return a new empty dictionary, ignoring both arguments.

        Args:
            datasource_type: Datasource type key. Unused.
            kwargs: Incoming keyword arguments. Unused and left unmodified.

        Returns:
            A fresh, empty dictionary.
        """
        return dict()
20
+
21
+
22
def read_deltacat(
    urls: Union[DeltaCatUrl, List[DeltaCatUrl]],
    *,
    deltacat_read_type: DeltacatReadType = DeltacatReadType.DATA,
    timestamp_as_of: Optional[int] = None,
    merge_on_read: Optional[bool] = False,
    read_kwargs_provider: Optional[ReadKwargsProvider] = EmptyReadKwargsProvider(),
) -> DeltaCatDataset:
    """Reads the given DeltaCAT URLs into a Ray Dataset. DeltaCAT URLs can
    either reference objects registered in a DeltaCAT catalog, or unregistered
    external objects that are readable into a Ray Dataset.

    Unless `deltacat_read_type` is a metadata-only read type, all reads of
    registered DeltaCAT catalog object data must resolve to a single table
    version.

    When reading unregistered external objects, the keyword arguments resolved
    by `read_kwargs_provider` are passed into the reader resolved for the
    given DeltaCAT URL.

    Examples:
        NOTE(review): URL strings are shown bare for brevity. This function
        calls `DeltaCatUrl` methods on every entry of `urls`, so each string
        presumably needs to be wrapped in a `DeltaCatUrl` first (constructor
        usage is not visible from here; confirm against `deltacat.utils.url`).

        >>> # Read the latest active DeltaCAT table version:
        >>> import deltacat as dc
        >>> from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table")
        >>> # If `my_catalog` is the default catalog, this is equivalent to:
        >>> dc.io.read_deltacat("namespace://my_namespace/my_table")
        >>> # If `my_namespace` is the default namespace, this is equivalent to:
        >>> dc.io.read_deltacat("table://my_table")

        >>> # Read metadata from all partitions and deltas of the latest active
        >>> # DeltaCAT table version:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )
        >>> # Since "default" always resolves to the latest active table version.
        >>> # This is equivalent to:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/default",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read only the latest active table version's top-level metadata:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/default",
        ...     deltacat_read_type=DeltacatReadType.METADATA,
        ... )

        >>> # Read only top-level metadata from a DeltaCAT table:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table",
        ...     deltacat_read_type=DeltacatReadType.METADATA,
        ... )

        >>> # Read top-level table metadata from all table versions:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA,
        ... )

        >>> # Read metadata from all partitions and deltas of all table versions:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from all tables and table versions of the namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from the latest active table version for each
        >>> # table in the namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from the latest active table version for each
        >>> # table in the catalog's default namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from all table versions for each table in each
        >>> # catalog namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read the Iceberg stream of the latest active DeltaCAT table version:
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/default/iceberg")
        >>> # Or, if `my_catalog` is the default catalog, this is equivalent to:
        >>> dc.io.read_deltacat("namespace://my_namespace/my_table/default/iceberg")
        >>> # Or, if `my_namespace` is the default namespace, this is equivalent to:
        >>> dc.io.read_deltacat("table://my_table/default/iceberg")

        >>> # Read an external unregistered Iceberg table `my_db.my_table`:
        >>> dc.io.read_deltacat("iceberg://my_db.my_table")

        >>> # Read an external unregistered audio file from /my/audio.mp4:
        >>> dc.io.read_deltacat("audio+file:///my/audio.mp4")

        >>> # Read an external unregistered audio file from s3://my/audio.mp4:
        >>> dc.io.read_deltacat("audio+s3://my/audio.mp4")

    Args:
        urls: The DeltaCAT URL, or list of DeltaCAT URLs, to read.
        deltacat_read_type: If METADATA, reads only DeltaCAT metadata for the
            given URL and skips both recursive metadata expansion and reads
            of the underlying data files. If METADATA_RECURSIVE then recursively
            expands child metadata but does not read underlying data files. If
            DATA then recursively expands child metadata to discover and read
            all underlying data files.
        timestamp_as_of: Reads a historic snapshot of the given paths as-of the
            given millisecond-precision epoch timestamp (only used when reading
            registered DeltaCAT catalog objects).
        merge_on_read: If True, merges all unmaterialized inserts, updates,
            and deletes in the registered DeltaCAT table version being read.
            Only applicable if `deltacat_read_type` is DATA.
        read_kwargs_provider: Resolves
            :class:`~deltacat.types.media.DatasourceType` string keys to
            kwarg dictionaries to pass to the resolved
            :class:`~ray.data.Datasource` implementation for each distinct
            DeltaCAT URL type. If None, external (unregistered) reads receive
            an empty kwargs dict; registered reads forward None unchanged to
            `DeltaCatDatasource`.

    Returns:
        DeltaCatDataset wrapping the union of the datasets read from every
        given URL.
    """
    # TODO(pdames): The below implementation serializes reads of each URL and
    # then unions their respective datasets together. While this was an easy
    # starting point to implement, a more efficient implementation should push
    # all URLs down into `DeltacatDatasource` to parallelize all reads
    # (i.e., by returning the `ReadTask` for all datasources in
    # `get_read_tasks()` and estimating the corresponding memory size across
    # all datasources in `estimate_inmemory_data_size()`.

    # The signature accepts a single URL; normalize it to a list so the loop
    # below never tries to iterate the URL object itself.
    if isinstance(urls, DeltaCatUrl):
        urls = [urls]

    # NOTE(review): an empty `urls` list leaves `dataset` as None, which is
    # then handed to `DeltaCatDataset.from_dataset` exactly as before.
    dataset: Optional[RayDataset] = None
    for url in urls:
        if not url.is_deltacat_catalog_url():
            # this URL points to an external unregistered Ray Datasource
            # TODO(pdames): Honor metadata only reads of external datasources
            # by registering only file paths & metadata in delta manifests.
            reader = DeltaCatUrlReader(url)
            # The provider is typed Optional, so guard against None instead
            # of failing with "'NoneType' object is not callable".
            if read_kwargs_provider is not None:
                external_kwargs = read_kwargs_provider(url.datastore_type, {})
            else:
                external_kwargs = {}
            next_ds = reader.read(external_kwargs)
        else:
            # this URL points to a registered DeltaCAT object
            next_ds = read_datasource(
                DeltaCatDatasource(
                    url=url,
                    deltacat_read_type=deltacat_read_type,
                    timestamp_as_of=timestamp_as_of,
                    merge_on_read=merge_on_read,
                    read_kwargs_provider=read_kwargs_provider,
                )
            )
        # Union the last dataset read into the result set. Compare against
        # None explicitly rather than relying on the truthiness of a Ray
        # Dataset, and keep the dataset returned by `union()`: it does not
        # modify the receiver, so discarding it drops every URL after the
        # first.
        if dataset is None:
            dataset = next_ds
        else:
            dataset = dataset.union(next_ds)
    return DeltaCatDataset.from_dataset(dataset)
@@ -20,6 +20,9 @@ from deltacat.storage.model.metafile import (
20
20
  from deltacat.storage.model.transaction import (
21
21
  TransactionOperation,
22
22
  Transaction,
23
+ read_transaction,
24
+ transactions,
25
+ transaction,
23
26
  )
24
27
  from deltacat.storage.model.namespace import (
25
28
  Namespace,
@@ -31,6 +34,7 @@ from deltacat.storage.model.partition import (
31
34
  PartitionLocator,
32
35
  PartitionLocatorAlias,
33
36
  PartitionKey,
37
+ PartitionKeyList,
34
38
  PartitionScheme,
35
39
  PartitionSchemeList,
36
40
  PartitionValues,
@@ -43,6 +47,9 @@ from deltacat.storage.model.schema import (
43
47
  NestedFieldName,
44
48
  Schema,
45
49
  SchemaList,
50
+ SchemaUpdate,
51
+ SchemaUpdateOperation,
52
+ SchemaUpdateOperations,
46
53
  )
47
54
  from deltacat.storage.model.stream import (
48
55
  Stream,
@@ -75,9 +82,11 @@ from deltacat.storage.model.transform import (
75
82
  MonthTransform,
76
83
  YearTransform,
77
84
  TruncateTransform,
85
+ TruncateStrategy,
78
86
  )
79
87
  from deltacat.storage.model.types import (
80
88
  CommitState,
89
+ Dataset,
81
90
  DeltaType,
82
91
  DistributedDataset,
83
92
  LifecycleState,
@@ -87,11 +96,12 @@ from deltacat.storage.model.types import (
87
96
  SchemaConsistencyType,
88
97
  StreamFormat,
89
98
  SortOrder,
90
- TransactionType,
91
99
  TransactionOperationType,
100
+ TransactionStatus,
92
101
  )
93
102
  from deltacat.storage.model.sort_key import (
94
103
  SortKey,
104
+ SortKeyList,
95
105
  SortScheme,
96
106
  SortSchemeList,
97
107
  )
@@ -102,6 +112,7 @@ __all__ = [
102
112
  "BucketTransform",
103
113
  "BucketTransformParameters",
104
114
  "CommitState",
115
+ "Dataset",
105
116
  "DayTransform",
106
117
  "Delta",
107
118
  "DeltaLocator",
@@ -136,6 +147,7 @@ __all__ = [
136
147
  "NullOrder",
137
148
  "Partition",
138
149
  "PartitionKey",
150
+ "PartitionKeyList",
139
151
  "PartitionLocator",
140
152
  "PartitionLocatorAlias",
141
153
  "PartitionScheme",
@@ -143,8 +155,12 @@ __all__ = [
143
155
  "PartitionValues",
144
156
  "Schema",
145
157
  "SchemaList",
158
+ "SchemaUpdate",
159
+ "SchemaUpdateOperation",
160
+ "SchemaUpdateOperations",
146
161
  "SchemaConsistencyType",
147
162
  "SortKey",
163
+ "SortKeyList",
148
164
  "SortOrder",
149
165
  "SortScheme",
150
166
  "SortSchemeList",
@@ -161,13 +177,17 @@ __all__ = [
161
177
  "Transaction",
162
178
  "TransactionOperation",
163
179
  "TransactionOperationType",
164
- "TransactionType",
180
+ "TransactionStatus",
165
181
  "Transform",
166
182
  "TransformName",
167
183
  "TransformParameters",
168
184
  "TruncateTransform",
169
185
  "TruncateTransformParameters",
186
+ "TruncateStrategy",
170
187
  "UnknownTransform",
171
188
  "VoidTransform",
172
189
  "YearTransform",
190
+ "read_transaction",
191
+ "transactions",
192
+ "transaction",
173
193
  ]
@@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
2
2
 
3
3
  from deltacat.storage import (
4
4
  EntryParams,
5
+ EntryType,
5
6
  Delta,
6
7
  DeltaLocator,
7
8
  DeltaProperties,
@@ -30,11 +31,12 @@ from deltacat.storage import (
30
31
  TableVersionProperties,
31
32
  )
32
33
  from deltacat.storage.model.manifest import Manifest
34
+ from deltacat.storage.model.partition import UNKNOWN_PARTITION_ID
33
35
  from deltacat.types.media import (
34
36
  ContentType,
35
37
  DistributedDatasetType,
36
38
  StorageType,
37
- TableType,
39
+ DatasetType,
38
40
  )
39
41
  from deltacat.utils.common import ReadKwargsProvider
40
42
 
@@ -205,7 +207,7 @@ def get_latest_delta(
205
207
 
206
208
  def download_delta(
207
209
  delta_like: Union[Delta, DeltaLocator],
208
- table_type: TableType = TableType.PYARROW,
210
+ table_type: DatasetType = DatasetType.PYARROW,
209
211
  storage_type: StorageType = StorageType.DISTRIBUTED,
210
212
  max_parallelism: Optional[int] = None,
211
213
  columns: Optional[List[str]] = None,
@@ -216,7 +218,7 @@ def download_delta(
216
218
  **kwargs,
217
219
  ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
218
220
  """
219
- Download the given delta or delta locator into either a list of
221
+ Reads the given delta or delta locator into either a list of
220
222
  tables resident in the local node's memory, or into a dataset distributed
221
223
  across this Ray cluster's object store memory. Ordered table N of a local
222
224
  table list, or ordered block N of a distributed dataset, always contain
@@ -228,19 +230,19 @@ def download_delta(
228
230
  def download_delta_manifest_entry(
229
231
  delta_like: Union[Delta, DeltaLocator],
230
232
  entry_index: int,
231
- table_type: TableType = TableType.PYARROW,
233
+ table_type: DatasetType = DatasetType.PYARROW,
232
234
  columns: Optional[List[str]] = None,
233
235
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
234
236
  *args,
235
237
  **kwargs,
236
238
  ) -> LocalTable:
237
239
  """
238
- Downloads a single manifest entry into the specified table type for the
240
+ Reads a single manifest entry into the specified table type for the
239
241
  given delta or delta locator. If a delta is provided with a non-empty
240
- manifest, then the entry is downloaded from this manifest. Otherwise, the
241
- manifest is first retrieved then the given entry index downloaded.
242
+ manifest, then the entry is read from this manifest. Otherwise, the
243
+ manifest is first retrieved then the given entry index read.
242
244
 
243
- NOTE: The entry will be downloaded in the current node's memory.
245
+ NOTE: The entry will be read in the current node's memory.
244
246
  """
245
247
  raise NotImplementedError("download_delta_manifest_entry not implemented")
246
248
 
@@ -288,9 +290,9 @@ def create_table_version(
288
290
  namespace: str,
289
291
  table_name: str,
290
292
  table_version: Optional[str] = None,
293
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
291
294
  schema: Optional[Schema] = None,
292
295
  partition_scheme: Optional[PartitionScheme] = None,
293
- # TODO(pdames): rename to `sort_scheme`
294
296
  sort_keys: Optional[SortScheme] = None,
295
297
  table_version_description: Optional[str] = None,
296
298
  table_version_properties: Optional[TableVersionProperties] = None,
@@ -299,9 +301,9 @@ def create_table_version(
299
301
  supported_content_types: Optional[List[ContentType]] = None,
300
302
  *args,
301
303
  **kwargs,
302
- ) -> Tuple[Optional[Table], TableVersion, Stream]:
304
+ ) -> Tuple[Table, TableVersion, Stream]:
303
305
  """
304
- Create a table version with an unreleased lifecycle state and an empty delta
306
+ Create a table version with the given or CREATED lifecycle state and an empty delta
305
307
  stream. Table versions may be schemaless and unpartitioned to improve write
306
308
  performance, or have their writes governed by a schema and partition scheme
307
309
  to improve data consistency and read performance.
@@ -314,6 +316,20 @@ def create_table_version(
314
316
  raise NotImplementedError("create_table_version not implemented")
315
317
 
316
318
 
319
+ def create_table(
320
+ namespace: str,
321
+ table_name: str,
322
+ description: Optional[str] = None,
323
+ properties: Optional[TableProperties] = None,
324
+ *args,
325
+ **kwargs,
326
+ ) -> Table:
327
+ """
328
+ Create a new table. Raises an error if the given table already exists.
329
+ """
330
+ raise NotImplementedError("create_table not implemented")
331
+
332
+
317
333
  def update_table(
318
334
  namespace: str,
319
335
  table_name: str,
@@ -322,7 +338,7 @@ def update_table(
322
338
  new_table_name: Optional[str] = None,
323
339
  *args,
324
340
  **kwargs,
325
- ) -> None:
341
+ ) -> Table:
326
342
  """
327
343
  Update table metadata describing the table versions it contains. By default,
328
344
  a table's properties are empty, and its description is equal to that given
@@ -345,7 +361,7 @@ def update_table_version(
345
361
  sort_keys: Optional[SortScheme] = None,
346
362
  *args,
347
363
  **kwargs,
348
- ) -> None:
364
+ ) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
349
365
  """
350
366
  Update a table version. Notably, updating an unreleased table version's
351
367
  lifecycle state to 'active' telegraphs that it is ready for external
@@ -410,15 +426,15 @@ def delete_stream(
410
426
 
411
427
  def delete_table(
412
428
  namespace: str,
413
- name: str,
429
+ table_name: str,
414
430
  purge: bool = False,
415
431
  *args,
416
432
  **kwargs,
417
433
  ) -> None:
418
434
  """
419
- Drops the given table and all its contents (table versions, streams, partitions,
420
- and deltas). If purge is True, also removes all data files associated with the table.
421
- Raises an error if the given table does not exist.
435
+ Drops the given table from the catalog. If purge is True, also removes
436
+ all data files associated with the table. Raises an error if the given table
437
+ does not exist.
422
438
  """
423
439
  raise NotImplementedError("delete_table not implemented")
424
440
 
@@ -430,10 +446,9 @@ def delete_namespace(
430
446
  **kwargs,
431
447
  ) -> None:
432
448
  """
433
- Drops a table namespace and all its contents. If purge is True, then all
434
- tables, table versions, and deltas will be deleted. Otherwise, the namespace
435
- will be dropped only if it is empty. Raises an error if the given namespace
436
- does not exist.
449
+ Drops the given namespace from the catalog. If purge is True, also removes
450
+ all data files associated with the namespace. Raises an error if the given
451
+ namespace does not exist.
437
452
  """
438
453
  raise NotImplementedError("drop_namespace not implemented")
439
454
 
@@ -509,6 +524,7 @@ def stage_partition(
509
524
  def commit_partition(
510
525
  partition: Partition,
511
526
  previous_partition: Optional[Partition] = None,
527
+ expected_previous_partition_id: Optional[str] = UNKNOWN_PARTITION_ID,
512
528
  *args,
513
529
  **kwargs,
514
530
  ) -> Partition:
@@ -586,23 +602,19 @@ def stage_delta(
586
602
  max_records_per_entry: Optional[int] = None,
587
603
  author: Optional[ManifestAuthor] = None,
588
604
  properties: Optional[DeltaProperties] = None,
589
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
605
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
590
606
  content_type: ContentType = ContentType.PARQUET,
591
607
  entry_params: Optional[EntryParams] = None,
608
+ entry_type: Optional[EntryType] = EntryType.DATA,
609
+ schema: Optional[Schema] = None,
610
+ sort_scheme_id: Optional[str] = None,
592
611
  *args,
593
612
  **kwargs,
594
613
  ) -> Delta:
595
614
  """
596
- Writes the given table to 1 or more S3 files. Returns an unregistered
615
+ Writes the given dataset to 1 or more files. Returns an unregistered
597
616
  delta whose manifest entries point to the uploaded files. Applies any
598
617
  schema consistency policies configured for the parent table version.
599
-
600
- The partition spec will be used to split the input table into
601
- multiple files. Optionally, partition_values can be provided to avoid
602
- this method to recompute partition_values from the provided data.
603
-
604
- Raises an error if the provided data does not conform to a unique ordered
605
- list of partition_values
606
618
  """
607
619
  raise NotImplementedError("stage_delta not implemented")
608
620
 
@@ -723,13 +735,23 @@ def table_version_exists(
723
735
 
724
736
  def can_categorize(e: BaseException, *args, **kwargs) -> bool:
725
737
  """
726
- Return whether input error is from storage implementation layer.
738
+ True if the input error originated from the storage
739
+ implementation layer and can be categorized under an
740
+ existing DeltaCatError. The "categorize_errors" decorator
741
+ uses this to determine if an unknown error from the storage
742
+ implementation can be categorized prior to casting it to
743
+ the equivalent DeltaCatError via `raise_categorized_error`
727
744
  """
728
745
  raise NotImplementedError
729
746
 
730
747
 
731
748
  def raise_categorized_error(e: BaseException, *args, **kwargs):
732
749
  """
733
- Raise and handle storage implementation layer specific errors.
750
+ Casts a categorizable error that originated from the storage
751
+ implementation layer to its equivalent DeltaCatError
752
+ for uniform handling (e.g., determining whether an error
753
+ is retryable or not) via the "categorize_errors" decorator.
754
+ Raises an UnclassifiedDeltaCatError from the input exception
755
+ if the error cannot be categorized.
734
756
  """
735
757
  raise NotImplementedError