deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
2
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
3
- from deltacat.storage.rivulet.reader.data_reader import FileReader
1
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
2
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
3
+ from deltacat.experimental.storage.rivulet.reader.data_reader import FileReader
4
4
  from typing import Type, Dict
5
5
 
6
- from deltacat.storage.rivulet.schema.schema import Schema
6
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
7
7
 
8
8
 
9
9
  class FileReaderRegistrar:
@@ -5,7 +5,7 @@ from typing import MutableMapping, Dict, Iterable, Tuple, Optional
5
5
 
6
6
  import pyarrow as pa
7
7
 
8
- from deltacat.storage.rivulet.schema.datatype import Datatype
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
9
 
10
10
 
11
11
  @dataclass(frozen=True)
@@ -1,6 +1,6 @@
1
1
  from typing import Protocol, Iterable, List, Union, Any, Dict
2
2
 
3
- from deltacat.storage.rivulet.metastore.sst import SSTableRow
3
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
4
4
  import pyarrow as pa
5
5
 
6
6
  MEMTABLE_DATA = Union[Iterable[Dict[str, Any]], pa.Table]
@@ -1,11 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from deltacat.storage.rivulet.parquet.serializer import ParquetDataSerializer
4
- from deltacat.storage.rivulet import Schema
5
- from deltacat.storage.rivulet.serializer import DataSerializer
6
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
3
+ from deltacat.experimental.storage.rivulet.parquet.serializer import (
4
+ ParquetDataSerializer,
5
+ )
6
+ from deltacat.experimental.storage.rivulet import Schema
7
+ from deltacat.experimental.storage.rivulet.serializer import DataSerializer
8
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
7
9
 
8
- from deltacat.storage.rivulet.feather.serializer import FeatherDataSerializer
10
+ from deltacat.experimental.storage.rivulet.feather.serializer import (
11
+ FeatherDataSerializer,
12
+ )
9
13
 
10
14
 
11
15
  class DataSerializerFactory:
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+ from typing import Generic, List, Union, Iterable
3
+ from deltacat.storage.model.shard import T, Shard, ShardingStrategy
4
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
5
+ DatasetMetastore,
6
+ )
7
+
8
+
9
class RangeShard(Shard, Generic[T]):
    """
    Represents a range-based shard with minimum and maximum keys.

    param: min_key: The minimum key for the shard.
    param: max_key: The maximum key for the shard.
    """

    def __init__(self, min_key: T, max_key: T):
        self.min_key = min_key
        self.max_key = max_key

    def __repr__(self) -> str:
        return f"Shard(type=range, min_key={self.min_key}, max_key={self.max_key})"

    @staticmethod
    def split(
        global_min: Union[int, str], global_max: Union[int, str], num_shards: int
    ) -> List[RangeShard]:
        """
        Splits a range into `num_shards` shards.
        Currently supports splitting ranges of integers and strings.

        Note: If global_min == global_max or num_shards <= 1, a single shard is returned,
        num_shards is ignored.

        :param global_min: The minimum key for the entire range (int or str).
        :param global_max: The maximum key for the entire range (int or str).
        :param num_shards: The number of shards to create.
        :return: A list of RangeShard objects.
        :raises ValueError: If global_min and global_max are not both ints or both strs.
        """
        if global_min == global_max or num_shards <= 1:
            return [RangeShard(global_min, global_max)]

        # Determine which interpolation function to use based on the type of min/max
        if isinstance(global_min, int) and isinstance(global_max, int):
            interpolate = RangeShard._interpolate_numeric
        elif isinstance(global_min, str) and isinstance(global_max, str):
            interpolate = RangeShard._interpolate_str
        else:
            raise ValueError(
                "Unsupported combination of types for global_min and global_max."
            )

        shards: List[RangeShard] = []
        for i in range(num_shards):
            start = interpolate(global_min, global_max, i, num_shards)
            end = interpolate(global_min, global_max, i + 1, num_shards)

            if i > 0:
                # The interpolated start of shard i is the same value as the
                # end of shard i - 1, so advance it by one position to keep
                # consecutive shards from sharing a boundary key.
                if isinstance(start, int):
                    start = shards[-1].max_key + 1
                elif isinstance(start, str) and start:
                    # Bump the last character to the next code point. An empty
                    # start (interpolation stripped to "") has no character to
                    # bump and is left unchanged.
                    char_list = list(start)
                    char_list[-1] = chr(ord(char_list[-1]) + 1)
                    start = "".join(char_list)

            shards.append(RangeShard(start, end))

        return shards

    @staticmethod
    def _interpolate_numeric(start: int, end: int, step: int, total_steps: int) -> int:
        """
        Integer interpolation using integer (floor) division.

        param: start (int): The starting number.
        param: end (int): The ending number.
        param: step (int): The current step in the interpolation (0-based).
        param: total_steps (int): The total number of interpolation steps.

        returns: int: The interpolated integer.
        """
        return start + (end - start) * step // total_steps

    @staticmethod
    def _interpolate_str(start: str, end: str, step: int, total_steps: int) -> str:
        """
        Interpolates between two strings lexicographically.

        param: start (str): The starting string.
        param: end (str): The ending string.
        param: step (int): The current step in the interpolation (0-based).
        param: total_steps (int): The total number of interpolation steps.

        returns: str: The interpolated string, with trailing whitespace stripped.
        """
        max_len = max(len(start), len(end))

        # Pad the shorter string with spaces so both have the same length and
        # can be interpolated position by position.
        start = start.ljust(max_len, " ")
        end = end.ljust(max_len, " ")

        # Interpolate character by character based on ordinal values.
        interpolated_chars = [
            chr(round(ord(s) + (ord(e) - ord(s)) * step / total_steps))
            for s, e in zip(start, end)
        ]

        # rstrip() removes the padding again (along with any other trailing
        # whitespace the interpolation produced).
        return "".join(interpolated_chars).rstrip()
109
+
110
+
111
class RangeShardingStrategy(ShardingStrategy, Generic[T]):
    """
    Sharding strategy that divides a dataset's key range into range shards.

    method: shards: Builds RangeShard objects from the metastore's min/max keys.
    """

    def shards(
        self, num_shards: int, metastore: DatasetMetastore
    ) -> Iterable[RangeShard[T]]:
        """
        Splits the key range reported by the metastore into shards.

        param: num_shards: The number of shards to divide the range into.
        param: metastore: The dataset metastore queried for the min and max keys.
        returns: The RangeShard objects produced by RangeShard.split.
        """
        # get_min_max_keys() is unpacked into exactly two values: the lower
        # and upper bound handed to RangeShard.split.
        key_bounds = metastore.get_min_max_keys()
        lowest_key, highest_key = key_bounds
        return RangeShard.split(lowest_key, highest_key, num_shards)
@@ -6,15 +6,26 @@ from typing import Any, List, Set, Protocol, TypeVar, Dict, Iterable
6
6
 
7
7
  from pyarrow import RecordBatch, Table
8
8
  from deltacat.storage.model.partition import PartitionLocator
9
- from deltacat.storage.rivulet.metastore.delta import ManifestIO, DeltacatManifestIO
10
-
11
- from deltacat.storage.rivulet import Schema
12
- from deltacat.storage.rivulet.metastore.json_sst import JsonSstWriter
13
- from deltacat.storage.rivulet.serializer import MEMTABLE_DATA, DataSerializer
14
- from deltacat.storage.rivulet.serializer_factory import DataSerializerFactory
15
- from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter, DATA
16
- from deltacat.storage.rivulet.metastore.sst import SSTWriter
17
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
9
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
10
+ ManifestIO,
11
+ DeltacatManifestIO,
12
+ )
13
+
14
+ from deltacat.experimental.storage.rivulet import Schema
15
+ from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstWriter
16
+ from deltacat.experimental.storage.rivulet.serializer import (
17
+ MEMTABLE_DATA,
18
+ DataSerializer,
19
+ )
20
+ from deltacat.experimental.storage.rivulet.serializer_factory import (
21
+ DataSerializerFactory,
22
+ )
23
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import (
24
+ DatasetWriter,
25
+ DATA,
26
+ )
27
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTWriter
28
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
18
29
 
19
30
  INPUT_ROW = TypeVar("INPUT_ROW")
20
31
 
@@ -350,7 +350,6 @@ class DeltaCatDatasource(Datasource):
350
350
  ]
351
351
  elif self._deltacat_read_type == DeltacatReadType.METADATA_LIST:
352
352
  # do a shallow read of the top-level DeltaCAT metadata
353
- print(f"listers: {self._reader.listers}")
354
353
  listers = copy.deepcopy(self._reader.listers)
355
354
  listers = [listers[0]]
356
355
  read_tasks = self._list_all_metafiles_read_tasks(
@@ -97,7 +97,7 @@ def read_deltacat(
97
97
  >>> # Read the Iceberg stream of the latest active DeltaCAT table version,
98
98
  >>> import deltacat as dc
99
99
  >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/default/iceberg")
100
- >>> # Or, if `my_catalog is the default catalog, this is equivalent to:
100
+ >>> # Or, if `my_catalog` is the default catalog, this is equivalent to:
101
101
  >>> dc.io.read_deltacat("namespace://my_namespace/my_table/default/iceberg")
102
102
  >>> # Or, if `my_namespace` is the default namespace, this is equivalent to:
103
103
  >>> dc.io.read_deltacat("table://my_table/default/iceberg")
@@ -20,6 +20,9 @@ from deltacat.storage.model.metafile import (
20
20
  from deltacat.storage.model.transaction import (
21
21
  TransactionOperation,
22
22
  Transaction,
23
+ read_transaction,
24
+ transactions,
25
+ transaction,
23
26
  )
24
27
  from deltacat.storage.model.namespace import (
25
28
  Namespace,
@@ -31,6 +34,7 @@ from deltacat.storage.model.partition import (
31
34
  PartitionLocator,
32
35
  PartitionLocatorAlias,
33
36
  PartitionKey,
37
+ PartitionKeyList,
34
38
  PartitionScheme,
35
39
  PartitionSchemeList,
36
40
  PartitionValues,
@@ -43,6 +47,9 @@ from deltacat.storage.model.schema import (
43
47
  NestedFieldName,
44
48
  Schema,
45
49
  SchemaList,
50
+ SchemaUpdate,
51
+ SchemaUpdateOperation,
52
+ SchemaUpdateOperations,
46
53
  )
47
54
  from deltacat.storage.model.stream import (
48
55
  Stream,
@@ -75,6 +82,7 @@ from deltacat.storage.model.transform import (
75
82
  MonthTransform,
76
83
  YearTransform,
77
84
  TruncateTransform,
85
+ TruncateStrategy,
78
86
  )
79
87
  from deltacat.storage.model.types import (
80
88
  CommitState,
@@ -88,11 +96,12 @@ from deltacat.storage.model.types import (
88
96
  SchemaConsistencyType,
89
97
  StreamFormat,
90
98
  SortOrder,
91
- TransactionType,
92
99
  TransactionOperationType,
100
+ TransactionStatus,
93
101
  )
94
102
  from deltacat.storage.model.sort_key import (
95
103
  SortKey,
104
+ SortKeyList,
96
105
  SortScheme,
97
106
  SortSchemeList,
98
107
  )
@@ -138,6 +147,7 @@ __all__ = [
138
147
  "NullOrder",
139
148
  "Partition",
140
149
  "PartitionKey",
150
+ "PartitionKeyList",
141
151
  "PartitionLocator",
142
152
  "PartitionLocatorAlias",
143
153
  "PartitionScheme",
@@ -145,8 +155,12 @@ __all__ = [
145
155
  "PartitionValues",
146
156
  "Schema",
147
157
  "SchemaList",
158
+ "SchemaUpdate",
159
+ "SchemaUpdateOperation",
160
+ "SchemaUpdateOperations",
148
161
  "SchemaConsistencyType",
149
162
  "SortKey",
163
+ "SortKeyList",
150
164
  "SortOrder",
151
165
  "SortScheme",
152
166
  "SortSchemeList",
@@ -163,13 +177,17 @@ __all__ = [
163
177
  "Transaction",
164
178
  "TransactionOperation",
165
179
  "TransactionOperationType",
166
- "TransactionType",
180
+ "TransactionStatus",
167
181
  "Transform",
168
182
  "TransformName",
169
183
  "TransformParameters",
170
184
  "TruncateTransform",
171
185
  "TruncateTransformParameters",
186
+ "TruncateStrategy",
172
187
  "UnknownTransform",
173
188
  "VoidTransform",
174
189
  "YearTransform",
190
+ "read_transaction",
191
+ "transactions",
192
+ "transaction",
175
193
  ]
@@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
2
2
 
3
3
  from deltacat.storage import (
4
4
  EntryParams,
5
+ EntryType,
5
6
  Delta,
6
7
  DeltaLocator,
7
8
  DeltaProperties,
@@ -30,11 +31,12 @@ from deltacat.storage import (
30
31
  TableVersionProperties,
31
32
  )
32
33
  from deltacat.storage.model.manifest import Manifest
34
+ from deltacat.storage.model.partition import UNKNOWN_PARTITION_ID
33
35
  from deltacat.types.media import (
34
36
  ContentType,
35
37
  DistributedDatasetType,
36
38
  StorageType,
37
- TableType,
39
+ DatasetType,
38
40
  )
39
41
  from deltacat.utils.common import ReadKwargsProvider
40
42
 
@@ -205,7 +207,7 @@ def get_latest_delta(
205
207
 
206
208
  def download_delta(
207
209
  delta_like: Union[Delta, DeltaLocator],
208
- table_type: TableType = TableType.PYARROW,
210
+ table_type: DatasetType = DatasetType.PYARROW,
209
211
  storage_type: StorageType = StorageType.DISTRIBUTED,
210
212
  max_parallelism: Optional[int] = None,
211
213
  columns: Optional[List[str]] = None,
@@ -216,7 +218,7 @@ def download_delta(
216
218
  **kwargs,
217
219
  ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
218
220
  """
219
- Download the given delta or delta locator into either a list of
221
+ Reads the given delta or delta locator into either a list of
220
222
  tables resident in the local node's memory, or into a dataset distributed
221
223
  across this Ray cluster's object store memory. Ordered table N of a local
222
224
  table list, or ordered block N of a distributed dataset, always contain
@@ -228,19 +230,19 @@ def download_delta(
228
230
  def download_delta_manifest_entry(
229
231
  delta_like: Union[Delta, DeltaLocator],
230
232
  entry_index: int,
231
- table_type: TableType = TableType.PYARROW,
233
+ table_type: DatasetType = DatasetType.PYARROW,
232
234
  columns: Optional[List[str]] = None,
233
235
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
234
236
  *args,
235
237
  **kwargs,
236
238
  ) -> LocalTable:
237
239
  """
238
- Downloads a single manifest entry into the specified table type for the
240
+ Reads a single manifest entry into the specified table type for the
239
241
  given delta or delta locator. If a delta is provided with a non-empty
240
- manifest, then the entry is downloaded from this manifest. Otherwise, the
241
- manifest is first retrieved then the given entry index downloaded.
242
+ manifest, then the entry is read from this manifest. Otherwise, the
243
+ manifest is first retrieved then the given entry index read.
242
244
 
243
- NOTE: The entry will be downloaded in the current node's memory.
245
+ NOTE: The entry will be read in the current node's memory.
244
246
  """
245
247
  raise NotImplementedError("download_delta_manifest_entry not implemented")
246
248
 
@@ -288,9 +290,9 @@ def create_table_version(
288
290
  namespace: str,
289
291
  table_name: str,
290
292
  table_version: Optional[str] = None,
293
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
291
294
  schema: Optional[Schema] = None,
292
295
  partition_scheme: Optional[PartitionScheme] = None,
293
- # TODO(pdames): rename to `sort_scheme`
294
296
  sort_keys: Optional[SortScheme] = None,
295
297
  table_version_description: Optional[str] = None,
296
298
  table_version_properties: Optional[TableVersionProperties] = None,
@@ -299,9 +301,9 @@ def create_table_version(
299
301
  supported_content_types: Optional[List[ContentType]] = None,
300
302
  *args,
301
303
  **kwargs,
302
- ) -> Tuple[Optional[Table], TableVersion, Stream]:
304
+ ) -> Tuple[Table, TableVersion, Stream]:
303
305
  """
304
- Create a table version with an unreleased lifecycle state and an empty delta
306
+ Create a table version with the given or CREATED lifecycle state and an empty delta
305
307
  stream. Table versions may be schemaless and unpartitioned to improve write
306
308
  performance, or have their writes governed by a schema and partition scheme
307
309
  to improve data consistency and read performance.
@@ -314,6 +316,20 @@ def create_table_version(
314
316
  raise NotImplementedError("create_table_version not implemented")
315
317
 
316
318
 
319
+ def create_table(
320
+ namespace: str,
321
+ table_name: str,
322
+ description: Optional[str] = None,
323
+ properties: Optional[TableProperties] = None,
324
+ *args,
325
+ **kwargs,
326
+ ) -> Table:
327
+ """
328
+ Create a new table. Raises an error if the given table already exists.
329
+ """
330
+ raise NotImplementedError("create_table not implemented")
331
+
332
+
317
333
  def update_table(
318
334
  namespace: str,
319
335
  table_name: str,
@@ -322,7 +338,7 @@ def update_table(
322
338
  new_table_name: Optional[str] = None,
323
339
  *args,
324
340
  **kwargs,
325
- ) -> None:
341
+ ) -> Table:
326
342
  """
327
343
  Update table metadata describing the table versions it contains. By default,
328
344
  a table's properties are empty, and its description is equal to that given
@@ -345,7 +361,7 @@ def update_table_version(
345
361
  sort_keys: Optional[SortScheme] = None,
346
362
  *args,
347
363
  **kwargs,
348
- ) -> None:
364
+ ) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
349
365
  """
350
366
  Update a table version. Notably, updating an unreleased table version's
351
367
  lifecycle state to 'active' telegraphs that it is ready for external
@@ -410,15 +426,15 @@ def delete_stream(
410
426
 
411
427
  def delete_table(
412
428
  namespace: str,
413
- name: str,
429
+ table_name: str,
414
430
  purge: bool = False,
415
431
  *args,
416
432
  **kwargs,
417
433
  ) -> None:
418
434
  """
419
- Drops the given table and all its contents (table versions, streams, partitions,
420
- and deltas). If purge is True, also removes all data files associated with the table.
421
- Raises an error if the given table does not exist.
435
+ Drops the given table from the catalog. If purge is True, also removes
436
+ all data files associated with the table. Raises an error if the given table
437
+ does not exist.
422
438
  """
423
439
  raise NotImplementedError("delete_table not implemented")
424
440
 
@@ -430,10 +446,9 @@ def delete_namespace(
430
446
  **kwargs,
431
447
  ) -> None:
432
448
  """
433
- Drops a table namespace and all its contents. If purge is True, then all
434
- tables, table versions, and deltas will be deleted. Otherwise, the namespace
435
- will be dropped only if it is empty. Raises an error if the given namespace
436
- does not exist.
449
+ Drops the given namespace from the catalog. If purge is True, also removes
450
+ all data files associated with the namespace. Raises an error if the given
451
+ namespace does not exist.
437
452
  """
438
453
  raise NotImplementedError("drop_namespace not implemented")
439
454
 
@@ -509,6 +524,7 @@ def stage_partition(
509
524
  def commit_partition(
510
525
  partition: Partition,
511
526
  previous_partition: Optional[Partition] = None,
527
+ expected_previous_partition_id: Optional[str] = UNKNOWN_PARTITION_ID,
512
528
  *args,
513
529
  **kwargs,
514
530
  ) -> Partition:
@@ -586,23 +602,19 @@ def stage_delta(
586
602
  max_records_per_entry: Optional[int] = None,
587
603
  author: Optional[ManifestAuthor] = None,
588
604
  properties: Optional[DeltaProperties] = None,
589
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
605
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
590
606
  content_type: ContentType = ContentType.PARQUET,
591
607
  entry_params: Optional[EntryParams] = None,
608
+ entry_type: Optional[EntryType] = EntryType.DATA,
609
+ schema: Optional[Schema] = None,
610
+ sort_scheme_id: Optional[str] = None,
592
611
  *args,
593
612
  **kwargs,
594
613
  ) -> Delta:
595
614
  """
596
- Writes the given table to 1 or more S3 files. Returns an unregistered
615
+ Writes the given dataset to 1 or more files. Returns an unregistered
597
616
  delta whose manifest entries point to the uploaded files. Applies any
598
617
  schema consistency policies configured for the parent table version.
599
-
600
- The partition spec will be used to split the input table into
601
- multiple files. Optionally, partition_values can be provided to avoid
602
- this method to recompute partition_values from the provided data.
603
-
604
- Raises an error if the provided data does not conform to a unique ordered
605
- list of partition_values
606
618
  """
607
619
  raise NotImplementedError("stage_delta not implemented")
608
620
 
@@ -723,13 +735,23 @@ def table_version_exists(
723
735
 
724
736
  def can_categorize(e: BaseException, *args, **kwargs) -> bool:
725
737
  """
726
- Return whether input error is from storage implementation layer.
738
+ True if the input error originated from the storage
739
+ implementation layer and can be categorized under an
740
+ existing DeltaCatError. The "categorize_errors" decorator
741
+ uses this to determine if an unknown error from the storage
742
+ implementation can be categorized prior to casting it to
743
+ the equivalent DeltaCatError via `raise_categorized_error`.
727
744
  """
728
745
  raise NotImplementedError
729
746
 
730
747
 
731
748
  def raise_categorized_error(e: BaseException, *args, **kwargs):
732
749
  """
733
- Raise and handle storage implementation layer specific errors.
750
+ Casts a categorizable error that originated from the storage
751
+ implementation layer to its equivalent DeltaCatError
752
+ for uniform handling (e.g., determining whether an error
753
+ is retryable or not) via the "categorize_errors" decorator.
754
+ Raises an UnclassifiedDeltaCatError from the input exception
755
+ if the error cannot be categorized.
734
756
  """
735
757
  raise NotImplementedError