deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
2
2
 
3
3
  from deltacat.storage import (
4
4
  EntryParams,
5
+ EntryType,
5
6
  Delta,
6
7
  DeltaLocator,
7
8
  DeltaProperties,
@@ -30,11 +31,12 @@ from deltacat.storage import (
30
31
  TableVersionProperties,
31
32
  )
32
33
  from deltacat.storage.model.manifest import Manifest
34
+ from deltacat.storage.model.partition import UNKNOWN_PARTITION_ID
33
35
  from deltacat.types.media import (
34
36
  ContentType,
35
37
  DistributedDatasetType,
36
38
  StorageType,
37
- TableType,
39
+ DatasetType,
38
40
  )
39
41
  from deltacat.utils.common import ReadKwargsProvider
40
42
 
@@ -205,7 +207,7 @@ def get_latest_delta(
205
207
 
206
208
  def download_delta(
207
209
  delta_like: Union[Delta, DeltaLocator],
208
- table_type: TableType = TableType.PYARROW,
210
+ table_type: DatasetType = DatasetType.PYARROW,
209
211
  storage_type: StorageType = StorageType.DISTRIBUTED,
210
212
  max_parallelism: Optional[int] = None,
211
213
  columns: Optional[List[str]] = None,
@@ -216,7 +218,7 @@ def download_delta(
216
218
  **kwargs,
217
219
  ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
218
220
  """
219
- Download the given delta or delta locator into either a list of
221
+ Reads the given delta or delta locator into either a list of
220
222
  tables resident in the local node's memory, or into a dataset distributed
221
223
  across this Ray cluster's object store memory. Ordered table N of a local
222
224
  table list, or ordered block N of a distributed dataset, always contain
@@ -228,19 +230,19 @@ def download_delta(
228
230
  def download_delta_manifest_entry(
229
231
  delta_like: Union[Delta, DeltaLocator],
230
232
  entry_index: int,
231
- table_type: TableType = TableType.PYARROW,
233
+ table_type: DatasetType = DatasetType.PYARROW,
232
234
  columns: Optional[List[str]] = None,
233
235
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
234
236
  *args,
235
237
  **kwargs,
236
238
  ) -> LocalTable:
237
239
  """
238
- Downloads a single manifest entry into the specified table type for the
240
+ Reads a single manifest entry into the specified table type for the
239
241
  given delta or delta locator. If a delta is provided with a non-empty
240
- manifest, then the entry is downloaded from this manifest. Otherwise, the
241
- manifest is first retrieved then the given entry index downloaded.
242
+ manifest, then the entry is read from this manifest. Otherwise, the
243
+ manifest is first retrieved then the given entry index read.
242
244
 
243
- NOTE: The entry will be downloaded in the current node's memory.
245
+ NOTE: The entry will be read in the current node's memory.
244
246
  """
245
247
  raise NotImplementedError("download_delta_manifest_entry not implemented")
246
248
 
@@ -288,9 +290,9 @@ def create_table_version(
288
290
  namespace: str,
289
291
  table_name: str,
290
292
  table_version: Optional[str] = None,
293
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
291
294
  schema: Optional[Schema] = None,
292
295
  partition_scheme: Optional[PartitionScheme] = None,
293
- # TODO(pdames): rename to `sort_scheme`
294
296
  sort_keys: Optional[SortScheme] = None,
295
297
  table_version_description: Optional[str] = None,
296
298
  table_version_properties: Optional[TableVersionProperties] = None,
@@ -299,9 +301,9 @@ def create_table_version(
299
301
  supported_content_types: Optional[List[ContentType]] = None,
300
302
  *args,
301
303
  **kwargs,
302
- ) -> Tuple[Optional[Table], TableVersion, Stream]:
304
+ ) -> Tuple[Table, TableVersion, Stream]:
303
305
  """
304
- Create a table version with an unreleased lifecycle state and an empty delta
306
+ Create a table version with the given or CREATED lifecycle state and an empty delta
305
307
  stream. Table versions may be schemaless and unpartitioned to improve write
306
308
  performance, or have their writes governed by a schema and partition scheme
307
309
  to improve data consistency and read performance.
@@ -314,6 +316,20 @@ def create_table_version(
314
316
  raise NotImplementedError("create_table_version not implemented")
315
317
 
316
318
 
319
+ def create_table(
320
+ namespace: str,
321
+ table_name: str,
322
+ description: Optional[str] = None,
323
+ properties: Optional[TableProperties] = None,
324
+ *args,
325
+ **kwargs,
326
+ ) -> Table:
327
+ """
328
+ Create a new table. Raises an error if the given table already exists.
329
+ """
330
+ raise NotImplementedError("create_table not implemented")
331
+
332
+
317
333
  def update_table(
318
334
  namespace: str,
319
335
  table_name: str,
@@ -322,7 +338,7 @@ def update_table(
322
338
  new_table_name: Optional[str] = None,
323
339
  *args,
324
340
  **kwargs,
325
- ) -> None:
341
+ ) -> Table:
326
342
  """
327
343
  Update table metadata describing the table versions it contains. By default,
328
344
  a table's properties are empty, and its description is equal to that given
@@ -345,7 +361,7 @@ def update_table_version(
345
361
  sort_keys: Optional[SortScheme] = None,
346
362
  *args,
347
363
  **kwargs,
348
- ) -> None:
364
+ ) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
349
365
  """
350
366
  Update a table version. Notably, updating an unreleased table version's
351
367
  lifecycle state to 'active' telegraphs that it is ready for external
@@ -410,15 +426,15 @@ def delete_stream(
410
426
 
411
427
  def delete_table(
412
428
  namespace: str,
413
- name: str,
429
+ table_name: str,
414
430
  purge: bool = False,
415
431
  *args,
416
432
  **kwargs,
417
433
  ) -> None:
418
434
  """
419
- Drops the given table and all its contents (table versions, streams, partitions,
420
- and deltas). If purge is True, also removes all data files associated with the table.
421
- Raises an error if the given table does not exist.
435
+ Drops the given table from the catalog. If purge is True, also removes
436
+ all data files associated with the table. Raises an error if the given table
437
+ does not exist.
422
438
  """
423
439
  raise NotImplementedError("delete_table not implemented")
424
440
 
@@ -430,10 +446,9 @@ def delete_namespace(
430
446
  **kwargs,
431
447
  ) -> None:
432
448
  """
433
- Drops a table namespace and all its contents. If purge is True, then all
434
- tables, table versions, and deltas will be deleted. Otherwise, the namespace
435
- will be dropped only if it is empty. Raises an error if the given namespace
436
- does not exist.
449
+ Drops the given namespace from the catalog. If purge is True, also removes
450
+ all data files associated with the namespace. Raises an error if the given
451
+ namespace does not exist.
437
452
  """
438
453
  raise NotImplementedError("drop_namespace not implemented")
439
454
 
@@ -509,6 +524,7 @@ def stage_partition(
509
524
  def commit_partition(
510
525
  partition: Partition,
511
526
  previous_partition: Optional[Partition] = None,
527
+ expected_previous_partition_id: Optional[str] = UNKNOWN_PARTITION_ID,
512
528
  *args,
513
529
  **kwargs,
514
530
  ) -> Partition:
@@ -586,23 +602,19 @@ def stage_delta(
586
602
  max_records_per_entry: Optional[int] = None,
587
603
  author: Optional[ManifestAuthor] = None,
588
604
  properties: Optional[DeltaProperties] = None,
589
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
605
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
590
606
  content_type: ContentType = ContentType.PARQUET,
591
607
  entry_params: Optional[EntryParams] = None,
608
+ entry_type: Optional[EntryType] = EntryType.DATA,
609
+ schema: Optional[Schema] = None,
610
+ sort_scheme_id: Optional[str] = None,
592
611
  *args,
593
612
  **kwargs,
594
613
  ) -> Delta:
595
614
  """
596
- Writes the given table to 1 or more S3 files. Returns an unregistered
615
+ Writes the given dataset to 1 or more files. Returns an unregistered
597
616
  delta whose manifest entries point to the uploaded files. Applies any
598
617
  schema consistency policies configured for the parent table version.
599
-
600
- The partition spec will be used to split the input table into
601
- multiple files. Optionally, partition_values can be provided to avoid
602
- this method to recompute partition_values from the provided data.
603
-
604
- Raises an error if the provided data does not conform to a unique ordered
605
- list of partition_values
606
618
  """
607
619
  raise NotImplementedError("stage_delta not implemented")
608
620
 
@@ -723,13 +735,23 @@ def table_version_exists(
723
735
 
724
736
  def can_categorize(e: BaseException, *args, **kwargs) -> bool:
725
737
  """
726
- Return whether input error is from storage implementation layer.
738
+ True if the input error originated from the storage
739
+ implementation layer and can be categorized under an
740
+ existing DeltaCatError. The "categorize_errors" decorator
741
+ uses this to determine if an unknown error from the storage
742
+ implementation can be categorized prior to casting it to
743
+ the equivalent DeltaCatError via `raise_categorized_error`
727
744
  """
728
745
  raise NotImplementedError
729
746
 
730
747
 
731
748
  def raise_categorized_error(e: BaseException, *args, **kwargs):
732
749
  """
733
- Raise and handle storage implementation layer specific errors.
750
+ Casts a categorizable error that originated from the storage
751
+ implementation layer to its equivalent DeltaCatError
752
+ for uniform handling (e.g., determining whether an error
753
+ is retryable or not) via the "categorize_errors" decorator.
754
+ Raises an UnclassifiedDeltaCatError from the input exception
755
+ if the error cannot be categorized.
734
756
  """
735
757
  raise NotImplementedError