deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/storage/interface.py
CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
|
|
2
2
|
|
3
3
|
from deltacat.storage import (
|
4
4
|
EntryParams,
|
5
|
+
EntryType,
|
5
6
|
Delta,
|
6
7
|
DeltaLocator,
|
7
8
|
DeltaProperties,
|
@@ -30,11 +31,12 @@ from deltacat.storage import (
|
|
30
31
|
TableVersionProperties,
|
31
32
|
)
|
32
33
|
from deltacat.storage.model.manifest import Manifest
|
34
|
+
from deltacat.storage.model.partition import UNKNOWN_PARTITION_ID
|
33
35
|
from deltacat.types.media import (
|
34
36
|
ContentType,
|
35
37
|
DistributedDatasetType,
|
36
38
|
StorageType,
|
37
|
-
|
39
|
+
DatasetType,
|
38
40
|
)
|
39
41
|
from deltacat.utils.common import ReadKwargsProvider
|
40
42
|
|
@@ -205,7 +207,7 @@ def get_latest_delta(
|
|
205
207
|
|
206
208
|
def download_delta(
|
207
209
|
delta_like: Union[Delta, DeltaLocator],
|
208
|
-
table_type:
|
210
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
209
211
|
storage_type: StorageType = StorageType.DISTRIBUTED,
|
210
212
|
max_parallelism: Optional[int] = None,
|
211
213
|
columns: Optional[List[str]] = None,
|
@@ -216,7 +218,7 @@ def download_delta(
|
|
216
218
|
**kwargs,
|
217
219
|
) -> Union[LocalDataset, DistributedDataset]: # type: ignore
|
218
220
|
"""
|
219
|
-
|
221
|
+
Reads the given delta or delta locator into either a list of
|
220
222
|
tables resident in the local node's memory, or into a dataset distributed
|
221
223
|
across this Ray cluster's object store memory. Ordered table N of a local
|
222
224
|
table list, or ordered block N of a distributed dataset, always contain
|
@@ -228,19 +230,19 @@ def download_delta(
|
|
228
230
|
def download_delta_manifest_entry(
|
229
231
|
delta_like: Union[Delta, DeltaLocator],
|
230
232
|
entry_index: int,
|
231
|
-
table_type:
|
233
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
232
234
|
columns: Optional[List[str]] = None,
|
233
235
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
234
236
|
*args,
|
235
237
|
**kwargs,
|
236
238
|
) -> LocalTable:
|
237
239
|
"""
|
238
|
-
|
240
|
+
Reads a single manifest entry into the specified table type for the
|
239
241
|
given delta or delta locator. If a delta is provided with a non-empty
|
240
|
-
manifest, then the entry is
|
241
|
-
manifest is first retrieved then the given entry index
|
242
|
+
manifest, then the entry is read from this manifest. Otherwise, the
|
243
|
+
manifest is first retrieved, and then the given entry index is read.
|
242
244
|
|
243
|
-
NOTE: The entry will be
|
245
|
+
NOTE: The entry will be read in the current node's memory.
|
244
246
|
"""
|
245
247
|
raise NotImplementedError("download_delta_manifest_entry not implemented")
|
246
248
|
|
@@ -288,9 +290,9 @@ def create_table_version(
|
|
288
290
|
namespace: str,
|
289
291
|
table_name: str,
|
290
292
|
table_version: Optional[str] = None,
|
293
|
+
lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
|
291
294
|
schema: Optional[Schema] = None,
|
292
295
|
partition_scheme: Optional[PartitionScheme] = None,
|
293
|
-
# TODO(pdames): rename to `sort_scheme`
|
294
296
|
sort_keys: Optional[SortScheme] = None,
|
295
297
|
table_version_description: Optional[str] = None,
|
296
298
|
table_version_properties: Optional[TableVersionProperties] = None,
|
@@ -299,9 +301,9 @@ def create_table_version(
|
|
299
301
|
supported_content_types: Optional[List[ContentType]] = None,
|
300
302
|
*args,
|
301
303
|
**kwargs,
|
302
|
-
) -> Tuple[
|
304
|
+
) -> Tuple[Table, TableVersion, Stream]:
|
303
305
|
"""
|
304
|
-
Create a table version with
|
306
|
+
Create a table version with the given or CREATED lifecycle state and an empty delta
|
305
307
|
stream. Table versions may be schemaless and unpartitioned to improve write
|
306
308
|
performance, or have their writes governed by a schema and partition scheme
|
307
309
|
to improve data consistency and read performance.
|
@@ -314,6 +316,20 @@ def create_table_version(
|
|
314
316
|
raise NotImplementedError("create_table_version not implemented")
|
315
317
|
|
316
318
|
|
319
|
+
def create_table(
|
320
|
+
namespace: str,
|
321
|
+
table_name: str,
|
322
|
+
description: Optional[str] = None,
|
323
|
+
properties: Optional[TableProperties] = None,
|
324
|
+
*args,
|
325
|
+
**kwargs,
|
326
|
+
) -> Table:
|
327
|
+
"""
|
328
|
+
Create a new table. Raises an error if the given table already exists.
|
329
|
+
"""
|
330
|
+
raise NotImplementedError("create_table not implemented")
|
331
|
+
|
332
|
+
|
317
333
|
def update_table(
|
318
334
|
namespace: str,
|
319
335
|
table_name: str,
|
@@ -322,7 +338,7 @@ def update_table(
|
|
322
338
|
new_table_name: Optional[str] = None,
|
323
339
|
*args,
|
324
340
|
**kwargs,
|
325
|
-
) ->
|
341
|
+
) -> Table:
|
326
342
|
"""
|
327
343
|
Update table metadata describing the table versions it contains. By default,
|
328
344
|
a table's properties are empty, and its description is equal to that given
|
@@ -345,7 +361,7 @@ def update_table_version(
|
|
345
361
|
sort_keys: Optional[SortScheme] = None,
|
346
362
|
*args,
|
347
363
|
**kwargs,
|
348
|
-
) ->
|
364
|
+
) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
|
349
365
|
"""
|
350
366
|
Update a table version. Notably, updating an unreleased table version's
|
351
367
|
lifecycle state to 'active' telegraphs that it is ready for external
|
@@ -410,15 +426,15 @@ def delete_stream(
|
|
410
426
|
|
411
427
|
def delete_table(
|
412
428
|
namespace: str,
|
413
|
-
|
429
|
+
table_name: str,
|
414
430
|
purge: bool = False,
|
415
431
|
*args,
|
416
432
|
**kwargs,
|
417
433
|
) -> None:
|
418
434
|
"""
|
419
|
-
Drops the given table
|
420
|
-
|
421
|
-
|
435
|
+
Drops the given table from the catalog. If purge is True, also removes
|
436
|
+
all data files associated with the table. Raises an error if the given table
|
437
|
+
does not exist.
|
422
438
|
"""
|
423
439
|
raise NotImplementedError("delete_table not implemented")
|
424
440
|
|
@@ -430,10 +446,9 @@ def delete_namespace(
|
|
430
446
|
**kwargs,
|
431
447
|
) -> None:
|
432
448
|
"""
|
433
|
-
Drops
|
434
|
-
|
435
|
-
|
436
|
-
does not exist.
|
449
|
+
Drops the given namespace from the catalog. If purge is True, also removes
|
450
|
+
all data files associated with the namespace. Raises an error if the given
|
451
|
+
namespace does not exist.
|
437
452
|
"""
|
438
453
|
raise NotImplementedError("drop_namespace not implemented")
|
439
454
|
|
@@ -509,6 +524,7 @@ def stage_partition(
|
|
509
524
|
def commit_partition(
|
510
525
|
partition: Partition,
|
511
526
|
previous_partition: Optional[Partition] = None,
|
527
|
+
expected_previous_partition_id: Optional[str] = UNKNOWN_PARTITION_ID,
|
512
528
|
*args,
|
513
529
|
**kwargs,
|
514
530
|
) -> Partition:
|
@@ -586,23 +602,19 @@ def stage_delta(
|
|
586
602
|
max_records_per_entry: Optional[int] = None,
|
587
603
|
author: Optional[ManifestAuthor] = None,
|
588
604
|
properties: Optional[DeltaProperties] = None,
|
589
|
-
|
605
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
590
606
|
content_type: ContentType = ContentType.PARQUET,
|
591
607
|
entry_params: Optional[EntryParams] = None,
|
608
|
+
entry_type: Optional[EntryType] = EntryType.DATA,
|
609
|
+
schema: Optional[Schema] = None,
|
610
|
+
sort_scheme_id: Optional[str] = None,
|
592
611
|
*args,
|
593
612
|
**kwargs,
|
594
613
|
) -> Delta:
|
595
614
|
"""
|
596
|
-
Writes the given
|
615
|
+
Writes the given dataset to 1 or more files. Returns an unregistered
|
597
616
|
delta whose manifest entries point to the uploaded files. Applies any
|
598
617
|
schema consistency policies configured for the parent table version.
|
599
|
-
|
600
|
-
The partition spec will be used to split the input table into
|
601
|
-
multiple files. Optionally, partition_values can be provided to avoid
|
602
|
-
this method to recompute partition_values from the provided data.
|
603
|
-
|
604
|
-
Raises an error if the provided data does not conform to a unique ordered
|
605
|
-
list of partition_values
|
606
618
|
"""
|
607
619
|
raise NotImplementedError("stage_delta not implemented")
|
608
620
|
|
@@ -723,13 +735,23 @@ def table_version_exists(
|
|
723
735
|
|
724
736
|
def can_categorize(e: BaseException, *args, **kwargs) -> bool:
|
725
737
|
"""
|
726
|
-
|
738
|
+
True if the input error originated from the storage
|
739
|
+
implementation layer and can be categorized under an
|
740
|
+
existing DeltaCatError. The "categorize_errors" decorator
|
741
|
+
uses this to determine if an unknown error from the storage
|
742
|
+
implementation can be categorized prior to casting it to
|
743
|
+
the equivalent DeltaCatError via `raise_categorized_error`
|
727
744
|
"""
|
728
745
|
raise NotImplementedError
|
729
746
|
|
730
747
|
|
731
748
|
def raise_categorized_error(e: BaseException, *args, **kwargs):
|
732
749
|
"""
|
733
|
-
|
750
|
+
Casts a categorizable error that originated from the storage
|
751
|
+
implementation layer to its equivalent DeltaCatError
|
752
|
+
for uniform handling (e.g., determining whether an error
|
753
|
+
is retryable or not) via the "categorize_errors" decorator.
|
754
|
+
Raises an UnclassifiedDeltaCatError from the input exception
|
755
|
+
if the error cannot be categorized.
|
734
756
|
"""
|
735
757
|
raise NotImplementedError
|