deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/utils/url.py
CHANGED
@@ -43,6 +43,28 @@ from deltacat.storage import (
|
|
43
43
|
TableVersionLocator,
|
44
44
|
)
|
45
45
|
|
46
|
+
|
47
|
+
def _normalize_partition_values_from_json(partition_values):
|
48
|
+
"""
|
49
|
+
Normalize partition values parsed from JSON URLs.
|
50
|
+
|
51
|
+
Both None and empty list [] represent unpartitioned data, but they should be
|
52
|
+
normalized to None for consistent lookup and validation.
|
53
|
+
|
54
|
+
Args:
|
55
|
+
partition_values: Partition values parsed from JSON
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
None for unpartitioned data (both None and [] inputs),
|
59
|
+
original value for partitioned data
|
60
|
+
"""
|
61
|
+
if partition_values is None or (
|
62
|
+
isinstance(partition_values, list) and len(partition_values) == 0
|
63
|
+
):
|
64
|
+
return None
|
65
|
+
return partition_values
|
66
|
+
|
67
|
+
|
46
68
|
RAY_DATASTORE_TYPE_TO_READER = {
|
47
69
|
DatastoreType.AUDIO: lambda url: functools.partial(
|
48
70
|
ray.data.read_audio,
|
@@ -753,7 +775,7 @@ class DeltaCatUrl:
|
|
753
775
|
):
|
754
776
|
self.stream = StreamFormat.DELTACAT
|
755
777
|
else:
|
756
|
-
self.stream = StreamFormat(self.
|
778
|
+
self.stream = StreamFormat(self.unresolved_stream)
|
757
779
|
|
758
780
|
def __str__(self):
|
759
781
|
return self.url
|
@@ -762,6 +784,23 @@ class DeltaCatUrl:
|
|
762
784
|
return self.url
|
763
785
|
|
764
786
|
|
787
|
+
def _list_table_versions(table: Table, catalog: CatalogProperties):
|
788
|
+
return metastore.list_table_versions(
|
789
|
+
namespace=table.namespace,
|
790
|
+
table_name=table.table_name,
|
791
|
+
catalog=catalog,
|
792
|
+
)
|
793
|
+
|
794
|
+
|
795
|
+
def _list_streams(table_version: TableVersion, catalog: CatalogProperties):
|
796
|
+
return metastore.list_streams(
|
797
|
+
namespace=table_version.namespace,
|
798
|
+
table_name=table_version.table_name,
|
799
|
+
table_version=table_version.table_version,
|
800
|
+
catalog=catalog,
|
801
|
+
)
|
802
|
+
|
803
|
+
|
765
804
|
class DeltaCatUrlReader:
|
766
805
|
def __init__(
|
767
806
|
self,
|
@@ -1004,14 +1043,11 @@ class DeltaCatUrlReader:
|
|
1004
1043
|
catalog=url.catalog,
|
1005
1044
|
)
|
1006
1045
|
table_version_lister = functools.partial(
|
1007
|
-
|
1008
|
-
namespace=url.namespace,
|
1046
|
+
_list_table_versions,
|
1009
1047
|
catalog=url.catalog,
|
1010
1048
|
)
|
1011
1049
|
stream_lister = functools.partial(
|
1012
|
-
|
1013
|
-
namespace=url.namespace,
|
1014
|
-
table_name=url.table,
|
1050
|
+
_list_streams,
|
1015
1051
|
catalog=url.catalog,
|
1016
1052
|
)
|
1017
1053
|
partition_lister = functools.partial(
|
@@ -1025,8 +1061,8 @@ class DeltaCatUrlReader:
|
|
1025
1061
|
return [
|
1026
1062
|
(namespace_lister, None, None),
|
1027
1063
|
(table_lister, "namespace", lambda x: x.namespace),
|
1028
|
-
(table_version_lister, "
|
1029
|
-
(stream_lister, "table_version", lambda x: x
|
1064
|
+
(table_version_lister, "table", lambda x: x),
|
1065
|
+
(stream_lister, "table_version", lambda x: x),
|
1030
1066
|
(partition_lister, "stream", lambda x: x),
|
1031
1067
|
(delta_lister, "partition_like", lambda x: x),
|
1032
1068
|
]
|
@@ -1094,7 +1130,11 @@ def _stage_and_commit_partition(
|
|
1094
1130
|
namespace=partition.namespace,
|
1095
1131
|
table_name=partition.table_name,
|
1096
1132
|
table_version=partition.table_version,
|
1097
|
-
stream_format=StreamFormat(
|
1133
|
+
stream_format=StreamFormat(
|
1134
|
+
partition.stream_format or StreamFormat.DELTACAT.value
|
1135
|
+
),
|
1136
|
+
*args,
|
1137
|
+
**kwargs,
|
1098
1138
|
)
|
1099
1139
|
partition = metastore.stage_partition(
|
1100
1140
|
stream=stream,
|
@@ -1119,6 +1159,7 @@ class DeltaCatUrlWriter:
|
|
1119
1159
|
):
|
1120
1160
|
self._url = url
|
1121
1161
|
self._metafile = metafile
|
1162
|
+
|
1122
1163
|
if url.is_deltacat_catalog_url():
|
1123
1164
|
if url.path_elements:
|
1124
1165
|
url.resolve_catalog()
|
@@ -1170,7 +1211,7 @@ class DeltaCatUrlWriter:
|
|
1170
1211
|
# TODO(pdames): Honor deep vs. shallow copies. Deep copies require
|
1171
1212
|
# first ensuring that all files in the source delta manifest are
|
1172
1213
|
# staged to the target catalog before commit. For deltas whose
|
1173
|
-
# manifests reference local files, shallow delta copies
|
1214
|
+
# manifests reference local files, shallow delta copies may be
|
1174
1215
|
# invalid in the target catalog, and should be blocked or
|
1175
1216
|
# converted to a deep copy automatically.
|
1176
1217
|
return functools.partial(
|
@@ -1187,6 +1228,7 @@ class DeltaCatUrlWriter:
|
|
1187
1228
|
stream_id=None,
|
1188
1229
|
stream_format=url.stream,
|
1189
1230
|
partition_values=json.loads(url.partition),
|
1231
|
+
partition_id=None,
|
1190
1232
|
)
|
1191
1233
|
return functools.partial(
|
1192
1234
|
_stage_and_commit_partition,
|
@@ -1219,13 +1261,12 @@ class DeltaCatUrlWriter:
|
|
1219
1261
|
namespace=table_version.namespace,
|
1220
1262
|
table_name=table_version.table_name,
|
1221
1263
|
table_version=table_version.table_version,
|
1264
|
+
lifecycle_state=table_version.state,
|
1222
1265
|
schema=table_version.schema,
|
1223
1266
|
partition_scheme=table_version.partition_scheme,
|
1224
1267
|
sort_keys=table_version.sort_scheme,
|
1225
1268
|
table_version_description=table_version.description,
|
1226
1269
|
table_version_properties=table_version.properties,
|
1227
|
-
table_description=table_version.description,
|
1228
|
-
table_properties=table_version.properties,
|
1229
1270
|
supported_content_types=table_version.content_types,
|
1230
1271
|
catalog=url.catalog,
|
1231
1272
|
)
|
@@ -1236,11 +1277,11 @@ class DeltaCatUrlWriter:
|
|
1236
1277
|
table_name=url.table,
|
1237
1278
|
)
|
1238
1279
|
return functools.partial(
|
1239
|
-
metastore.
|
1280
|
+
metastore.create_table,
|
1240
1281
|
namespace=table.namespace,
|
1241
1282
|
table_name=table.table_name,
|
1242
|
-
|
1243
|
-
|
1283
|
+
description=table.description,
|
1284
|
+
properties=table.properties,
|
1244
1285
|
catalog=url.catalog,
|
1245
1286
|
)
|
1246
1287
|
if url.unresolved_namespace:
|