deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,9 @@ from deltacat.storage.model.types import (
|
|
12
12
|
from deltacat.storage.model.schema import FieldLocator
|
13
13
|
from deltacat.storage.model.transform import Transform
|
14
14
|
|
15
|
+
UNSORTED_SCHEME_NAME = "unsorted_scheme"
|
16
|
+
UNSORTED_SCHEME_ID = "deadbeef-7277-49a4-a195-fdc8ed235d42"
|
17
|
+
|
15
18
|
|
16
19
|
class SortKey(tuple):
|
17
20
|
@staticmethod
|
@@ -103,6 +106,10 @@ class SortKeyList(List[SortKey]):
|
|
103
106
|
self[item] = val = SortKey(val)
|
104
107
|
return val
|
105
108
|
|
109
|
+
def __iter__(self):
|
110
|
+
for i in range(len(self)):
|
111
|
+
yield self[i] # This triggers __getitem__ conversion
|
112
|
+
|
106
113
|
|
107
114
|
class SortScheme(dict):
|
108
115
|
@staticmethod
|
@@ -112,6 +119,19 @@ class SortScheme(dict):
|
|
112
119
|
scheme_id: Optional[str] = None,
|
113
120
|
native_object: Optional[Any] = None,
|
114
121
|
) -> SortScheme:
|
122
|
+
# Validate keys if provided
|
123
|
+
if keys is not None:
|
124
|
+
# Check for empty keys list
|
125
|
+
if len(keys) == 0:
|
126
|
+
raise ValueError("Sort scheme cannot have empty keys list")
|
127
|
+
|
128
|
+
# Check for duplicate keys
|
129
|
+
key_names = []
|
130
|
+
for key in keys:
|
131
|
+
if key.key[0] in key_names:
|
132
|
+
raise ValueError(f"Duplicate sort key found: {key.key[0]}")
|
133
|
+
key_names.append(key.key[0])
|
134
|
+
|
115
135
|
return SortScheme(
|
116
136
|
{
|
117
137
|
"keys": keys,
|
@@ -132,6 +152,15 @@ class SortScheme(dict):
|
|
132
152
|
return False
|
133
153
|
if not isinstance(other, SortScheme):
|
134
154
|
other = SortScheme(other)
|
155
|
+
# If both have None keys, they are equivalent (for unsorted schemes)
|
156
|
+
if self.keys is None and other.keys is None:
|
157
|
+
return not check_identifiers or (
|
158
|
+
self.name == other.name and self.id == other.id
|
159
|
+
)
|
160
|
+
# If only one has None keys, they are not equivalent
|
161
|
+
if self.keys is None or other.keys is None:
|
162
|
+
return False
|
163
|
+
# Compare keys if both have them
|
135
164
|
for i in range(len(self.keys)):
|
136
165
|
if not self.keys[i].equivalent_to(other.keys[i]):
|
137
166
|
return False
|
@@ -173,6 +202,13 @@ class SortScheme(dict):
|
|
173
202
|
return self.get("nativeObject")
|
174
203
|
|
175
204
|
|
205
|
+
UNSORTED_SCHEME = SortScheme.of(
|
206
|
+
keys=None,
|
207
|
+
name=UNSORTED_SCHEME_NAME,
|
208
|
+
scheme_id=UNSORTED_SCHEME_ID,
|
209
|
+
)
|
210
|
+
|
211
|
+
|
176
212
|
class SortSchemeList(List[SortScheme]):
|
177
213
|
@staticmethod
|
178
214
|
def of(items: List[SortScheme]) -> SortSchemeList:
|
@@ -188,3 +224,7 @@ class SortSchemeList(List[SortScheme]):
|
|
188
224
|
if val is not None and not isinstance(val, SortScheme):
|
189
225
|
self[item] = val = SortScheme(val)
|
190
226
|
return val
|
227
|
+
|
228
|
+
def __iter__(self):
|
229
|
+
for i in range(len(self)):
|
230
|
+
yield self[i] # This triggers __getitem__ conversion
|
deltacat/storage/model/stream.py
CHANGED
@@ -178,6 +178,13 @@ class Stream(Metafile):
|
|
178
178
|
return stream_locator.table_version
|
179
179
|
return None
|
180
180
|
|
181
|
+
def url(self, catalog_name: Optional[str] = None) -> str:
|
182
|
+
return (
|
183
|
+
f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/"
|
184
|
+
if catalog_name
|
185
|
+
else f"table://{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/"
|
186
|
+
)
|
187
|
+
|
181
188
|
def to_serializable(self) -> Stream:
|
182
189
|
serializable = self
|
183
190
|
if serializable.table_locator:
|
@@ -382,8 +389,8 @@ class StreamLocatorAlias(Locator, dict):
|
|
382
389
|
),
|
383
390
|
}
|
384
391
|
)
|
385
|
-
if parent_stream.state
|
386
|
-
else None #
|
392
|
+
if parent_stream.state != CommitState.STAGED
|
393
|
+
else None # staged streams cannot be resolved by alias
|
387
394
|
)
|
388
395
|
|
389
396
|
@property
|
deltacat/storage/model/table.py
CHANGED
@@ -13,8 +13,9 @@ from deltacat.storage.model.namespace import (
|
|
13
13
|
)
|
14
14
|
from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
|
15
15
|
from deltacat.constants import TXN_DIR_NAME
|
16
|
+
from deltacat.types.tables import TableProperty
|
16
17
|
|
17
|
-
TableProperties =
|
18
|
+
TableProperties = Dict[str, Any]
|
18
19
|
|
19
20
|
|
20
21
|
class Table(Metafile):
|
@@ -126,6 +127,16 @@ class Table(Metafile):
|
|
126
127
|
if table_locator:
|
127
128
|
table_locator.table_name = table_name
|
128
129
|
|
130
|
+
def url(self, catalog_name: Optional[str] = None) -> str:
|
131
|
+
return (
|
132
|
+
f"dc://{catalog_name}/{self.namespace}/{self.table_name}/"
|
133
|
+
if catalog_name
|
134
|
+
else f"table://{self.namespace}/{self.table_name}/"
|
135
|
+
)
|
136
|
+
|
137
|
+
def read_table_property(self, property: TableProperty) -> Any:
|
138
|
+
return TableProperty.read_table_property(self, property)
|
139
|
+
|
129
140
|
def to_serializable(self) -> Table:
|
130
141
|
serializable = self
|
131
142
|
if serializable.namespace_locator:
|
@@ -34,6 +34,7 @@ from deltacat.storage.model.table import (
|
|
34
34
|
from deltacat.types.media import ContentType
|
35
35
|
from deltacat.storage.model.sort_key import SortScheme, SortSchemeList
|
36
36
|
from deltacat.storage.model.types import LifecycleState
|
37
|
+
from deltacat.types.tables import TableProperty
|
37
38
|
|
38
39
|
TableVersionProperties = Dict[str, Any]
|
39
40
|
|
@@ -251,6 +252,13 @@ class TableVersion(Metafile):
|
|
251
252
|
return table_version_locator.table_version
|
252
253
|
return None
|
253
254
|
|
255
|
+
def url(self, catalog_name: Optional[str] = None) -> str:
|
256
|
+
return (
|
257
|
+
f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/"
|
258
|
+
if catalog_name
|
259
|
+
else f"table://{self.namespace}/{self.table_name}/{self.table_version}/"
|
260
|
+
)
|
261
|
+
|
254
262
|
def is_supported_content_type(self, content_type: ContentType):
|
255
263
|
supported_content_types = self.content_types
|
256
264
|
return (not supported_content_types) or (
|
@@ -355,6 +363,9 @@ class TableVersion(Metafile):
|
|
355
363
|
)
|
356
364
|
return int(version_number) if version_number is not None else None
|
357
365
|
|
366
|
+
def read_table_property(self, property: TableProperty) -> Any:
|
367
|
+
return TableProperty.read_table_property(self, property)
|
368
|
+
|
358
369
|
@staticmethod
|
359
370
|
def next_version(previous_version: Optional[str] = None) -> str:
|
360
371
|
"""
|