deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
from abc import abstractmethod
|
2
|
+
from typing import Iterable, Optional, Protocol, TypeVar, Union
|
3
|
+
|
4
|
+
from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
|
5
|
+
|
6
|
+
# TODO: Add type validation in dataset/schema classes
|
7
|
+
T = TypeVar("T", bound=Union[int, str])
|
8
|
+
|
9
|
+
|
10
|
+
class Shard(Protocol[T]):
|
11
|
+
"""
|
12
|
+
Abstract base class representing a shard with defined inclusive boundaries.
|
13
|
+
|
14
|
+
A shard represents a logical partition of data, defined by its
|
15
|
+
minimum and maximum keys. These keys determine the range of data
|
16
|
+
within a dataset that the shard encompasses.
|
17
|
+
"""
|
18
|
+
|
19
|
+
min_key: Optional[T]
|
20
|
+
max_key: Optional[T]
|
21
|
+
|
22
|
+
|
23
|
+
class ShardingStrategy(Protocol):
|
24
|
+
"""
|
25
|
+
A sharding strategy determines how the dataset is divided into shards.
|
26
|
+
"""
|
27
|
+
|
28
|
+
@staticmethod
|
29
|
+
def from_string(strategy: str) -> "ShardingStrategy":
|
30
|
+
"""
|
31
|
+
Factory method to create the appropriate ShardingStrategy from a string.
|
32
|
+
|
33
|
+
param: strategy: The string representation of the sharding strategy.
|
34
|
+
return: ShardingStrategy class.
|
35
|
+
"""
|
36
|
+
if strategy == "range":
|
37
|
+
from deltacat.storage.rivulet.shard.range_shard import RangeShardingStrategy
|
38
|
+
|
39
|
+
return RangeShardingStrategy()
|
40
|
+
else:
|
41
|
+
raise ValueError(f"Unsupported sharding strategy type: {strategy}")
|
42
|
+
|
43
|
+
@abstractmethod
|
44
|
+
def shards(self, num_shards: int, metastore: DatasetMetastore) -> Iterable[Shard]:
|
45
|
+
"""
|
46
|
+
Generate the shards based on the chosen strategy.
|
47
|
+
"""
|
@@ -1,33 +1,190 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from
|
4
|
+
from typing import Optional, Any, List, Tuple, Dict
|
5
5
|
|
6
|
+
from pyarrow.compute import SortOptions
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
|
8
|
+
from deltacat.storage.model.types import (
|
9
|
+
SortOrder,
|
10
|
+
NullOrder,
|
11
|
+
)
|
12
|
+
from deltacat.storage.model.schema import FieldLocator
|
13
|
+
from deltacat.storage.model.transform import Transform
|
10
14
|
|
11
15
|
|
12
16
|
class SortKey(tuple):
|
13
17
|
@staticmethod
|
14
|
-
def of(
|
18
|
+
def of(
|
19
|
+
key: Optional[List[FieldLocator]],
|
20
|
+
sort_order: SortOrder = SortOrder.ASCENDING,
|
21
|
+
null_order: NullOrder = NullOrder.AT_END,
|
22
|
+
transform: Optional[Transform] = None,
|
23
|
+
native_object: Optional[Any] = None,
|
24
|
+
) -> SortKey:
|
15
25
|
"""
|
16
26
|
Create a sort key from a field name to use as the sort key, and
|
17
27
|
the sort order for this key. If no sort order is specified, then the
|
18
|
-
data will be sorted in ascending order by default.
|
19
|
-
always keeps the LAST occurrence of this key post-sort. For example, if
|
20
|
-
you used an integer column as your sort key which contained the values
|
21
|
-
[2, 1, 3] specifying SortOrder.ASCENDING would ensure that the
|
22
|
-
value [3] is kept over [2, 1], and specifying SortOrder.DESCENDING
|
23
|
-
would ensure that [1] is kept over [2, 3].
|
28
|
+
data will be sorted in ascending order by default.
|
24
29
|
"""
|
25
|
-
return SortKey(
|
30
|
+
return SortKey(
|
31
|
+
(
|
32
|
+
key,
|
33
|
+
sort_order.value if isinstance(sort_order, SortOrder) else sort_order,
|
34
|
+
null_order.value if isinstance(null_order, NullOrder) else null_order,
|
35
|
+
transform,
|
36
|
+
native_object,
|
37
|
+
)
|
38
|
+
)
|
39
|
+
|
40
|
+
def equivalent_to(
|
41
|
+
self,
|
42
|
+
other: SortKey,
|
43
|
+
):
|
44
|
+
if other is None:
|
45
|
+
return False
|
46
|
+
if not isinstance(other, tuple):
|
47
|
+
return False
|
48
|
+
if not isinstance(other, SortKey):
|
49
|
+
other = SortKey(other)
|
50
|
+
return (
|
51
|
+
self.key == other.key
|
52
|
+
and self.transform == other.transform
|
53
|
+
and self.sort_order == other.sort_order
|
54
|
+
and self.null_order == other.null_order
|
55
|
+
)
|
26
56
|
|
27
57
|
@property
|
28
|
-
def
|
58
|
+
def key(self) -> Optional[List[FieldLocator]]:
|
29
59
|
return self[0]
|
30
60
|
|
31
61
|
@property
|
32
62
|
def sort_order(self) -> SortOrder:
|
33
63
|
return SortOrder(self[1])
|
64
|
+
|
65
|
+
@property
|
66
|
+
def null_order(self) -> NullOrder:
|
67
|
+
return NullOrder(self[2])
|
68
|
+
|
69
|
+
@property
|
70
|
+
def transform(self) -> Optional[Transform]:
|
71
|
+
val: Dict[str, Any] = (
|
72
|
+
Transform(self[3]) if len(self) >= 4 and self[3] is not None else None
|
73
|
+
)
|
74
|
+
return val
|
75
|
+
|
76
|
+
@property
|
77
|
+
def arrow(self) -> List[Tuple[str, str]]:
|
78
|
+
# TODO(pdames): Convert unsupported field locators to arrow field names,
|
79
|
+
# and transforms/multi-key-sorts to pyarrow compute expressions. Add
|
80
|
+
# null order via SortOptions when supported per field by Arrow.
|
81
|
+
return (
|
82
|
+
[(field_locator, self[1]) for field_locator in self[0]] if self[0] else []
|
83
|
+
)
|
84
|
+
|
85
|
+
@property
|
86
|
+
def native_object(self) -> Optional[Any]:
|
87
|
+
return self[4] if len(self) >= 5 else None
|
88
|
+
|
89
|
+
|
90
|
+
class SortKeyList(List[SortKey]):
|
91
|
+
@staticmethod
|
92
|
+
def of(items: List[SortKey]) -> SortKeyList:
|
93
|
+
typed_items = SortKeyList()
|
94
|
+
for item in items:
|
95
|
+
if item is not None and not isinstance(item, SortKey):
|
96
|
+
item = SortKey(item)
|
97
|
+
typed_items.append(item)
|
98
|
+
return typed_items
|
99
|
+
|
100
|
+
def __getitem__(self, item):
|
101
|
+
val = super().__getitem__(item)
|
102
|
+
if val is not None and not isinstance(val, SortKey):
|
103
|
+
self[item] = val = SortKey(val)
|
104
|
+
return val
|
105
|
+
|
106
|
+
|
107
|
+
class SortScheme(dict):
|
108
|
+
@staticmethod
|
109
|
+
def of(
|
110
|
+
keys: Optional[SortKeyList],
|
111
|
+
name: Optional[str] = None,
|
112
|
+
scheme_id: Optional[str] = None,
|
113
|
+
native_object: Optional[Any] = None,
|
114
|
+
) -> SortScheme:
|
115
|
+
return SortScheme(
|
116
|
+
{
|
117
|
+
"keys": keys,
|
118
|
+
"name": name,
|
119
|
+
"id": scheme_id,
|
120
|
+
"nativeObject": native_object,
|
121
|
+
}
|
122
|
+
)
|
123
|
+
|
124
|
+
def equivalent_to(
|
125
|
+
self,
|
126
|
+
other: SortScheme,
|
127
|
+
check_identifiers: bool = False,
|
128
|
+
) -> bool:
|
129
|
+
if other is None:
|
130
|
+
return False
|
131
|
+
if not isinstance(other, dict):
|
132
|
+
return False
|
133
|
+
if not isinstance(other, SortScheme):
|
134
|
+
other = SortScheme(other)
|
135
|
+
for i in range(len(self.keys)):
|
136
|
+
if not self.keys[i].equivalent_to(other.keys[i]):
|
137
|
+
return False
|
138
|
+
return not check_identifiers or (
|
139
|
+
self.name == other.name and self.id == other.id
|
140
|
+
)
|
141
|
+
|
142
|
+
@property
|
143
|
+
def keys(self) -> Optional[SortKeyList]:
|
144
|
+
val: List[SortKey] = self.get("keys")
|
145
|
+
if val is not None and not isinstance(val, SortKeyList):
|
146
|
+
self["keys"] = val = SortKeyList.of(val)
|
147
|
+
return val
|
148
|
+
|
149
|
+
@property
|
150
|
+
def name(self) -> Optional[str]:
|
151
|
+
return self.get("name")
|
152
|
+
|
153
|
+
@property
|
154
|
+
def id(self) -> Optional[str]:
|
155
|
+
return self.get("id")
|
156
|
+
|
157
|
+
@property
|
158
|
+
def arrow(self) -> SortOptions:
|
159
|
+
# TODO(pdames): Remove homogenous null ordering when supported by Arrow.
|
160
|
+
if self.keys:
|
161
|
+
if len(set([key.null_order for key in self.keys])) == 1:
|
162
|
+
return SortOptions(
|
163
|
+
sort_keys=[pa_key for k in self.keys for pa_key in k.arrow],
|
164
|
+
null_placement=self.keys[0].null_order.value,
|
165
|
+
)
|
166
|
+
else:
|
167
|
+
err_msg = "All arrow sort keys must use the same null order."
|
168
|
+
raise ValueError(err_msg)
|
169
|
+
return SortOptions()
|
170
|
+
|
171
|
+
@property
|
172
|
+
def native_object(self) -> Optional[Any]:
|
173
|
+
return self.get("nativeObject")
|
174
|
+
|
175
|
+
|
176
|
+
class SortSchemeList(List[SortScheme]):
|
177
|
+
@staticmethod
|
178
|
+
def of(items: List[SortScheme]) -> SortSchemeList:
|
179
|
+
typed_items = SortSchemeList()
|
180
|
+
for item in items:
|
181
|
+
if item is not None and not isinstance(item, SortScheme):
|
182
|
+
item = SortScheme(item)
|
183
|
+
typed_items.append(item)
|
184
|
+
return typed_items
|
185
|
+
|
186
|
+
def __getitem__(self, item):
|
187
|
+
val = super().__getitem__(item)
|
188
|
+
if val is not None and not isinstance(val, SortScheme):
|
189
|
+
self[item] = val = SortScheme(val)
|
190
|
+
return val
|
deltacat/storage/model/stream.py
CHANGED
@@ -1,31 +1,54 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
|
4
|
+
import posixpath
|
5
5
|
|
6
|
-
|
6
|
+
import pyarrow
|
7
|
+
|
8
|
+
import deltacat.storage.model.partition as partition
|
9
|
+
|
10
|
+
from typing import Any, Dict, Optional, List
|
11
|
+
|
12
|
+
from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
|
13
|
+
from deltacat.constants import TXN_DIR_NAME
|
14
|
+
from deltacat.storage.model.locator import (
|
15
|
+
Locator,
|
16
|
+
LocatorName,
|
17
|
+
)
|
7
18
|
from deltacat.storage.model.namespace import NamespaceLocator
|
8
|
-
from deltacat.storage.model.table import
|
19
|
+
from deltacat.storage.model.table import (
|
20
|
+
TableLocator,
|
21
|
+
Table,
|
22
|
+
)
|
9
23
|
from deltacat.storage.model.table_version import TableVersionLocator
|
10
|
-
from deltacat.storage.model.types import
|
11
|
-
|
24
|
+
from deltacat.storage.model.types import (
|
25
|
+
CommitState,
|
26
|
+
StreamFormat,
|
27
|
+
)
|
28
|
+
|
12
29
|
|
30
|
+
class Stream(Metafile):
|
31
|
+
"""
|
32
|
+
An unbounded stream of Deltas, where each delta's records are optionally
|
33
|
+
partitioned according to the given partition scheme.
|
34
|
+
"""
|
13
35
|
|
14
|
-
class Stream(dict):
|
15
36
|
@staticmethod
|
16
37
|
def of(
|
17
38
|
locator: Optional[StreamLocator],
|
18
|
-
|
39
|
+
partition_scheme: Optional[partition.PartitionScheme],
|
19
40
|
state: Optional[CommitState] = None,
|
20
|
-
|
21
|
-
|
41
|
+
previous_stream_id: Optional[str] = None,
|
42
|
+
watermark: Optional[int] = None,
|
43
|
+
native_object: Optional[Any] = None,
|
22
44
|
) -> Stream:
|
23
45
|
stream = Stream()
|
24
46
|
stream.locator = locator
|
25
|
-
stream.
|
47
|
+
stream.partition_scheme = partition_scheme
|
26
48
|
stream.state = state
|
27
|
-
stream.
|
28
|
-
stream.
|
49
|
+
stream.previous_stream_id = previous_stream_id
|
50
|
+
stream.watermark = watermark
|
51
|
+
stream.native_object = native_object
|
29
52
|
return stream
|
30
53
|
|
31
54
|
@property
|
@@ -40,31 +63,44 @@ class Stream(dict):
|
|
40
63
|
self["streamLocator"] = stream_locator
|
41
64
|
|
42
65
|
@property
|
43
|
-
def
|
44
|
-
|
45
|
-
Ordered list of unique column names in the table schema on
|
46
|
-
which the underlying data is partitioned. Either partition_spec
|
47
|
-
or partition_keys must be specified but not both.
|
66
|
+
def locator_alias(self) -> Optional[StreamLocatorAlias]:
|
67
|
+
return StreamLocatorAlias.of(self)
|
48
68
|
|
49
|
-
|
50
|
-
|
69
|
+
@property
|
70
|
+
def partition_scheme(self) -> Optional[partition.PartitionScheme]:
|
51
71
|
"""
|
52
|
-
|
72
|
+
A table's partition keys are defined within the context of a
|
73
|
+
Partition Scheme, which supports defining both fields to partition
|
74
|
+
a table by and optional transforms to apply to those fields to
|
75
|
+
derive the Partition Values that a given field, and its corresponding
|
76
|
+
record, belong to.
|
77
|
+
"""
|
78
|
+
val: Dict[str, Any] = self.get("partitionScheme")
|
79
|
+
if val is not None and not isinstance(val, partition.PartitionScheme):
|
80
|
+
self.partition_scheme = val = partition.PartitionScheme(val)
|
81
|
+
return val
|
53
82
|
|
54
|
-
@
|
55
|
-
def
|
56
|
-
self[
|
83
|
+
@partition_scheme.setter
|
84
|
+
def partition_scheme(
|
85
|
+
self, partition_scheme: Optional[partition.PartitionScheme]
|
86
|
+
) -> None:
|
87
|
+
self["partitionScheme"] = partition_scheme
|
57
88
|
|
58
89
|
@property
|
59
|
-
def
|
60
|
-
""
|
61
|
-
Previous stream digest
|
62
|
-
"""
|
63
|
-
return self.get("previousStreamDigest")
|
90
|
+
def previous_stream_id(self) -> Optional[str]:
|
91
|
+
return self.get("previousStreamId")
|
64
92
|
|
65
|
-
@
|
66
|
-
def
|
67
|
-
self["
|
93
|
+
@previous_stream_id.setter
|
94
|
+
def previous_stream_id(self, previous_stream_id: Optional[str]) -> None:
|
95
|
+
self["previousStreamId"] = previous_stream_id
|
96
|
+
|
97
|
+
@property
|
98
|
+
def watermark(self) -> Optional[int]:
|
99
|
+
return self.get("watermark")
|
100
|
+
|
101
|
+
@watermark.setter
|
102
|
+
def watermark(self, watermark: Optional[int]) -> None:
|
103
|
+
self["watermark"] = watermark
|
68
104
|
|
69
105
|
@property
|
70
106
|
def state(self) -> Optional[CommitState]:
|
@@ -79,24 +115,12 @@ class Stream(dict):
|
|
79
115
|
self["state"] = state
|
80
116
|
|
81
117
|
@property
|
82
|
-
def
|
83
|
-
""
|
84
|
-
If a table uses complex partitioning instead of identity,
|
85
|
-
partition spec can be specified to define that strategy.
|
86
|
-
For example, a partition spec can define a bucketing strategy
|
87
|
-
on composite column values or can define iceberg compliant
|
88
|
-
bucketing.
|
118
|
+
def native_object(self) -> Optional[Any]:
|
119
|
+
return self.get("nativeObject")
|
89
120
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
if val is not None and not isinstance(val, StreamPartitionSpec):
|
94
|
-
self.partition_spec = val = StreamPartitionSpec(val)
|
95
|
-
return val
|
96
|
-
|
97
|
-
@partition_spec.setter
|
98
|
-
def partition_spec(self, spec: StreamPartitionSpec) -> None:
|
99
|
-
self["partitionSpec"] = spec
|
121
|
+
@native_object.setter
|
122
|
+
def native_object(self, native_object: Optional[Any]) -> None:
|
123
|
+
self["nativeObject"] = native_object
|
100
124
|
|
101
125
|
@property
|
102
126
|
def namespace_locator(self) -> Optional[NamespaceLocator]:
|
@@ -126,6 +150,13 @@ class Stream(dict):
|
|
126
150
|
return stream_locator.stream_id
|
127
151
|
return None
|
128
152
|
|
153
|
+
@property
|
154
|
+
def stream_format(self) -> Optional[str]:
|
155
|
+
stream_locator = self.locator
|
156
|
+
if stream_locator:
|
157
|
+
return stream_locator.format
|
158
|
+
return None
|
159
|
+
|
129
160
|
@property
|
130
161
|
def namespace(self) -> Optional[str]:
|
131
162
|
stream_locator = self.locator
|
@@ -147,16 +178,65 @@ class Stream(dict):
|
|
147
178
|
return stream_locator.table_version
|
148
179
|
return None
|
149
180
|
|
150
|
-
def
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
f"{num_keys} partition keys: {self}"
|
181
|
+
def to_serializable(self) -> Stream:
|
182
|
+
serializable = self
|
183
|
+
if serializable.table_locator:
|
184
|
+
serializable: Stream = Stream.update_for(self)
|
185
|
+
# remove the mutable table locator
|
186
|
+
serializable.table_version_locator.table_locator = TableLocator.at(
|
187
|
+
namespace=self.id,
|
188
|
+
table_name=self.id,
|
159
189
|
)
|
190
|
+
return serializable
|
191
|
+
|
192
|
+
def from_serializable(
|
193
|
+
self,
|
194
|
+
path: str,
|
195
|
+
filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
196
|
+
) -> Stream:
|
197
|
+
# restore the table locator from its mapped immutable metafile ID
|
198
|
+
if self.table_locator and self.table_locator.table_name == self.id:
|
199
|
+
parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
|
200
|
+
base_metafile_path=path,
|
201
|
+
parent_number=2,
|
202
|
+
)
|
203
|
+
txn_log_dir = posixpath.join(
|
204
|
+
posixpath.dirname(
|
205
|
+
posixpath.dirname(
|
206
|
+
posixpath.dirname(parent_rev_dir_path),
|
207
|
+
)
|
208
|
+
),
|
209
|
+
TXN_DIR_NAME,
|
210
|
+
)
|
211
|
+
table = Table.read(
|
212
|
+
MetafileRevisionInfo.latest_revision(
|
213
|
+
revision_dir_path=parent_rev_dir_path,
|
214
|
+
filesystem=filesystem,
|
215
|
+
success_txn_log_dir=txn_log_dir,
|
216
|
+
).path,
|
217
|
+
filesystem,
|
218
|
+
)
|
219
|
+
self.table_version_locator.table_locator = table.locator
|
220
|
+
return self
|
221
|
+
|
222
|
+
|
223
|
+
class StreamLocatorName(LocatorName):
|
224
|
+
def __init__(self, locator: StreamLocator):
|
225
|
+
self.locator = locator
|
226
|
+
|
227
|
+
@property
|
228
|
+
def immutable_id(self) -> Optional[str]:
|
229
|
+
return self.locator.stream_id
|
230
|
+
|
231
|
+
@immutable_id.setter
|
232
|
+
def immutable_id(self, immutable_id: Optional[str]):
|
233
|
+
self.locator.stream_id = immutable_id
|
234
|
+
|
235
|
+
def parts(self) -> List[str]:
|
236
|
+
return [
|
237
|
+
self.locator.stream_id,
|
238
|
+
self.locator.format,
|
239
|
+
]
|
160
240
|
|
161
241
|
|
162
242
|
class StreamLocator(Locator, dict):
|
@@ -164,7 +244,7 @@ class StreamLocator(Locator, dict):
|
|
164
244
|
def of(
|
165
245
|
table_version_locator: Optional[TableVersionLocator],
|
166
246
|
stream_id: Optional[str],
|
167
|
-
|
247
|
+
stream_format: Optional[StreamFormat],
|
168
248
|
) -> StreamLocator:
|
169
249
|
"""
|
170
250
|
Creates a table version Stream Locator. All input parameters are
|
@@ -173,7 +253,11 @@ class StreamLocator(Locator, dict):
|
|
173
253
|
stream_locator = StreamLocator()
|
174
254
|
stream_locator.table_version_locator = table_version_locator
|
175
255
|
stream_locator.stream_id = stream_id
|
176
|
-
stream_locator.
|
256
|
+
stream_locator.format = (
|
257
|
+
stream_format.value
|
258
|
+
if isinstance(stream_format, StreamFormat)
|
259
|
+
else stream_format
|
260
|
+
)
|
177
261
|
return stream_locator
|
178
262
|
|
179
263
|
@staticmethod
|
@@ -182,19 +266,31 @@ class StreamLocator(Locator, dict):
|
|
182
266
|
table_name: Optional[str],
|
183
267
|
table_version: Optional[str],
|
184
268
|
stream_id: Optional[str],
|
185
|
-
|
269
|
+
stream_format: Optional[StreamFormat],
|
186
270
|
) -> StreamLocator:
|
187
|
-
table_version_locator =
|
188
|
-
|
189
|
-
|
190
|
-
|
271
|
+
table_version_locator = (
|
272
|
+
TableVersionLocator.at(
|
273
|
+
namespace,
|
274
|
+
table_name,
|
275
|
+
table_version,
|
276
|
+
)
|
277
|
+
if table_version
|
278
|
+
else None
|
191
279
|
)
|
192
280
|
return StreamLocator.of(
|
193
281
|
table_version_locator,
|
194
282
|
stream_id,
|
195
|
-
|
283
|
+
stream_format,
|
196
284
|
)
|
197
285
|
|
286
|
+
@property
|
287
|
+
def name(self) -> StreamLocatorName:
|
288
|
+
return StreamLocatorName(self)
|
289
|
+
|
290
|
+
@property
|
291
|
+
def parent(self) -> Optional[TableVersionLocator]:
|
292
|
+
return self.table_version_locator
|
293
|
+
|
198
294
|
@property
|
199
295
|
def table_version_locator(self) -> Optional[TableVersionLocator]:
|
200
296
|
val: Dict[str, Any] = self.get("tableVersionLocator")
|
@@ -217,12 +313,12 @@ class StreamLocator(Locator, dict):
|
|
217
313
|
self["streamId"] = stream_id
|
218
314
|
|
219
315
|
@property
|
220
|
-
def
|
221
|
-
return self.get("
|
316
|
+
def format(self) -> Optional[str]:
|
317
|
+
return self.get("format")
|
222
318
|
|
223
|
-
@
|
224
|
-
def
|
225
|
-
self["
|
319
|
+
@format.setter
|
320
|
+
def format(self, stream_format: Optional[str]) -> None:
|
321
|
+
self["format"] = stream_format
|
226
322
|
|
227
323
|
@property
|
228
324
|
def namespace_locator(self) -> Optional[NamespaceLocator]:
|
@@ -259,13 +355,45 @@ class StreamLocator(Locator, dict):
|
|
259
355
|
return table_version_locator.table_version
|
260
356
|
return None
|
261
357
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
358
|
+
|
359
|
+
class StreamLocatorAliasName(LocatorName):
|
360
|
+
def __init__(self, locator: StreamLocatorAlias):
|
361
|
+
self.locator = locator
|
362
|
+
|
363
|
+
@property
|
364
|
+
def immutable_id(self) -> Optional[str]:
|
365
|
+
return None
|
366
|
+
|
367
|
+
def parts(self) -> List[str]:
|
368
|
+
return [self.locator.format]
|
369
|
+
|
370
|
+
|
371
|
+
class StreamLocatorAlias(Locator, dict):
|
372
|
+
@staticmethod
|
373
|
+
def of(
|
374
|
+
parent_stream: Stream,
|
375
|
+
) -> StreamLocatorAlias:
|
376
|
+
return (
|
377
|
+
StreamLocatorAlias(
|
378
|
+
{
|
379
|
+
"format": parent_stream.stream_format,
|
380
|
+
"parent": (
|
381
|
+
parent_stream.locator.parent if parent_stream.locator else None
|
382
|
+
),
|
383
|
+
}
|
384
|
+
)
|
385
|
+
if parent_stream.state == CommitState.COMMITTED
|
386
|
+
else None # only committed streams can be resolved by alias
|
387
|
+
)
|
388
|
+
|
389
|
+
@property
|
390
|
+
def format(self) -> Optional[str]:
|
391
|
+
return self.get("format")
|
392
|
+
|
393
|
+
@property
|
394
|
+
def name(self) -> StreamLocatorAliasName:
|
395
|
+
return StreamLocatorAliasName(self)
|
396
|
+
|
397
|
+
@property
|
398
|
+
def parent(self) -> Optional[Locator]:
|
399
|
+
return self.get("parent")
|