deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,18 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
from typing import List
|
3
2
|
from enum import Enum
|
3
|
+
from typing import Dict, Any, Optional
|
4
4
|
|
5
5
|
|
6
6
|
class TransformName(str, Enum):
    """
    String-valued identifiers for the supported transform kinds.
    """

    # Transforms that need no extra configuration besides their name.
    IDENTITY = "identity"
    # Bucket transform.
    BUCKET = "bucket"
    # Date/time component transforms.
    YEAR = "year"
    MONTH = "month"
    DAY = "day"
    HOUR = "hour"
    # Truncate transform.
    TRUNCATE = "truncate"
    # Void and unknown transforms.
    VOID = "void"
    UNKNOWN = "unknown"
|
9
16
|
|
10
17
|
|
11
18
|
class TransformParameters(dict):
|
@@ -17,63 +24,30 @@ class TransformParameters(dict):
|
|
17
24
|
pass
|
18
25
|
|
19
26
|
|
20
|
-
class IdentityTransformParameters(TransformParameters):
|
21
|
-
"""
|
22
|
-
This class is used to pass parameters to the identity transform
|
23
|
-
"""
|
24
|
-
|
25
|
-
@staticmethod
|
26
|
-
def of(column_name: str) -> IdentityTransformParameters:
|
27
|
-
identify_transform_parameters = IdentityTransformParameters()
|
28
|
-
identify_transform_parameters["columnName"] = column_name
|
29
|
-
return identify_transform_parameters
|
30
|
-
|
31
|
-
@property
|
32
|
-
def column_name(self) -> str:
|
33
|
-
"""
|
34
|
-
The name of the column to use for identity transform
|
35
|
-
"""
|
36
|
-
return self["columnName"]
|
37
|
-
|
38
|
-
@column_name.setter
|
39
|
-
def column_name(self, value: str) -> None:
|
40
|
-
self["columnName"] = value
|
41
|
-
|
42
|
-
|
43
27
|
class BucketingStrategy(str, Enum):
    """
    A bucketing strategy for the transform
    """

    # DeltaCAT's default strategy: SHA-1 based hash bucketing.
    DEFAULT = "default"

    # Iceberg-compliant strategy: murmur3 based hash bucketing.
    # See https://iceberg.apache.org/spec/#partitioning
    ICEBERG = "iceberg"
|
61
37
|
|
62
38
|
|
63
39
|
class BucketTransformParameters(TransformParameters):
    """
    Parameters for the bucket transform.
    """

    @staticmethod
    def of(
        num_buckets: int,
        bucketing_strategy: BucketingStrategy,
    ) -> BucketTransformParameters:
        """
        Build bucket transform parameters from a bucket count and a
        bucketing strategy.
        """
        params = BucketTransformParameters()
        params["numBuckets"] = num_buckets
        params["bucketingStrategy"] = bucketing_strategy
        return params

    @property
    def num_buckets(self) -> int:
        """
        The total number of buckets to create.
        """
        return self["numBuckets"]

    @property
    def bucketing_strategy(self) -> BucketingStrategy:
        """
        The bucketing strategy to use.
        """
        # The stored value is normalized back to the enum on every read.
        stored_strategy = self["bucketingStrategy"]
        return BucketingStrategy(stored_strategy)
|
68
|
+
|
69
|
+
|
70
|
+
class TruncateTransformParameters(TransformParameters):
    """
    Parameters for the truncate transform.
    """

    @staticmethod
    def of(width: int) -> TruncateTransformParameters:
        """
        Build truncate transform parameters for the given width.
        """
        params = TruncateTransformParameters()
        params["width"] = width
        return params

    @property
    def width(self) -> int:
        """
        The width to truncate the field to.
        """
        return self["width"]
|
102
87
|
|
103
88
|
|
104
89
|
class Transform(dict):
    """
    A transform represents how a particular column value can be
    transformed into a new value. For example, transforms may be used
    to determine partition or sort values for table records.
    """

    @property
    def name(self) -> TransformName:
        """
        The transform name stored under the "name" key, as a TransformName.
        """
        return TransformName(self["name"])

    @name.setter
    def name(self, name: TransformName) -> None:
        self["name"] = name

    def _parameters_override(self) -> Optional[property]:
        """
        Return the `parameters` property declared directly on the class
        registered for this transform's name, or None if that class does
        not declare one.
        """
        transform_class = NAME_TO_TRANSFORM.get(self.name)
        if transform_class is None:
            return None
        # vars() only sees attributes declared on the class itself, so a
        # subclass that merely inherits this base property is not treated
        # as an override (which would recurse forever).
        override = vars(transform_class).get("parameters")
        return override if isinstance(override, property) else None

    @property
    def parameters(self) -> Optional[TransformParameters]:
        """
        The parameters stored under the "parameters" key.

        When the class registered for this transform's name declares its
        own `parameters` property, that property's getter is applied to
        this instance. Previously the class attribute itself (a `property`
        object) was returned instead of this instance's parameters.
        """
        override = self._parameters_override()
        if override is not None:
            return override.fget(self)
        return self.get("parameters")

    @parameters.setter
    def parameters(
        self,
        parameters: Optional[TransformParameters] = None,
    ) -> None:
        # Store on this instance. Previously the value was assigned onto
        # the registered class, replacing its `parameters` property for
        # every instance of that class.
        override = self._parameters_override()
        if override is not None and override.fset is not None:
            override.fset(self, parameters)
        else:
            self["parameters"] = parameters
|
114
|
+
|
115
|
+
|
116
|
+
class BucketTransform(Transform):
    """
    A transform that hashes field values into a fixed number of buckets.
    """

    @staticmethod
    def of(parameters: BucketTransformParameters) -> BucketTransform:
        """
        Create a bucket transform with the given parameters.
        """
        transform = BucketTransform()
        transform.name = TransformName.BUCKET
        transform.parameters = parameters
        return transform

    @property
    def parameters(self) -> Optional[BucketTransformParameters]:
        """
        The bucket transform parameters, or None if none are stored.

        A plain dict stored under "parameters" is wrapped in
        BucketTransformParameters once and cached back on this transform.
        """
        val: Dict[str, Any] = self.get("parameters")
        # Check against the class itself. `BucketTransformParameters.__class__`
        # is `type`, which a parameter dict is never an instance of, so the
        # previous check re-wrapped (and copied) the value on every access.
        if val is not None and not isinstance(val, BucketTransformParameters):
            self["parameters"] = val = BucketTransformParameters(val)
        return val

    @parameters.setter
    def parameters(
        self,
        parameters: Optional[BucketTransformParameters] = None,
    ) -> None:
        self["parameters"] = parameters
|
141
|
+
|
142
|
+
|
143
|
+
class TruncateTransform(Transform):
    """
    A transform that truncates field values to a fixed width.
    """

    @staticmethod
    def of(parameters: TruncateTransformParameters) -> TruncateTransform:
        """
        Create a truncate transform with the given parameters.
        """
        truncate = TruncateTransform()
        truncate.name = TransformName.TRUNCATE
        truncate.parameters = parameters
        return truncate

    @property
    def parameters(self) -> TruncateTransformParameters:
        """
        The stored truncate parameters; a plain dict is wrapped in
        TruncateTransformParameters and cached back on first access.
        """
        stored: Dict[str, Any] = self.get("parameters")
        if stored is None or isinstance(stored, TruncateTransformParameters):
            return stored
        wrapped = TruncateTransformParameters(stored)
        self["parameters"] = wrapped
        return wrapped

    @parameters.setter
    def parameters(
        self,
        parameters: Optional[TruncateTransformParameters] = None,
    ) -> None:
        self["parameters"] = parameters
|
168
|
+
|
169
|
+
|
170
|
+
class IdentityTransform(Transform):
    """
    A no-op transform that returns unmodified field values.
    """

    @staticmethod
    def of() -> IdentityTransform:
        """
        Create a transform named TransformName.IDENTITY.
        """
        identity = IdentityTransform()
        identity.name = TransformName.IDENTITY
        return identity
|
180
|
+
|
181
|
+
|
182
|
+
class HourTransform(Transform):
    """
    A transform that returns the hour of a datetime value.
    """

    @staticmethod
    def of() -> HourTransform:
        """
        Create a transform named TransformName.HOUR.
        """
        hour = HourTransform()
        hour.name = TransformName.HOUR
        return hour
|
192
|
+
|
193
|
+
|
194
|
+
class DayTransform(Transform):
    """
    A transform that returns the day of a datetime value.
    """

    @staticmethod
    def of() -> DayTransform:
        """
        Create a transform named TransformName.DAY.
        """
        day = DayTransform()
        day.name = TransformName.DAY
        return day
|
204
|
+
|
205
|
+
|
206
|
+
class MonthTransform(Transform):
    """
    A transform that returns the month of a datetime value.
    """

    @staticmethod
    def of() -> MonthTransform:
        """
        Create a transform named TransformName.MONTH.
        """
        month = MonthTransform()
        month.name = TransformName.MONTH
        return month
|
216
|
+
|
217
|
+
|
218
|
+
class YearTransform(Transform):
    """
    A transform that returns the year of a datetime value.
    """

    @staticmethod
    def of() -> YearTransform:
        """
        Create a transform named TransformName.YEAR.
        """
        year = YearTransform()
        year.name = TransformName.YEAR
        return year
|
228
|
+
|
229
|
+
|
230
|
+
class VoidTransform(Transform):
    """
    A transform that coerces all field values to None.
    """

    @staticmethod
    def of() -> VoidTransform:
        """
        Create a transform named TransformName.VOID.
        """
        void = VoidTransform()
        void.name = TransformName.VOID
        return void
|
240
|
+
|
241
|
+
|
242
|
+
class UnknownTransform(Transform):
    """
    An unknown or invalid transform.
    """

    @staticmethod
    def of() -> UnknownTransform:
        """
        Create a transform named TransformName.UNKNOWN.
        """
        unknown = UnknownTransform()
        unknown.name = TransformName.UNKNOWN
        return unknown
|
252
|
+
|
253
|
+
|
254
|
+
# Maps each transform name to the Transform subclass that implements it.
# The values are classes, not instances, hence `type[Transform]`. The
# annotation is never evaluated at runtime because this module uses
# `from __future__ import annotations`.
NAME_TO_TRANSFORM: Dict[TransformName, type[Transform]] = {
    TransformName.IDENTITY: IdentityTransform,
    TransformName.BUCKET: BucketTransform,
    TransformName.YEAR: YearTransform,
    TransformName.MONTH: MonthTransform,
    TransformName.DAY: DayTransform,
    TransformName.HOUR: HourTransform,
    TransformName.TRUNCATE: TruncateTransform,
    TransformName.VOID: VoidTransform,
    TransformName.UNKNOWN: UnknownTransform,
}
|
deltacat/storage/model/types.py
CHANGED
@@ -1,25 +1,100 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
from enum import Enum
|
2
4
|
from typing import List, Union
|
3
5
|
|
4
|
-
from pyarrow.parquet import ParquetFile
|
5
6
|
import numpy as np
|
6
7
|
import pandas as pd
|
7
8
|
import pyarrow as pa
|
8
9
|
from ray.data.dataset import Dataset
|
9
10
|
from daft import DataFrame as DaftDataFrame
|
10
11
|
|
11
|
-
|
12
|
+
|
13
|
+
LocalTable = Union[
|
14
|
+
pa.Table,
|
15
|
+
pd.DataFrame,
|
16
|
+
np.ndarray,
|
17
|
+
pa.parquet.ParquetFile,
|
18
|
+
]
|
12
19
|
LocalDataset = List[LocalTable]
|
13
20
|
DistributedDataset = Union[Dataset, DaftDataFrame]
|
14
21
|
|
15
22
|
|
23
|
+
class StreamFormat(str, Enum):
    """
    String-valued identifiers for stream formats.
    """

    DELTACAT = "deltacat"
    ICEBERG = "iceberg"
    HUDI = "hudi"
    DELTA_LAKE = "delta_lake"
    # Only used by tests; note the upper-case value.
    SQLITE3 = "SQLITE3"
|
29
|
+
|
30
|
+
|
16
31
|
class DeltaType(str, Enum):
    """
    String-valued identifiers for delta types.
    """

    APPEND = "append"
    UPSERT = "upsert"
    DELETE = "delete"
|
20
35
|
|
21
36
|
|
37
|
+
class TransactionType(str, Enum):
    """
    String-valued identifiers for transaction types.
    """

    # Reads existing data.
    # Never conflicts with any other transaction type.
    READ = "read"

    # Only appends new data.
    # Conflicts with other transaction types can be auto-resolved.
    APPEND = "append"

    # Alters existing data (even if it also appends data).
    # Conflicts with other alters/overwrites/restates/deletes fail.
    ALTER = "alter"

    # Overwrites existing data (even if it also appends or alters data).
    # Conflicts with other alters/overwrites/restates/deletes fail.
    OVERWRITE = "overwrite"

    # Restates existing data with a new layout (even if it appends,
    # alters, or overwrites data to do so).
    # Conflicts with other alters/overwrites/restates/deletes fail.
    RESTATE = "restate"

    # Deletes existing data (even if it also appends, alters, overwrites,
    # or restates data).
    # Conflicts with other alters/overwrites/restates/deletes fail.
    DELETE = "delete"
|
60
|
+
|
61
|
+
|
62
|
+
class TransactionOperationType(str, Enum):
    """
    String-valued identifiers for transaction operations, grouped into
    write operations and read operations.
    """

    CREATE = "create"
    UPDATE = "update"
    DELETE = "delete"

    READ_SIBLINGS = "read_siblings"
    READ_CHILDREN = "read_children"
    READ_LATEST = "read_latest"
    READ_EXISTS = "read_exists"

    @staticmethod
    def write_operations():
        """
        Return the set of operation types classified as writes.
        """
        return {
            TransactionOperationType.CREATE,
            TransactionOperationType.UPDATE,
            TransactionOperationType.DELETE,
        }

    @staticmethod
    def read_operations():
        """
        Return the set of operation types classified as reads.
        """
        return {
            TransactionOperationType.READ_SIBLINGS,
            TransactionOperationType.READ_CHILDREN,
            TransactionOperationType.READ_LATEST,
            TransactionOperationType.READ_EXISTS,
        }

    def is_write_operation(self) -> bool:
        """
        Return True if this operation type is one of write_operations().
        """
        return self in TransactionOperationType.write_operations()

    def is_read_operation(self) -> bool:
        """
        Return True if this operation type is one of read_operations().
        """
        # Previously called the misspelled `read_operatins()`, which raised
        # AttributeError on every call.
        return self in TransactionOperationType.read_operations()
|
94
|
+
|
95
|
+
|
22
96
|
class LifecycleState(str, Enum):
|
97
|
+
CREATED = "created"
|
23
98
|
UNRELEASED = "unreleased"
|
24
99
|
ACTIVE = "active"
|
25
100
|
DEPRECATED = "deprecated"
|
@@ -35,22 +110,45 @@ class CommitState(str, Enum):
|
|
35
110
|
|
36
111
|
class SchemaConsistencyType(str, Enum):
    """
    Field-level data consistency policies. DeltaCAT table schemas can be
    used to inform the consistency checks run for each field; when a
    schema is present, it can enforce one of these policies at table load
    time:

    NONE: No consistency checks are run.

    COERCE: Coerce fields to fit the schema whenever possible.

    VALIDATE: Raise an error for any fields that don't fit the schema.
    """

    NONE = "none"
    COERCE = "coerce"
    VALIDATE = "validate"
|
127
|
+
|
128
|
+
|
129
|
+
class SortOrder(str, Enum):
    """
    Sort direction. Besides its own values, lookup also accepts the
    "asc"/"desc" spellings (case-insensitive).
    """

    ASCENDING = "ascending"
    DESCENDING = "descending"

    @classmethod
    def _missing_(cls, value: str):
        """
        Resolve alternate spellings for values that match no member.

        Returning None makes Enum raise its usual ValueError.
        """
        # Non-string lookups (e.g. SortOrder(1)) have no `.lower()`; let
        # Enum report them as ValueError rather than leaking AttributeError.
        if not isinstance(value, str):
            return None
        # pyiceberg.table.sorting.SortDirection mappings
        aliases = {
            "asc": SortOrder.ASCENDING,
            "desc": SortOrder.DESCENDING,
        }
        return aliases.get(value.lower())
|
141
|
+
|
142
|
+
|
143
|
+
class NullOrder(str, Enum):
    """
    Placement of nulls in a sort. Besides its own values, lookup also
    accepts the "nulls-first"/"nulls-last" spellings (case-insensitive).
    """

    AT_START = "at_start"
    AT_END = "at_end"

    @classmethod
    def _missing_(cls, value: str):
        """
        Resolve alternate spellings for values that match no member.

        Returning None makes Enum raise its usual ValueError.
        """
        # Non-string lookups (e.g. NullOrder(1)) have no `.lower()`; let
        # Enum report them as ValueError rather than leaking AttributeError.
        if not isinstance(value, str):
            return None
        # pyiceberg.table.sorting.NullOrder mappings
        aliases = {
            "nulls-first": NullOrder.AT_START,
            "nulls-last": NullOrder.AT_END,
        }
        return aliases.get(value.lower())
|
File without changes
|
@@ -0,0 +1,75 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Iterator, List, Any
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from deltacat.storage.rivulet.metastore.sst import SSTableRow
|
6
|
+
from deltacat.storage.rivulet import Schema
|
7
|
+
from deltacat.storage.rivulet.serializer import DataSerializer, MEMTABLE_DATA
|
8
|
+
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
9
|
+
|
10
|
+
|
11
|
+
class ArrowSerializer(DataSerializer, ABC):
    """
    Utility class which can serialize data by first converting to arrow as intermediate format
    and then using the provided serialization function
    """

    def __init__(self, file_provider: FileProvider, schema: Schema):
        self.schema = schema
        self.file_provider = file_provider
        # Arrow form of the schema, computed once and reused when
        # converting record batches.
        self.arrow_schema = self.schema.to_pyarrow()

    @abstractmethod
    def serialize(self, table: pa.Table) -> List[SSTableRow]:
        """
        Write an Arrow table to the specified output file

        :param table: PyArrow table to write
        :return: SSTable rows describing the written data
        """
        pass

    def _to_arrow_table(self, sorted_records: MEMTABLE_DATA) -> pa.Table:
        """
        Convert input records to an Arrow table

        :param sorted_records: a PyArrow table, or a list/iterator of records
        :return: the input table unchanged, or a new table built from the
            records using this serializer's Arrow schema
        :raises ValueError: if the input is of an unsupported type
        """
        if isinstance(sorted_records, pa.Table):
            return sorted_records
        if isinstance(sorted_records, (Iterator, List)):
            # Materialize first: from_pylist expects a list and may walk it
            # more than once, which a one-shot iterator cannot support.
            return pa.Table.from_pylist(
                list(sorted_records), schema=self.arrow_schema
            )
        raise ValueError(f"Unsupported record type: {type(sorted_records)}")

    def _get_min_max_key(self, table: pa.Table) -> (Any, Any):
        """
        Get min and max values for the merge key from the table

        Reads the first and last values of the merge key column, so the
        table is expected to be sorted by merge key and non-empty.
        """
        key_col = table[self.schema.get_merge_key()]
        return key_col[0].as_py(), key_col[len(key_col) - 1].as_py()

    def _serialize_records(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
        """
        Shared implementation of flush_batch and write: convert the records
        to an Arrow table and serialize it, skipping empty input.
        """
        if not sorted_records:
            return []

        table = self._to_arrow_table(sorted_records)
        # An iterator is always truthy, so an empty one gets past the check
        # above; catch it here instead of serializing an empty table.
        if table.num_rows == 0:
            return []
        return self.serialize(table)

    def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
        """
        Write records to new parquet file as row group
        For now, we will depend on pyarrow to write to parquet

        :param sorted_records: record batch in SORTED ORDER
        :return: metadata for constructing SSTable
        """
        return self._serialize_records(sorted_records)

    def write(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
        """
        Write records using the provided serialization function

        :param sorted_records: record batch in SORTED ORDER
        :return: metadata for constructing SSTable
        """
        return self._serialize_records(sorted_records)
|