deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,23 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from typing import Any, Dict, Optional
|
4
|
+
from typing import Any, Dict, Optional, List
|
5
5
|
|
6
|
-
from deltacat.storage.model.
|
6
|
+
from deltacat.storage.model.metafile import Metafile
|
7
|
+
from deltacat.storage.model.locator import Locator, LocatorName
|
7
8
|
|
9
|
+
NamespaceProperties = dict[str, Any]
|
8
10
|
|
9
|
-
|
11
|
+
|
12
|
+
class Namespace(Metafile):
|
10
13
|
@staticmethod
|
11
14
|
def of(
|
12
|
-
locator: Optional[NamespaceLocator],
|
15
|
+
locator: Optional[NamespaceLocator],
|
16
|
+
properties: Optional[NamespaceProperties] = None,
|
13
17
|
) -> Namespace:
|
14
18
|
namespace = Namespace()
|
15
19
|
namespace.locator = locator
|
16
|
-
namespace.
|
20
|
+
namespace.properties = properties
|
17
21
|
return namespace
|
18
22
|
|
19
23
|
@property
|
@@ -35,12 +39,24 @@ class Namespace(dict):
|
|
35
39
|
return None
|
36
40
|
|
37
41
|
@property
|
38
|
-
def
|
39
|
-
return self.get("
|
42
|
+
def properties(self) -> Optional[NamespaceProperties]:
|
43
|
+
return self.get("properties")
|
44
|
+
|
45
|
+
@properties.setter
|
46
|
+
def properties(self, properties: Optional[NamespaceProperties]) -> None:
|
47
|
+
self["properties"] = properties
|
48
|
+
|
49
|
+
|
50
|
+
class NamespaceLocatorName(LocatorName):
|
51
|
+
def __init__(self, locator: NamespaceLocator):
    """Keep a reference to the namespace locator this name is derived from."""
    self.locator = locator
|
53
|
+
|
54
|
+
@property
def immutable_id(self) -> Optional[str]:
    """Always ``None``; no immutable ID is exposed for a namespace name."""
    return None
|
40
57
|
|
41
|
-
|
42
|
-
|
43
|
-
self["permissions"] = permissions
|
58
|
+
def parts(self) -> List[str]:
    """Return the name parts: a one-element list holding the namespace."""
    namespace_name = self.locator.namespace
    return [namespace_name]
|
44
60
|
|
45
61
|
|
46
62
|
class NamespaceLocator(Locator, dict):
|
@@ -50,6 +66,14 @@ class NamespaceLocator(Locator, dict):
|
|
50
66
|
namespace_locator.namespace = namespace
|
51
67
|
return namespace_locator
|
52
68
|
|
69
|
+
@property
def name(self) -> NamespaceLocatorName:
    """Return a ``NamespaceLocatorName`` wrapping this locator."""
    locator_name = NamespaceLocatorName(self)
    return locator_name
|
72
|
+
|
73
|
+
@property
def parent(self) -> Optional[Locator]:
    """Always ``None``; a namespace locator reports no parent locator."""
    return None
|
76
|
+
|
53
77
|
@property
|
54
78
|
def namespace(self) -> Optional[str]:
|
55
79
|
return self.get("namespace")
|
@@ -57,11 +81,3 @@ class NamespaceLocator(Locator, dict):
|
|
57
81
|
@namespace.setter
|
58
82
|
def namespace(self, namespace: Optional[str]) -> None:
|
59
83
|
self["namespace"] = namespace
|
60
|
-
|
61
|
-
def canonical_string(self) -> str:
|
62
|
-
"""
|
63
|
-
Returns a unique string for the given locator that can be used
|
64
|
-
for equality checks (i.e. two locators are equal if they have
|
65
|
-
the same canonical string).
|
66
|
-
"""
|
67
|
-
return self.namespace
|
@@ -1,29 +1,58 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
|
-
from typing import Any, Dict, List, Optional, Union
|
4
3
|
|
4
|
+
import base64
|
5
|
+
import posixpath
|
6
|
+
|
7
|
+
import pyarrow
|
5
8
|
import pyarrow as pa
|
6
|
-
|
7
|
-
from
|
9
|
+
|
10
|
+
from typing import Any, Dict, List, Optional
|
11
|
+
|
12
|
+
from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
|
13
|
+
from deltacat.constants import METAFILE_FORMAT, METAFILE_FORMAT_JSON, TXN_DIR_NAME
|
14
|
+
from deltacat.storage.model.schema import (
|
15
|
+
FieldLocator,
|
16
|
+
Schema,
|
17
|
+
)
|
18
|
+
from deltacat.storage.model.locator import (
|
19
|
+
Locator,
|
20
|
+
LocatorName,
|
21
|
+
)
|
8
22
|
from deltacat.storage.model.namespace import NamespaceLocator
|
9
23
|
from deltacat.storage.model.stream import StreamLocator
|
10
|
-
from deltacat.storage.model.table import
|
24
|
+
from deltacat.storage.model.table import (
|
25
|
+
TableLocator,
|
26
|
+
Table,
|
27
|
+
)
|
11
28
|
from deltacat.storage.model.table_version import TableVersionLocator
|
12
|
-
from deltacat.storage.model.
|
29
|
+
from deltacat.storage.model.transform import Transform
|
30
|
+
from deltacat.storage.model.types import (
|
31
|
+
CommitState,
|
32
|
+
StreamFormat,
|
33
|
+
)
|
13
34
|
from deltacat.types.media import ContentType
|
14
35
|
|
15
36
|
|
16
|
-
|
37
|
+
"""
|
38
|
+
An ordered list of partition values. Partition values are typically derived
|
39
|
+
by applying one or more transforms to a table's fields.
|
40
|
+
"""
|
41
|
+
PartitionValues = List[Any]
|
42
|
+
UNPARTITIONED_SCHEME_ID = "deadbeef-7277-49a4-a195-fdc8ed235d42"
|
43
|
+
|
44
|
+
|
45
|
+
class Partition(Metafile):
|
17
46
|
@staticmethod
|
18
47
|
def of(
|
19
48
|
locator: Optional[PartitionLocator],
|
20
|
-
schema: Optional[
|
49
|
+
schema: Optional[Schema],
|
21
50
|
content_types: Optional[List[ContentType]],
|
22
51
|
state: Optional[CommitState] = None,
|
23
52
|
previous_stream_position: Optional[int] = None,
|
24
53
|
previous_partition_id: Optional[str] = None,
|
25
54
|
stream_position: Optional[int] = None,
|
26
|
-
|
55
|
+
partition_scheme_id: Optional[str] = None,
|
27
56
|
) -> Partition:
|
28
57
|
partition = Partition()
|
29
58
|
partition.locator = locator
|
@@ -33,7 +62,9 @@ class Partition(dict):
|
|
33
62
|
partition.previous_stream_position = previous_stream_position
|
34
63
|
partition.previous_partition_id = previous_partition_id
|
35
64
|
partition.stream_position = stream_position
|
36
|
-
partition.
|
65
|
+
partition.partition_scheme_id = (
|
66
|
+
partition_scheme_id if locator.partition_values else UNPARTITIONED_SCHEME_ID
|
67
|
+
)
|
37
68
|
return partition
|
38
69
|
|
39
70
|
@property
|
@@ -48,11 +79,18 @@ class Partition(dict):
|
|
48
79
|
self["partitionLocator"] = partition_locator
|
49
80
|
|
50
81
|
@property
|
51
|
-
def
|
52
|
-
return
|
82
|
+
def locator_alias(self) -> Optional[PartitionLocatorAlias]:
|
83
|
+
return PartitionLocatorAlias.of(self)
|
84
|
+
|
85
|
+
@property
def schema(self) -> Optional[Schema]:
    """
    Return the value stored under "schema", wrapped as a ``Schema``.

    A stored value that is not yet a ``Schema`` is wrapped once and written
    back through the setter so later reads return the same object.
    """
    raw: Dict[str, Any] = self.get("schema")
    if raw is None or isinstance(raw, Schema):
        return raw
    wrapped = Schema(raw)
    self.schema = wrapped
    return wrapped
|
53
91
|
|
54
92
|
@schema.setter
|
55
|
-
def schema(self, schema: Optional[
|
93
|
+
def schema(self, schema: Optional[Schema]) -> None:
|
56
94
|
self["schema"] = schema
|
57
95
|
|
58
96
|
@property
|
@@ -104,12 +142,12 @@ class Partition(dict):
|
|
104
142
|
self["streamPosition"] = stream_position
|
105
143
|
|
106
144
|
@property
|
107
|
-
def
|
108
|
-
return self.get("
|
145
|
+
def partition_scheme_id(self) -> Optional[str]:
|
146
|
+
return self.get("partitionSchemeId")
|
109
147
|
|
110
|
-
@
|
111
|
-
def
|
112
|
-
self["
|
148
|
+
@partition_scheme_id.setter
|
149
|
+
def partition_scheme_id(self, partition_scheme_id: Optional[str]) -> None:
|
150
|
+
self["partitionSchemeId"] = partition_scheme_id
|
113
151
|
|
114
152
|
@property
|
115
153
|
def partition_id(self) -> Optional[str]:
|
@@ -125,6 +163,13 @@ class Partition(dict):
|
|
125
163
|
return partition_locator.stream_id
|
126
164
|
return None
|
127
165
|
|
166
|
+
@property
def stream_format(self) -> Optional[str]:
    """Return the locator's stream format, or ``None`` without a locator."""
    locator = self.locator
    return locator.stream_format if locator else None
|
172
|
+
|
128
173
|
@property
|
129
174
|
def partition_values(self) -> Optional[PartitionValues]:
|
130
175
|
partition_locator = self.locator
|
@@ -163,7 +208,7 @@ class Partition(dict):
|
|
163
208
|
def storage_type(self) -> Optional[str]:
|
164
209
|
partition_locator = self.locator
|
165
210
|
if partition_locator:
|
166
|
-
return partition_locator.
|
211
|
+
return partition_locator.stream_format
|
167
212
|
return None
|
168
213
|
|
169
214
|
@property
|
@@ -193,6 +238,84 @@ class Partition(dict):
|
|
193
238
|
content_type in supported_content_types
|
194
239
|
)
|
195
240
|
|
241
|
+
def to_serializable(self) -> Partition:
    """
    Build the form of this partition that is written to a metafile.

    The schema is replaced by its serialized bytes (base64 text when the
    metafile format is JSON), and the table locator is replaced by one whose
    namespace and table name are both this partition's ``id``.
    """
    # NOTE(review): update_for presumably returns a separate object so that
    # `self` is not modified -- confirm against Metafile.update_for.
    serializable: Partition = Partition.update_for(self)

    schema = serializable.schema
    if schema:
        schema_bytes = schema.serialize().to_pybytes()
        if METAFILE_FORMAT == METAFILE_FORMAT_JSON:
            # JSON cannot carry raw bytes, so store base64 text instead
            serializable.schema = base64.b64encode(schema_bytes).decode("utf-8")
        else:
            serializable.schema = schema_bytes

    if serializable.table_locator:
        # replace the mutable table locator
        placeholder_locator = TableLocator.at(
            namespace=self.id,
            table_name=self.id,
        )
        serializable.table_version_locator.table_locator = placeholder_locator
    return serializable
|
258
|
+
|
259
|
+
def from_serializable(
    self,
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> Partition:
    """
    Restore this partition in place from its serialized metafile form.

    Decodes the stored schema (base64 text when the metafile format is JSON,
    raw bytes otherwise) and, when the table locator's table name equals this
    partition's ``id``, reloads the real table locator from the latest
    revision of the table metafile found relative to ``path``.

    :param path: Path of the metafile this partition was read from.
    :param filesystem: Filesystem passed through to the revision lookup and
        table read.
    :return: This partition.
    """
    schema_data = self.get("schema")
    if schema_data:
        if METAFILE_FORMAT == METAFILE_FORMAT_JSON:
            schema_bytes = base64.b64decode(schema_data)
        else:
            schema_bytes = schema_data
        self["schema"] = Schema.deserialize(pa.py_buffer(schema_bytes))
    else:
        self["schema"] = None

    # restore the table locator from its mapped immutable metafile ID
    if self.table_locator and self.table_locator.table_name == self.id:
        parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
            base_metafile_path=path,
            parent_number=3,
        )
        # the transaction log directory sits three directory levels above
        # the table's revision directory
        txn_parent_dir = parent_rev_dir_path
        for _ in range(3):
            txn_parent_dir = posixpath.dirname(txn_parent_dir)
        txn_log_dir = posixpath.join(txn_parent_dir, TXN_DIR_NAME)
        latest_revision = MetafileRevisionInfo.latest_revision(
            revision_dir_path=parent_rev_dir_path,
            filesystem=filesystem,
            success_txn_log_dir=txn_log_dir,
        )
        table = Table.read(latest_revision.path, filesystem)
        self.table_version_locator.table_locator = table.locator
    return self
|
299
|
+
|
300
|
+
|
301
|
+
class PartitionLocatorName(LocatorName):
    """Name view over a ``PartitionLocator``."""

    def __init__(self, locator: PartitionLocator):
        """Keep a reference to the partition locator being named."""
        self.locator = locator

    @property
    def immutable_id(self) -> Optional[str]:
        """The wrapped locator's partition ID."""
        return self.locator.partition_id

    @immutable_id.setter
    def immutable_id(self, immutable_id: Optional[str]):
        """Write the given ID through to the wrapped locator's partition ID."""
        self.locator.partition_id = immutable_id

    def parts(self) -> List[str]:
        """Return ``[str(partition_values), partition_id]`` for the locator."""
        values_text = str(self.locator.partition_values)
        partition_id = self.locator.partition_id
        return [values_text, partition_id]
|
318
|
+
|
196
319
|
|
197
320
|
class PartitionLocator(Locator, dict):
|
198
321
|
@staticmethod
|
@@ -223,16 +346,20 @@ class PartitionLocator(Locator, dict):
|
|
223
346
|
table_name: Optional[str],
|
224
347
|
table_version: Optional[str],
|
225
348
|
stream_id: Optional[str],
|
226
|
-
|
349
|
+
stream_format: Optional[StreamFormat],
|
227
350
|
partition_values: Optional[PartitionValues],
|
228
351
|
partition_id: Optional[str],
|
229
352
|
) -> PartitionLocator:
|
230
|
-
stream_locator =
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
353
|
+
stream_locator = (
|
354
|
+
StreamLocator.at(
|
355
|
+
namespace,
|
356
|
+
table_name,
|
357
|
+
table_version,
|
358
|
+
stream_id,
|
359
|
+
stream_format,
|
360
|
+
)
|
361
|
+
if stream_id and stream_format
|
362
|
+
else None
|
236
363
|
)
|
237
364
|
return PartitionLocator.of(
|
238
365
|
stream_locator,
|
@@ -240,6 +367,14 @@ class PartitionLocator(Locator, dict):
|
|
240
367
|
partition_id,
|
241
368
|
)
|
242
369
|
|
370
|
+
@property
def name(self) -> PartitionLocatorName:
    """Return a ``PartitionLocatorName`` wrapping this locator."""
    locator_name = PartitionLocatorName(self)
    return locator_name
|
373
|
+
|
374
|
+
@property
def parent(self) -> Optional[StreamLocator]:
    """Return this locator's stream locator as its parent."""
    stream_locator = self.stream_locator
    return stream_locator
|
377
|
+
|
243
378
|
@property
|
244
379
|
def stream_locator(self) -> Optional[StreamLocator]:
|
245
380
|
val: Dict[str, Any] = self.get("streamLocator")
|
@@ -296,10 +431,10 @@ class PartitionLocator(Locator, dict):
|
|
296
431
|
return None
|
297
432
|
|
298
433
|
@property
|
299
|
-
def
|
434
|
+
def stream_format(self) -> Optional[str]:
|
300
435
|
stream_locator = self.stream_locator
|
301
436
|
if stream_locator:
|
302
|
-
return stream_locator.
|
437
|
+
return stream_locator.format
|
303
438
|
return None
|
304
439
|
|
305
440
|
@property
|
@@ -323,13 +458,203 @@ class PartitionLocator(Locator, dict):
|
|
323
458
|
return stream_locator.table_version
|
324
459
|
return None
|
325
460
|
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
461
|
+
|
462
|
+
class PartitionKey(dict):
    """
    Dictionary-backed description of a single partition key.

    Holds the field locators the key is built from ("key"), plus an optional
    name ("name"), field ID ("fieldId"), transform ("transform") and native
    object ("nativeObject").
    """

    @staticmethod
    def of(
        key: List[FieldLocator],
        name: Optional[str] = None,
        field_id: Optional[int] = None,
        transform: Optional[Transform] = None,
        native_object: Optional[Any] = None,
    ) -> PartitionKey:
        """Create a partition key from its component values."""
        return PartitionKey(
            {
                "key": key,
                "name": name,
                "fieldId": field_id,
                "transform": transform,
                "nativeObject": native_object,
            }
        )

    def equivalent_to(
        self,
        other: PartitionKey,
        check_identifiers: bool = False,
    ) -> bool:
        """
        Return whether ``other`` describes the same partition key.

        Two keys are equivalent when their field locators and transforms are
        equal. When ``check_identifiers`` is true, their names and IDs must
        also be equal.

        :param other: Partition key (or plain dict form of one) to compare.
        :param check_identifiers: Also require equal name and ID.
        :return: True if the keys are equivalent, False otherwise (including
            when ``other`` is None or not a dict).
        """
        if other is None or not isinstance(other, dict):
            return False
        if not isinstance(other, PartitionKey):
            other = PartitionKey(other)
        if not (self.key == other.key and self.transform == other.transform):
            return False
        # The identifier check is only an extra requirement; it must never
        # be able to make keys with different fields or transforms compare
        # as equivalent.
        return not check_identifiers or (
            self.name == other.name and self.id == other.id
        )

    @property
    def key(self) -> List[FieldLocator]:
        """Field locators stored under "key"."""
        return self.get("key")

    @property
    def name(self) -> Optional[str]:
        """Name stored under "name"."""
        return self.get("name")

    @property
    def id(self) -> Optional[int]:
        """Field ID stored under "fieldId"."""
        return self.get("fieldId")

    @property
    def transform(self) -> Optional[Transform]:
        """
        Transform stored under "transform", wrapped as a ``Transform``.

        A stored value that is not yet a ``Transform`` is wrapped once and
        cached back into the dictionary.
        """
        val: Dict[str, Any] = self.get("transform")
        if val is not None and not isinstance(val, Transform):
            self["transform"] = val = Transform(val)
        return val

    @property
    def native_object(self) -> Optional[Any]:
        """Value stored under "nativeObject"."""
        return self.get("nativeObject")
|
521
|
+
|
522
|
+
|
523
|
+
class PartitionKeyList(List[PartitionKey]):
    """List whose elements are read back as ``PartitionKey`` objects."""

    @staticmethod
    def of(items: List[PartitionKey]) -> PartitionKeyList:
        """
        Build a typed list from ``items``.

        Elements that are not already ``PartitionKey`` instances are wrapped;
        ``None`` elements are kept as-is.
        """
        typed_items = PartitionKeyList()
        for item in items:
            if item is not None and not isinstance(item, PartitionKey):
                item = PartitionKey(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        """
        Return the element at an index, or a typed list for a slice.

        An indexed element that is not yet a ``PartitionKey`` is wrapped and
        stored back so later reads return the same object.
        """
        val = super().__getitem__(item)
        if isinstance(item, slice):
            # A slice yields a plain list of elements; wrapping that list in
            # a single PartitionKey would be wrong, so wrap each element.
            return PartitionKeyList.of(val)
        if val is not None and not isinstance(val, PartitionKey):
            self[item] = val = PartitionKey(val)
        return val
|
538
|
+
|
539
|
+
|
540
|
+
class PartitionScheme(dict):
    """
    Dictionary-backed partition scheme: an ordered list of partition keys
    ("keys") with an optional name ("name"), ID ("id") and native object
    ("nativeObject").

    Note that the ``keys`` property defined here replaces ``dict.keys`` on
    instances of this class.
    """

    @staticmethod
    def of(
        keys: Optional[PartitionKeyList],
        name: Optional[str] = None,
        scheme_id: Optional[str] = None,
        native_object: Optional[Any] = None,
    ) -> PartitionScheme:
        """Create a partition scheme from its component values."""
        return PartitionScheme(
            {
                "keys": keys,
                "name": name,
                "id": scheme_id,
                "nativeObject": native_object,
            }
        )

    def equivalent_to(
        self,
        other: PartitionScheme,
        check_identifiers: bool = False,
    ) -> bool:
        """
        Return whether ``other`` describes the same partition scheme.

        Two schemes are equivalent when they have the same number of keys and
        each pair of keys at the same position is equivalent. Missing keys
        (``None``) are treated as an empty key list. When
        ``check_identifiers`` is true, the scheme names and IDs must also be
        equal.

        :param other: Partition scheme (or plain dict form of one) to compare.
        :param check_identifiers: Also require equal names and IDs, for the
            schemes and for each of their keys.
        :return: True if the schemes are equivalent, False otherwise
            (including when ``other`` is None or not a dict).
        """
        if other is None or not isinstance(other, dict):
            return False
        if not isinstance(other, PartitionScheme):
            other = PartitionScheme(other)
        own_keys = self.keys or []
        other_keys = other.keys or []
        # Schemes with different key counts can never be equivalent; this
        # check also keeps the positional comparison below in bounds.
        if len(own_keys) != len(other_keys):
            return False
        # Index access (rather than iteration) goes through
        # PartitionKeyList.__getitem__, which wraps raw dict elements.
        for i in range(len(own_keys)):
            if not own_keys[i].equivalent_to(other_keys[i], check_identifiers):
                return False
        return not check_identifiers or (
            self.name == other.name and self.id == other.id
        )

    @property
    def keys(self) -> Optional[PartitionKeyList]:
        """
        Partition keys stored under "keys", as a ``PartitionKeyList``.

        A stored value that is not yet a ``PartitionKeyList`` is converted
        once and cached back into the dictionary.
        """
        val: List[PartitionKey] = self.get("keys")
        if val is not None and not isinstance(val, PartitionKeyList):
            self["keys"] = val = PartitionKeyList.of(val)
        return val

    @property
    def name(self) -> Optional[str]:
        """Name stored under "name"."""
        return self.get("name")

    @property
    def id(self) -> Optional[str]:
        """Scheme ID stored under "id"."""
        return self.get("id")

    @property
    def native_object(self) -> Optional[Any]:
        """Value stored under "nativeObject"."""
        return self.get("nativeObject")
|
593
|
+
|
594
|
+
|
595
|
+
class PartitionSchemeList(List[PartitionScheme]):
    """List whose elements are read back as ``PartitionScheme`` objects."""

    @staticmethod
    def of(items: List[PartitionScheme]) -> PartitionSchemeList:
        """
        Build a typed list from ``items``.

        Elements that are not already ``PartitionScheme`` instances are
        wrapped; ``None`` elements are kept as-is.
        """
        typed_items = PartitionSchemeList()
        for item in items:
            if item is not None and not isinstance(item, PartitionScheme):
                item = PartitionScheme(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        """
        Return the element at an index, or a typed list for a slice.

        An indexed element that is not yet a ``PartitionScheme`` is wrapped
        and stored back so later reads return the same object.
        """
        val = super().__getitem__(item)
        if isinstance(item, slice):
            # A slice yields a plain list of elements; wrapping that list in
            # a single PartitionScheme would be wrong, so wrap each element.
            return PartitionSchemeList.of(val)
        if val is not None and not isinstance(val, PartitionScheme):
            self[item] = val = PartitionScheme(val)
        return val
|
610
|
+
|
611
|
+
|
612
|
+
class PartitionLocatorAliasName(LocatorName):
    """Name view over a ``PartitionLocatorAlias``."""

    def __init__(self, locator: PartitionLocatorAlias):
        """Keep a reference to the alias locator being named."""
        self.locator = locator

    @property
    def immutable_id(self) -> Optional[str]:
        """Always ``None``; an alias name exposes no immutable ID."""
        return None

    def parts(self) -> List[str]:
        """Return ``[str(partition_values), partition_scheme_id]``."""
        values_text = str(self.locator.partition_values)
        scheme_id = self.locator.partition_scheme_id
        return [values_text, scheme_id]
|
625
|
+
|
626
|
+
|
627
|
+
class PartitionLocatorAlias(Locator, dict):
    """
    Dictionary-backed locator identifying a partition by its partition values
    and partition scheme ID, under the parent of the partition's own locator.
    """

    @staticmethod
    def of(parent_partition: Partition):
        """
        Build the alias locator for ``parent_partition``.

        Returns ``None`` unless the partition's state is
        ``CommitState.COMMITTED``.
        """
        if parent_partition.state == CommitState.COMMITTED:
            alias_fields = {
                "partition_values": parent_partition.partition_values,
                "partition_scheme_id": parent_partition.partition_scheme_id,
                "parent": (
                    parent_partition.locator.parent
                    if parent_partition.locator
                    else None
                ),
            }
            return PartitionLocatorAlias(alias_fields)
        # only committed partitions can be resolved by alias
        return None

    @property
    def partition_values(self) -> Optional[PartitionValues]:
        """Value stored under "partition_values"."""
        return self.get("partition_values")

    @property
    def partition_scheme_id(self) -> Optional[str]:
        """Value stored under "partition_scheme_id"."""
        return self.get("partition_scheme_id")

    @property
    def name(self) -> PartitionLocatorAliasName:
        """Return a ``PartitionLocatorAliasName`` wrapping this locator."""
        return PartitionLocatorAliasName(self)

    @property
    def parent(self) -> Optional[Locator]:
        """Value stored under "parent"."""
        return self.get("parent")
|
File without changes
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class RowFilter:
    """Placeholder type for the row filter carried by ``Pushdown``."""
|
3
|
+
|
4
|
+
|
5
|
+
class ColumnFilter:
    """Placeholder type for the column filter carried by ``Pushdown``."""
|
7
|
+
|
8
|
+
|
9
|
+
class PartitionFilter:
    """Placeholder type for the partition filter carried by ``Pushdown``."""
|
11
|
+
|
12
|
+
|
13
|
+
class Pushdown:
    """Represents pushdown predicates to be applied for DeltaCAT Tables"""

    # Declared attributes only; no defaults or constructor are defined here.
    row_filter: RowFilter
    column_filter: ColumnFilter
    partition_filter: PartitionFilter
    limit: int
|
@@ -0,0 +1,34 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from dataclasses import dataclass
|
3
|
+
|
4
|
+
|
5
|
+
@dataclass
class DataFile:
    """A single data file (for example an S3 object or a local file)."""

    # Location of the file.
    file_path: str
|
10
|
+
|
11
|
+
|
12
|
+
class ScanTask(ABC):
    """Abstract base for a unit of data to be read by a compute worker."""

    @abstractmethod
    def data_files(self) -> list[DataFile]:
        """Return the data files for this task; subclasses must override."""
|
18
|
+
|
19
|
+
|
20
|
+
@dataclass
class FileScanTask(ScanTask):
    """Scan task whose unit of data is an explicit list of data files."""

    # Files returned by data_files().
    data_file_list: list[DataFile]

    def data_files(self) -> list[DataFile]:
        """Return the stored list of data files."""
        files = self.data_file_list
        return files
|
28
|
+
|
29
|
+
|
30
|
+
class ShardedScanTask(ScanTask):
    """A unit of data in the form of shards (e.g. shard 1-10 each represents 1/10 of all data in a Table)"""

    def data_files(self) -> list[DataFile]:
        """Not supported for sharded tasks; always raises."""
        message = "data_files is not implemented for ShardedScanTask"
        raise NotImplementedError(message)
|