deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,97 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import logging
|
3
|
-
from typing import Dict, Any
|
4
|
-
from deltacat import logs
|
5
|
-
from deltacat.compute.compactor import RoundCompletionInfo
|
6
|
-
from deltacat.storage import PartitionLocator
|
7
|
-
from deltacat.aws import s3u as s3_utils
|
8
|
-
from typing import Optional
|
9
|
-
from deltacat.utils.metrics import metrics
|
10
|
-
|
11
|
-
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
12
|
-
|
13
|
-
|
14
|
-
def get_round_completion_file_s3_url(
    bucket: str,
    source_partition_locator: PartitionLocator,
    destination_partition_locator: Optional[PartitionLocator] = None,
) -> str:
    """
    Build the S3 URL of the round completion file for a source partition.

    Without a destination locator, the URL is the source locator's path
    rooted at ``s3://{bucket}``. With a destination locator, the URL is the
    destination locator's path rooted at ``s3://{bucket}/{source hexdigest}``.

    Args:
        bucket: Name of the S3 bucket holding round completion files.
        source_partition_locator: Locator of the source partition.
        destination_partition_locator: Optional locator of the destination
            partition; when truthy it determines the final path.

    Returns:
        The computed path with a ``.json`` suffix appended.
    """
    bucket_root = f"s3://{bucket}"
    # The source-only path is always computed first and then replaced when a
    # destination locator is supplied.
    rcf_path = source_partition_locator.path(bucket_root)
    if destination_partition_locator:
        nested_root = f"{bucket_root}/{source_partition_locator.hexdigest()}"
        rcf_path = destination_partition_locator.path(nested_root)

    return f"{rcf_path}.json"
|
27
|
-
|
28
|
-
|
29
|
-
@metrics
def read_round_completion_file(
    bucket: str,
    source_partition_locator: PartitionLocator,
    destination_partition_locator: Optional[PartitionLocator] = None,
    **s3_client_kwargs: Optional[Dict[str, Any]],
) -> RoundCompletionInfo:
    """
    Read the round completion info stored for a source partition.

    Candidate URLs are tried in order and the first one that yields a
    download result wins:

    1. The destination-qualified URL (only when a destination locator is given).
    2. The source-only URL, kept for backward compatibility.

    Args:
        bucket: Name of the S3 bucket holding round completion files.
        source_partition_locator: Locator of the source partition.
        destination_partition_locator: Optional destination partition locator.
        **s3_client_kwargs: Forwarded unchanged to ``s3_utils.download``.

    Returns:
        The parsed ``RoundCompletionInfo``, or ``None`` when no candidate URL
        produced a download result.
    """
    candidate_urls = []
    if destination_partition_locator:
        candidate_urls.append(
            get_round_completion_file_s3_url(
                bucket,
                source_partition_locator,
                destination_partition_locator,
            )
        )
    # Note: we also read the RCF from the source-only URI for backward
    # compatibility reasons.
    candidate_urls.append(
        get_round_completion_file_s3_url(bucket, source_partition_locator)
    )

    for candidate_url in candidate_urls:
        logger.info(f"Reading round completion file from: {candidate_url}")
        # NOTE(review): the positional False presumably disables
        # "fail if not found" in s3_utils.download - confirm against s3u.
        response = s3_utils.download(candidate_url, False, **s3_client_kwargs)
        if not response:
            logger.warning(f"Round completion file not present at {candidate_url}")
            continue
        payload = response["Body"].read().decode("utf-8")
        round_completion_info = RoundCompletionInfo(json.loads(payload))
        logger.info(f"Read round completion info: {round_completion_info}")
        return round_completion_info

    return None
|
69
|
-
|
70
|
-
|
71
|
-
@metrics
def write_round_completion_file(
    bucket: Optional[str],
    source_partition_locator: Optional[PartitionLocator],
    destination_partition_locator: Optional[PartitionLocator],
    round_completion_info: RoundCompletionInfo,
    completion_file_s3_url: Optional[str] = None,
    **s3_client_kwargs: Optional[Dict[str, Any]],
) -> str:
    """
    Serialize round completion info to JSON and upload it to S3.

    Args:
        bucket: Bucket used to derive the target URL when
            ``completion_file_s3_url`` is not given.
        source_partition_locator: Source partition locator used to derive the
            target URL.
        destination_partition_locator: Destination partition locator used to
            derive the target URL.
        round_completion_info: The info to serialize with ``json.dumps``.
        completion_file_s3_url: Explicit target URL; when provided it is used
            as-is and the bucket/locators are not consulted.
        **s3_client_kwargs: Forwarded unchanged to ``s3_utils.upload``.

    Returns:
        The S3 URL the round completion file was written to.

    Raises:
        AssertionError: If both ``bucket`` and ``completion_file_s3_url`` are
            ``None``.
    """
    url_not_given = completion_file_s3_url is None
    if url_not_given and bucket is None:
        raise AssertionError("Either bucket or completion_file_s3_url must be passed")

    logger.info(f"writing round completion file contents: {round_completion_info}")
    target_url = completion_file_s3_url
    if url_not_given:
        target_url = get_round_completion_file_s3_url(
            bucket,
            source_partition_locator,
            destination_partition_locator,
        )

    logger.info(f"writing round completion file to: {target_url}")
    serialized_info = str(json.dumps(round_completion_info))
    s3_utils.upload(target_url, serialized_info, **s3_client_kwargs)
    logger.info(f"round completion file written to: {target_url}")
    return target_url
|
@@ -1,40 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
|
3
|
-
from deltacat.storage.model.types import DistributedDataset
|
4
|
-
from deltacat.types.media import TableType, DistributedDatasetType
|
5
|
-
from deltacat.compute.merge_on_read.utils.delta import create_df_from_all_deltas
|
6
|
-
from deltacat import logs
|
7
|
-
|
8
|
-
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
9
|
-
|
10
|
-
|
11
|
-
def merge(params: MergeOnReadParams, **kwargs) -> DistributedDataset:
    """
    Merges the given deltas and returns the result as distributed dataframe.

    Each delta is read into its own distributed dataframe via
    ``create_df_from_all_deltas`` (requested as PyArrow tables in a Daft
    distributed dataset), and the dataframes are then concatenated in order.

    TODO(raghumdani): Perform actual merge.

    Args:
        params: Merge-on-read parameters supplying the deltas, reader kwargs,
            storage implementation and storage kwargs.
        **kwargs: Forwarded to ``create_df_from_all_deltas``.

    Returns:
        The concatenation of all delta dataframes, or ``None`` when there are
        no dataframes to concatenate.
    """
    frames = create_df_from_all_deltas(
        deltas=params.deltas,
        table_type=TableType.PYARROW,
        distributed_dataset_type=DistributedDatasetType.DAFT,
        reader_kwargs=params.reader_kwargs,
        deltacat_storage=params.deltacat_storage,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
        **kwargs,
    )

    logger.info(f"Merging {len(frames)} delta dfs...")

    # TODO: This code should be optimized from daft side
    merged = None
    for frame in frames:
        # The first frame seeds the result; later frames are appended to it.
        merged = frame if merged is None else merged.concat(frame)

    return merged
|
@@ -1,66 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
from typing import Optional, Dict, List, Union, Any
|
3
|
-
from deltacat.storage import (
|
4
|
-
Delta,
|
5
|
-
DeltaLocator,
|
6
|
-
interface as unimplemented_deltacat_storage,
|
7
|
-
)
|
8
|
-
|
9
|
-
|
10
|
-
class MergeOnReadParams(dict):
    """
    Dictionary-backed parameters for a merge-on-read ``merge`` call.

    Recognized keys, each exposed as a property:

    * ``deltas`` (required): the deltas to merge.
    * ``reader_kwargs``: keyword arguments passed to the reader.
    * ``deltacat_storage``: the storage implementation to read deltas with.
    * ``deltacat_storage_kwargs``: keyword arguments passed to the storage
      implementation.
    """

    @staticmethod
    def of(params: Optional[Dict]) -> MergeOnReadParams:
        """
        Build a ``MergeOnReadParams`` from a plain dictionary, applying
        defaults for every optional key.

        Args:
            params: Source dictionary; ``None`` is treated as empty.

        Returns:
            A new ``MergeOnReadParams`` with all optional keys populated.

        Raises:
            AssertionError: If ``deltas`` is missing or ``None``.
        """
        params = {} if params is None else params

        result = MergeOnReadParams(params)
        # Look the key up with .get(): going through the ``deltas`` property
        # would raise a bare KeyError for a missing key and hide the intended
        # error message.
        if result.get("deltas") is None:
            raise AssertionError("deltas is a required arg")

        result.deltacat_storage = params.get(
            "deltacat_storage", unimplemented_deltacat_storage
        )
        result.reader_kwargs = params.get("reader_kwargs", {})
        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})

        return result

    @property
    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
        """
        The list of deltas to merge.
        """
        return self["deltas"]

    @deltas.setter
    def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
        self["deltas"] = to_set

    @property
    def reader_kwargs(self) -> Dict[Any, Any]:
        """
        The key word arguments to be passed to the reader.
        """
        return self["reader_kwargs"]

    @reader_kwargs.setter
    def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
        self["reader_kwargs"] = kwargs

    @property
    def deltacat_storage(self) -> unimplemented_deltacat_storage:
        """
        The storage implementation used to read the deltas.
        """
        return self["deltacat_storage"]

    @deltacat_storage.setter
    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
        self["deltacat_storage"] = storage

    @property
    def deltacat_storage_kwargs(self) -> dict:
        """
        The key word arguments to be passed to the storage implementation.
        """
        return self["deltacat_storage_kwargs"]

    @deltacat_storage_kwargs.setter
    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
        self["deltacat_storage_kwargs"] = kwargs
|
@@ -1,42 +0,0 @@
|
|
1
|
-
from typing import List, Dict, Any, Optional, Union
|
2
|
-
from deltacat.storage.model.delta import Delta, DeltaLocator
|
3
|
-
from deltacat.storage.model.types import DistributedDataset
|
4
|
-
from deltacat.storage import (
|
5
|
-
interface as unimplemented_deltacat_storage,
|
6
|
-
)
|
7
|
-
from deltacat.types.media import TableType, StorageType, DistributedDatasetType
|
8
|
-
|
9
|
-
|
10
|
-
def create_df_from_all_deltas(
    deltas: List[Union[Delta, DeltaLocator]],
    table_type: TableType,
    distributed_dataset_type: DistributedDatasetType,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
    *args,
    **kwargs
) -> List[DistributedDataset]:  # type: ignore
    """
    Download every delta as a distributed dataset and return the datasets in
    the same order as ``deltas``.

    Args:
        deltas: Deltas (or delta locators) to download.
        table_type: Table type requested from the storage layer.
        distributed_dataset_type: Distributed dataset type requested from the
            storage layer.
        reader_kwargs: Extra keyword arguments for ``download_delta``;
            ``None`` means no extra arguments.
        deltacat_storage: Storage implementation providing ``download_delta``.
        deltacat_storage_kwargs: Extra keyword arguments for
            ``download_delta``; ``None`` means no extra arguments.
        *args: Accepted but not used by this function.
        **kwargs: Accepted but not used by this function.

    Returns:
        One distributed dataset per input delta.
    """
    extra_reader_kwargs = {} if reader_kwargs is None else reader_kwargs
    extra_storage_kwargs = (
        {} if deltacat_storage_kwargs is None else deltacat_storage_kwargs
    )

    # Every delta is read with distributed storage so the result is a
    # distributed dataset rather than a local table.
    return [
        deltacat_storage.download_delta(
            delta_like=delta,
            table_type=table_type,
            distributed_dataset_type=distributed_dataset_type,
            storage_type=StorageType.DISTRIBUTED,
            **extra_reader_kwargs,
            **extra_storage_kwargs
        )
        for delta in deltas
    ]
|
@@ -1,231 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import os
|
3
|
-
from moto import mock_s3
|
4
|
-
import boto3
|
5
|
-
from boto3.resources.base import ServiceResource
|
6
|
-
from deltacat.compute.compactor.utils.round_completion_file import (
|
7
|
-
read_round_completion_file,
|
8
|
-
write_round_completion_file,
|
9
|
-
)
|
10
|
-
from deltacat.tests.compute.test_util_common import get_test_partition_locator
|
11
|
-
from deltacat.compute.compactor import RoundCompletionInfo
|
12
|
-
|
13
|
-
RCF_BUCKET_NAME = "rcf-bucket"
|
14
|
-
|
15
|
-
|
16
|
-
@pytest.fixture(autouse=True, scope="module")
def mock_aws_credential():
    """
    Populate the environment with dummy AWS credentials and a default region
    for every test in this module, so the AWS client is configured with
    placeholder values rather than whatever the host environment provides.
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    # The secret key variable is AWS_SECRET_ACCESS_KEY; the previous
    # "AWS_SECRET_ACCESS_ID" is not a name boto3 reads, so the dummy secret
    # was never applied.
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
    yield
|
24
|
-
|
25
|
-
|
26
|
-
@pytest.fixture(autouse=True, scope="module")
def s3_resource(mock_aws_credential):
    """
    Yield a boto3 S3 resource created inside a ``mock_s3`` context, shared by
    all tests in this module. Depends on ``mock_aws_credential`` so the dummy
    credentials are in place before the resource is created.
    """
    with mock_s3():
        mocked_resource = boto3.resource("s3")
        yield mocked_resource
|
30
|
-
|
31
|
-
|
32
|
-
@pytest.fixture(autouse=True, scope="function")
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
    """
    Create the round completion file bucket before each test and delete all
    of its objects afterwards, so no test sees files written by another.
    """
    s3_resource.create_bucket(
        Bucket=RCF_BUCKET_NAME,
        ACL="authenticated-read",
    )
    yield
    rcf_bucket = s3_resource.Bucket(RCF_BUCKET_NAME)
    rcf_bucket.objects.all().delete()
|
40
|
-
|
41
|
-
|
42
|
-
class TestReadWriteRoundCompletionFile:
    """
    Round-trip tests for ``write_round_completion_file`` and
    ``read_round_completion_file`` against the module's S3 test bucket.
    """

    @staticmethod
    def _make_rcf(high_watermark=122, sort_keys_bit_width=12):
        """Build a minimal ``RoundCompletionInfo`` for use as test data."""
        return RoundCompletionInfo.of(
            high_watermark=high_watermark,
            compacted_delta_locator={},
            compacted_pyarrow_write_result={},
            sort_keys_bit_width=sort_keys_bit_width,
        )

    def test_read_when_rcf_written_without_destination(self):
        """
        An RCF written without a destination locator is still found when it
        is read with one (backward-compatible source-only URL).
        """
        src = get_test_partition_locator("source")
        dest = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()

        written_url = write_round_completion_file(
            RCF_BUCKET_NAME, src, None, expected_rcf
        )
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, dest)

        assert (
            written_url
            == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
        )
        assert actual_rcf == expected_rcf

    def test_read_when_rcf_written_with_destination(self):
        """
        An RCF written with a destination locator lands under the source
        digest prefix and is read back with the same locators.
        """
        src = get_test_partition_locator("source")
        dest = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()

        written_url = write_round_completion_file(
            RCF_BUCKET_NAME, src, dest, expected_rcf
        )
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, dest)

        assert (
            written_url
            == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
        )
        assert actual_rcf == expected_rcf

    def test_read_without_destination_when_rcf_written_with_destination(self):
        """
        An RCF written with a destination locator is not found when it is
        read without one.
        """
        src = get_test_partition_locator("source")
        dest = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()

        write_round_completion_file(RCF_BUCKET_NAME, src, dest, expected_rcf)
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, None)

        assert actual_rcf is None

    def test_read_without_destination_when_rcf_written_without_destination(self):
        """
        An RCF written without a destination locator is read back without one.
        """
        src = get_test_partition_locator("source")
        expected_rcf = self._make_rcf()

        write_round_completion_file(RCF_BUCKET_NAME, src, None, expected_rcf)
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, None)

        assert actual_rcf == expected_rcf

    def test_read_when_rcf_written_both_with_and_without_destination(self):
        """
        When RCFs exist at both URLs, reading with a destination locator
        returns the destination-qualified one.
        """
        src = get_test_partition_locator("source")
        dest = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()
        expected_rcf_2 = self._make_rcf(high_watermark=1223, sort_keys_bit_width=1233)

        write_round_completion_file(RCF_BUCKET_NAME, src, None, expected_rcf)
        write_round_completion_file(RCF_BUCKET_NAME, src, dest, expected_rcf_2)
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, dest)

        assert actual_rcf == expected_rcf_2

    def test_read_when_none_destination_partition_id(self):
        """
        Write and read round-trip when the destination locator is built with
        a ``None`` partition ID.
        """
        src = get_test_partition_locator("source")
        dest = get_test_partition_locator(None)
        expected_rcf = self._make_rcf()

        write_round_completion_file(RCF_BUCKET_NAME, src, dest, expected_rcf)
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, dest)

        assert actual_rcf == expected_rcf

    def test_write_when_custom_url_is_passed(self):
        """
        An explicit ``completion_file_s3_url`` is used as the write target, so
        a read that derives the URL from the locators finds nothing.
        """
        src = get_test_partition_locator("source")
        expected_rcf = self._make_rcf()
        completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"

        written_url = write_round_completion_file(
            RCF_BUCKET_NAME,
            src,
            None,
            expected_rcf,
            completion_file_s3_url=completion_file_s3_url,
        )
        actual_rcf = read_round_completion_file(RCF_BUCKET_NAME, src, None)

        assert written_url == completion_file_s3_url
        assert actual_rcf is None
|