deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,97 +0,0 @@
1
- import json
2
- import logging
3
- from typing import Dict, Any
4
- from deltacat import logs
5
- from deltacat.compute.compactor import RoundCompletionInfo
6
- from deltacat.storage import PartitionLocator
7
- from deltacat.aws import s3u as s3_utils
8
- from typing import Optional
9
- from deltacat.utils.metrics import metrics
10
-
11
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
12
-
13
-
14
- def get_round_completion_file_s3_url(
15
- bucket: str,
16
- source_partition_locator: PartitionLocator,
17
- destination_partition_locator: Optional[PartitionLocator] = None,
18
- ) -> str:
19
-
20
- base_url = source_partition_locator.path(f"s3://{bucket}")
21
- if destination_partition_locator:
22
- base_url = destination_partition_locator.path(
23
- f"s3://{bucket}/{source_partition_locator.hexdigest()}"
24
- )
25
-
26
- return f"{base_url}.json"
27
-
28
-
29
- @metrics
30
- def read_round_completion_file(
31
- bucket: str,
32
- source_partition_locator: PartitionLocator,
33
- destination_partition_locator: Optional[PartitionLocator] = None,
34
- **s3_client_kwargs: Optional[Dict[str, Any]],
35
- ) -> RoundCompletionInfo:
36
-
37
- all_uris = []
38
- if destination_partition_locator:
39
- round_completion_file_url_with_destination = get_round_completion_file_s3_url(
40
- bucket,
41
- source_partition_locator,
42
- destination_partition_locator,
43
- )
44
- all_uris.append(round_completion_file_url_with_destination)
45
-
46
- # Note: we read from RCF at two different URI for backward
47
- # compatibility reasons.
48
- round_completion_file_url_prev = get_round_completion_file_s3_url(
49
- bucket,
50
- source_partition_locator,
51
- )
52
-
53
- all_uris.append(round_completion_file_url_prev)
54
-
55
- round_completion_info = None
56
-
57
- for rcf_uri in all_uris:
58
- logger.info(f"Reading round completion file from: {rcf_uri}")
59
- result = s3_utils.download(rcf_uri, False, **s3_client_kwargs)
60
- if result:
61
- json_str = result["Body"].read().decode("utf-8")
62
- round_completion_info = RoundCompletionInfo(json.loads(json_str))
63
- logger.info(f"Read round completion info: {round_completion_info}")
64
- break
65
- else:
66
- logger.warning(f"Round completion file not present at {rcf_uri}")
67
-
68
- return round_completion_info
69
-
70
-
71
- @metrics
72
- def write_round_completion_file(
73
- bucket: Optional[str],
74
- source_partition_locator: Optional[PartitionLocator],
75
- destination_partition_locator: Optional[PartitionLocator],
76
- round_completion_info: RoundCompletionInfo,
77
- completion_file_s3_url: Optional[str] = None,
78
- **s3_client_kwargs: Optional[Dict[str, Any]],
79
- ) -> str:
80
- if bucket is None and completion_file_s3_url is None:
81
- raise AssertionError("Either bucket or completion_file_s3_url must be passed")
82
-
83
- logger.info(f"writing round completion file contents: {round_completion_info}")
84
- if completion_file_s3_url is None:
85
- completion_file_s3_url = get_round_completion_file_s3_url(
86
- bucket,
87
- source_partition_locator,
88
- destination_partition_locator,
89
- )
90
- logger.info(f"writing round completion file to: {completion_file_s3_url}")
91
- s3_utils.upload(
92
- completion_file_s3_url,
93
- str(json.dumps(round_completion_info)),
94
- **s3_client_kwargs,
95
- )
96
- logger.info(f"round completion file written to: {completion_file_s3_url}")
97
- return completion_file_s3_url
@@ -1,4 +0,0 @@
1
- from deltacat.types.media import DistributedDatasetType
2
- from deltacat.compute.merge_on_read.daft import merge as daft_merge
3
-
4
- MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE = {DistributedDatasetType.DAFT.value: daft_merge}
@@ -1,40 +0,0 @@
1
- import logging
2
- from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
3
- from deltacat.storage.model.types import DistributedDataset
4
- from deltacat.types.media import TableType, DistributedDatasetType
5
- from deltacat.compute.merge_on_read.utils.delta import create_df_from_all_deltas
6
- from deltacat import logs
7
-
8
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
9
-
10
-
11
- def merge(params: MergeOnReadParams, **kwargs) -> DistributedDataset:
12
- """
13
- Merges the given deltas and returns the result as distributed dataframe.
14
- It reads the deltas into the Daft dataframe and leverages operations supported
15
- by Daft to perform an efficient merge using Ray cluster.
16
-
17
- TODO(raghumdani): Perform actual merge.
18
- """
19
-
20
- delta_dfs = create_df_from_all_deltas(
21
- deltas=params.deltas,
22
- table_type=TableType.PYARROW,
23
- distributed_dataset_type=DistributedDatasetType.DAFT,
24
- reader_kwargs=params.reader_kwargs,
25
- deltacat_storage=params.deltacat_storage,
26
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
27
- **kwargs,
28
- )
29
-
30
- logger.info(f"Merging {len(delta_dfs)} delta dfs...")
31
-
32
- # TODO: This code should be optimized from daft side
33
- result = None
34
- for df in delta_dfs:
35
- if result is None:
36
- result = df
37
- else:
38
- result = result.concat(df)
39
-
40
- return result
@@ -1,66 +0,0 @@
1
- from __future__ import annotations
2
- from typing import Optional, Dict, List, Union, Any
3
- from deltacat.storage import (
4
- Delta,
5
- DeltaLocator,
6
- interface as unimplemented_deltacat_storage,
7
- )
8
-
9
-
10
- class MergeOnReadParams(dict):
11
- """
12
- This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
13
- """
14
-
15
- @staticmethod
16
- def of(params: Optional[Dict]) -> MergeOnReadParams:
17
- params = {} if params is None else params
18
-
19
- result = MergeOnReadParams(params)
20
- assert result.deltas is not None, "deltas is a required arg"
21
-
22
- result.deltacat_storage = params.get(
23
- "deltacat_storage", unimplemented_deltacat_storage
24
- )
25
- result.reader_kwargs = params.get("reader_kwargs", {})
26
- result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
27
-
28
- return result
29
-
30
- @property
31
- def deltas(self) -> List[Union[Delta, DeltaLocator]]:
32
- """
33
- The list of deltas to compact in-memory.
34
- """
35
- return self["deltas"]
36
-
37
- @deltas.setter
38
- def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
39
- self["deltas"] = to_set
40
-
41
- @property
42
- def reader_kwargs(self) -> Dict[Any, Any]:
43
- """
44
- The key word arguments to be passed to the reader.
45
- """
46
- return self["reader_kwargs"]
47
-
48
- @reader_kwargs.setter
49
- def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
50
- self["reader_kwargs"] = kwargs
51
-
52
- @property
53
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
54
- return self["deltacat_storage"]
55
-
56
- @deltacat_storage.setter
57
- def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
58
- self["deltacat_storage"] = storage
59
-
60
- @property
61
- def deltacat_storage_kwargs(self) -> dict:
62
- return self["deltacat_storage_kwargs"]
63
-
64
- @deltacat_storage_kwargs.setter
65
- def deltacat_storage_kwargs(self, kwargs: dict) -> None:
66
- self["deltacat_storage_kwargs"] = kwargs
@@ -1,42 +0,0 @@
1
- from typing import List, Dict, Any, Optional, Union
2
- from deltacat.storage.model.delta import Delta, DeltaLocator
3
- from deltacat.storage.model.types import DistributedDataset
4
- from deltacat.storage import (
5
- interface as unimplemented_deltacat_storage,
6
- )
7
- from deltacat.types.media import TableType, StorageType, DistributedDatasetType
8
-
9
-
10
- def create_df_from_all_deltas(
11
- deltas: List[Union[Delta, DeltaLocator]],
12
- table_type: TableType,
13
- distributed_dataset_type: DistributedDatasetType,
14
- reader_kwargs: Optional[Dict[Any, Any]] = None,
15
- deltacat_storage=unimplemented_deltacat_storage,
16
- deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
17
- *args,
18
- **kwargs
19
- ) -> List[DistributedDataset]: # type: ignore
20
- """
21
- This method creates a distributed dataset for each delta and returns their references.
22
- """
23
-
24
- if reader_kwargs is None:
25
- reader_kwargs = {}
26
- if deltacat_storage_kwargs is None:
27
- deltacat_storage_kwargs = {}
28
-
29
- df_list = []
30
-
31
- for delta in deltas:
32
- df = deltacat_storage.download_delta(
33
- delta_like=delta,
34
- table_type=table_type,
35
- distributed_dataset_type=distributed_dataset_type,
36
- storage_type=StorageType.DISTRIBUTED,
37
- **reader_kwargs,
38
- **deltacat_storage_kwargs
39
- )
40
- df_list.append(df)
41
-
42
- return df_list
@@ -1,231 +0,0 @@
1
- import pytest
2
- import os
3
- from moto import mock_s3
4
- import boto3
5
- from boto3.resources.base import ServiceResource
6
- from deltacat.compute.compactor.utils.round_completion_file import (
7
- read_round_completion_file,
8
- write_round_completion_file,
9
- )
10
- from deltacat.tests.compute.test_util_common import get_test_partition_locator
11
- from deltacat.compute.compactor import RoundCompletionInfo
12
-
13
- RCF_BUCKET_NAME = "rcf-bucket"
14
-
15
-
16
- @pytest.fixture(autouse=True, scope="module")
17
- def mock_aws_credential():
18
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
19
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
20
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
21
- os.environ["AWS_SESSION_TOKEN"] = "testing"
22
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
23
- yield
24
-
25
-
26
- @pytest.fixture(autouse=True, scope="module")
27
- def s3_resource(mock_aws_credential):
28
- with mock_s3():
29
- yield boto3.resource("s3")
30
-
31
-
32
- @pytest.fixture(autouse=True, scope="function")
33
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
34
- s3_resource.create_bucket(
35
- ACL="authenticated-read",
36
- Bucket=RCF_BUCKET_NAME,
37
- )
38
- yield
39
- s3_resource.Bucket(RCF_BUCKET_NAME).objects.all().delete()
40
-
41
-
42
- class TestReadWriteRoundCompletionFile:
43
- def test_read_when_rcf_written_without_destination(self):
44
- """
45
- This test case tests the backward compatibility by successfully
46
- reading the previously written rcf.
47
- """
48
-
49
- source_locator = get_test_partition_locator("source")
50
- destination_locator = get_test_partition_locator("destination")
51
-
52
- expected_rcf = RoundCompletionInfo.of(
53
- high_watermark=122,
54
- compacted_delta_locator={},
55
- compacted_pyarrow_write_result={},
56
- sort_keys_bit_width=12,
57
- )
58
-
59
- rcf_url = write_round_completion_file(
60
- RCF_BUCKET_NAME, source_locator, None, expected_rcf
61
- )
62
-
63
- rcf = read_round_completion_file(
64
- RCF_BUCKET_NAME, source_locator, destination_locator
65
- )
66
-
67
- assert (
68
- rcf_url == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
69
- )
70
- assert rcf == expected_rcf
71
-
72
- def test_read_when_rcf_written_with_destination(self):
73
- """
74
- This test case tests the backward compatibility by successfully
75
- reading the previously written rcf.
76
- """
77
-
78
- source_locator = get_test_partition_locator("source")
79
- destination_locator = get_test_partition_locator("destination")
80
-
81
- expected_rcf = RoundCompletionInfo.of(
82
- high_watermark=122,
83
- compacted_delta_locator={},
84
- compacted_pyarrow_write_result={},
85
- sort_keys_bit_width=12,
86
- )
87
-
88
- rcf_url = write_round_completion_file(
89
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
90
- )
91
-
92
- rcf = read_round_completion_file(
93
- RCF_BUCKET_NAME, source_locator, destination_locator
94
- )
95
-
96
- assert (
97
- rcf_url
98
- == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
99
- )
100
- assert rcf == expected_rcf
101
-
102
- def test_read_without_destination_when_rcf_written_with_destination(self):
103
- """
104
- This test case tests the backward compatibility by successfully
105
- reading the previously written rcf.
106
- """
107
-
108
- source_locator = get_test_partition_locator("source")
109
- destination_locator = get_test_partition_locator("destination")
110
-
111
- expected_rcf = RoundCompletionInfo.of(
112
- high_watermark=122,
113
- compacted_delta_locator={},
114
- compacted_pyarrow_write_result={},
115
- sort_keys_bit_width=12,
116
- )
117
-
118
- write_round_completion_file(
119
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
120
- )
121
-
122
- rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
123
-
124
- assert rcf is None
125
-
126
- def test_read_without_destination_when_rcf_written_without_destination(self):
127
- """
128
- This test case tests the backward compatibility by successfully
129
- reading the previously written rcf.
130
- """
131
-
132
- source_locator = get_test_partition_locator("source")
133
-
134
- expected_rcf = RoundCompletionInfo.of(
135
- high_watermark=122,
136
- compacted_delta_locator={},
137
- compacted_pyarrow_write_result={},
138
- sort_keys_bit_width=12,
139
- )
140
-
141
- write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
142
-
143
- rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
144
-
145
- assert rcf == expected_rcf
146
-
147
- def test_read_when_rcf_written_both_with_and_without_destination(self):
148
- """
149
- This test case tests the backward compatibility by successfully
150
- reading the previously written rcf.
151
- """
152
-
153
- source_locator = get_test_partition_locator("source")
154
- destination_locator = get_test_partition_locator("destination")
155
-
156
- expected_rcf = RoundCompletionInfo.of(
157
- high_watermark=122,
158
- compacted_delta_locator={},
159
- compacted_pyarrow_write_result={},
160
- sort_keys_bit_width=12,
161
- )
162
-
163
- expected_rcf_2 = RoundCompletionInfo.of(
164
- high_watermark=1223,
165
- compacted_delta_locator={},
166
- compacted_pyarrow_write_result={},
167
- sort_keys_bit_width=1233,
168
- )
169
-
170
- write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
171
-
172
- write_round_completion_file(
173
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
174
- )
175
-
176
- rcf = read_round_completion_file(
177
- RCF_BUCKET_NAME, source_locator, destination_locator
178
- )
179
-
180
- assert rcf == expected_rcf_2
181
-
182
- def test_read_when_none_destination_partition_id(self):
183
-
184
- source_locator = get_test_partition_locator("source")
185
- destination_locator = get_test_partition_locator(None)
186
-
187
- expected_rcf = RoundCompletionInfo.of(
188
- high_watermark=122,
189
- compacted_delta_locator={},
190
- compacted_pyarrow_write_result={},
191
- sort_keys_bit_width=12,
192
- )
193
-
194
- write_round_completion_file(
195
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
196
- )
197
-
198
- rcf = read_round_completion_file(
199
- RCF_BUCKET_NAME, source_locator, destination_locator
200
- )
201
-
202
- assert rcf == expected_rcf
203
-
204
- def test_write_when_custom_url_is_passed(self):
205
- """
206
- This test case tests the backward compatibility by successfully
207
- reading the previously written rcf.
208
- """
209
-
210
- source_locator = get_test_partition_locator("source")
211
-
212
- expected_rcf = RoundCompletionInfo.of(
213
- high_watermark=122,
214
- compacted_delta_locator={},
215
- compacted_pyarrow_write_result={},
216
- sort_keys_bit_width=12,
217
- )
218
-
219
- completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"
220
- rcf_url = write_round_completion_file(
221
- RCF_BUCKET_NAME,
222
- source_locator,
223
- None,
224
- expected_rcf,
225
- completion_file_s3_url=completion_file_s3_url,
226
- )
227
-
228
- rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
229
-
230
- assert rcf_url == completion_file_s3_url
231
- assert rcf is None