deltacat 0.2.11__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/s3u.py +250 -111
  3. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  4. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  5. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  6. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  7. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  8. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  9. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  10. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  11. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  12. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  13. deltacat/compute/compactor_v2/utils/task_options.py +16 -4
  14. deltacat/compute/merge_on_read/__init__.py +4 -0
  15. deltacat/compute/merge_on_read/daft.py +40 -0
  16. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  17. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  18. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  20. deltacat/storage/interface.py +10 -2
  21. deltacat/storage/model/types.py +3 -11
  22. deltacat/tests/catalog/__init__.py +0 -0
  23. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  24. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  25. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  26. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  27. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  28. deltacat/tests/test_utils/pyarrow.py +33 -14
  29. deltacat/tests/utils/test_daft.py +42 -2
  30. deltacat/types/media.py +5 -0
  31. deltacat/types/tables.py +7 -1
  32. deltacat/utils/daft.py +84 -14
  33. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/METADATA +2 -2
  34. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/RECORD +37 -25
  35. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/LICENSE +0 -0
  36. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/WHEEL +0 -0
  37. {deltacat-0.2.11.dist-info → deltacat-1.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,126 @@
+import logging
+
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    LocalMergeFileGroupsProvider,
+)
+from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+import pyarrow as pa
+from deltacat import logs
+from typing import List, Optional
+
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.compute.compactor import (
+    RoundCompletionInfo,
+    DeltaAnnotated,
+)
+
+from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+
+from deltacat.utils.performance import timed_invocation
+from deltacat.storage import (
+    Partition,
+)
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def materialize(
+    input: MergeInput,
+    task_index: int,
+    compacted_tables: List[pa.Table],
+) -> MaterializeResult:
+    compacted_table = pa.concat_tables(compacted_tables)
+    if input.compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+        # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+        # TODO (pdames): compare performance to pandas-native materialize path
+        df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
+        compacted_table = df
+    delta, stage_delta_time = timed_invocation(
+        input.deltacat_storage.stage_delta,
+        compacted_table,
+        input.write_to_partition,
+        max_records_per_entry=input.max_records_per_output_file,
+        content_type=input.compacted_file_content_type,
+        s3_table_writer_kwargs=input.s3_table_writer_kwargs,
+        **input.deltacat_storage_kwargs,
+    )
+    compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+        compacted_table
+    )
+    logger.debug(
+        f"Time taken for materialize task"
+        f" to upload {len(compacted_table)} records"
+        f" of size {compacted_table_size} is: {stage_delta_time}s"
+    )
+    manifest = delta.manifest
+    manifest_records = manifest.meta.record_count
+    assert manifest_records == len(compacted_table), (
+        f"Unexpected Error: Materialized delta manifest record count "
+        f"({manifest_records}) does not equal compacted table record count "
+        f"({len(compacted_table)})"
+    )
+    materialize_result = MaterializeResult.of(
+        delta=delta,
+        task_index=task_index,
+        # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+        #  and in-memory-table-bytes instead of tight coupling to paBytes
+        pyarrow_write_result=PyArrowWriteResult.of(
+            len(manifest.entries),
+            TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+            manifest.meta.content_length,
+            len(compacted_table),
+        ),
+    )
+    logger.info(f"Materialize result: {materialize_result}")
+    return materialize_result
+
+
+def generate_local_merge_input(
+    params: CompactPartitionParams,
+    annotated_deltas: List[DeltaAnnotated],
+    compacted_partition: Partition,
+    round_completion_info: Optional[RoundCompletionInfo],
+):
+    """
+    Generates a merge input for local deltas that do not reside in the Ray object store and
+    have not been subject to the hash bucketing process.
+
+    Args:
+        params: parameters for compacting a partition
+        annotated_deltas: a list of annotated deltas
+        compacted_partition: the compacted partition to write to
+        round_completion_info: keeps track of high watermarks and other metadata from previous compaction rounds
+
+    Returns:
+        A MergeInput object
+
+    """
+
+    return MergeInput.of(
+        merge_file_groups_provider=LocalMergeFileGroupsProvider(
+            annotated_deltas,
+            read_kwargs_provider=params.read_kwargs_provider,
+            deltacat_storage=params.deltacat_storage,
+            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        ),
+        write_to_partition=compacted_partition,
+        compacted_file_content_type=params.compacted_file_content_type,
+        primary_keys=params.primary_keys,
+        sort_keys=params.sort_keys,
+        drop_duplicates=params.drop_duplicates,
+        max_records_per_output_file=params.records_per_compacted_file,
+        enable_profiler=params.enable_profiler,
+        metrics_config=params.metrics_config,
+        s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+        read_kwargs_provider=params.read_kwargs_provider,
+        round_completion_info=round_completion_info,
+        object_store=params.object_store,
+        deltacat_storage=params.deltacat_storage,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+    )
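
Note on the helper used above: materialize() wraps the stage_delta call in timed_invocation (imported from deltacat.utils.performance) to log upload latency. A minimal sketch of that call-and-time pattern is shown below; this is an illustrative re-implementation for readers, not the library's actual code.

import time
from typing import Any, Callable, Tuple

def timed_invocation(func: Callable[..., Any], *args, **kwargs) -> Tuple[Any, float]:
    # Invoke `func` and return its result together with the elapsed wall-clock seconds.
    start = time.monotonic()
    result = func(*args, **kwargs)
    return result, time.monotonic() - start

# Example: time a trivial call the same way materialize() times stage_delta.
rows, latency = timed_invocation(sorted, [3, 1, 2])
print(rows, f"{latency:.6f}s")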
@@ -170,14 +170,24 @@ def hash_bucket_resource_options_provider(
         else:
             total_pk_size += pk_size

-    # total size + pk size + pk hash column + hash bucket index column
+    # total size + pk size + pyarrow-to-numpy conversion + pk hash column + hashlib inefficiency + hash bucket index column
     # Refer to hash_bucket step for more details.
-    total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
+    total_memory = (
+        size_bytes
+        + total_pk_size
+        + total_pk_size
+        + num_rows * 20
+        + num_rows * 20
+        + num_rows * 4
+    )

     debug_memory_params["size_bytes"] = size_bytes
     debug_memory_params["num_rows"] = num_rows
     debug_memory_params["total_pk_size"] = total_pk_size
     debug_memory_params["total_memory"] = total_memory

+    debug_memory_params["previous_inflation"] = previous_inflation
+    debug_memory_params["average_record_size_bytes"] = average_record_size_bytes
+
     # Consider buffer
     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
     debug_memory_params["total_memory_with_buffer"] = total_memory
@@ -269,11 +279,13 @@ def merge_resource_options_provider(
         else:
             pk_size_bytes += pk_size

-    # total data downloaded + primary key hash column + primary key column
-    # + dict size for merge + incremental index array size
+    # total data downloaded + primary key hash column + pyarrow-to-numpy conversion
+    # + primary key column + hashlib inefficiency + dict size for merge + incremental index array size
     total_memory = (
         data_size
         + pk_size_bytes
+        + pk_size_bytes
+        + num_rows * 20
         + num_rows * 20
         + num_rows * 20
         + incremental_index_array_size
@@ -0,0 +1,4 @@
+from deltacat.types.media import DistributedDatasetType
+from deltacat.compute.merge_on_read.daft import merge as daft_merge
+
+MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE = {DistributedDatasetType.DAFT.value: daft_merge}
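
This map is the single dispatch point from a DistributedDatasetType to its merge-on-read implementation. A hedged sketch of how a caller might resolve the registered function (the lookup mirrors the dict construction above; the error message is illustrative):

from deltacat.types.media import DistributedDatasetType
from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE

dataset_type = DistributedDatasetType.DAFT
merge_func = MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE.get(dataset_type.value)
if merge_func is None:
    raise ValueError(f"No merge-on-read implementation registered for {dataset_type}")
# merge_func(params) would then be called with a MergeOnReadParams instance
# (see the merge_on_read_params.py hunk later in this diff).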
@@ -0,0 +1,40 @@
+import logging
+from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
+from deltacat.storage.model.types import DistributedDataset
+from deltacat.types.media import TableType, DistributedDatasetType
+from deltacat.compute.merge_on_read.utils.delta import create_df_from_all_deltas
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def merge(params: MergeOnReadParams, **kwargs) -> DistributedDataset:
+    """
+    Merges the given deltas and returns the result as a distributed dataframe.
+    It reads the deltas into Daft dataframes and leverages operations supported
+    by Daft to perform an efficient merge on a Ray cluster.
+
+    TODO(raghumdani): Perform actual merge.
+    """
+
+    delta_dfs = create_df_from_all_deltas(
+        deltas=params.deltas,
+        table_type=TableType.PYARROW,
+        distributed_dataset_type=DistributedDatasetType.DAFT,
+        reader_kwargs=params.reader_kwargs,
+        deltacat_storage=params.deltacat_storage,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        **kwargs,
+    )
+
+    logger.info(f"Merging {len(delta_dfs)} delta dfs...")
+
+    # TODO: This code should be optimized from the Daft side
+    result = None
+    for df in delta_dfs:
+        if result is None:
+            result = df
+        else:
+            result = result.concat(df)
+
+    return result
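
The loop above folds DataFrame.concat over the per-delta dataframes. An equivalent fold written with functools.reduce, shown only to illustrate the shape of the operation (concat_all is a hypothetical helper, not part of this diff):

import functools

def concat_all(delta_dfs):
    # Fold DataFrame.concat over the list; returns None for an empty list,
    # matching the behavior of the loop in merge() above.
    if not delta_dfs:
        return None
    return functools.reduce(lambda left, right: left.concat(right), delta_dfs)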
File without changes
@@ -0,0 +1,66 @@
+from __future__ import annotations
+from typing import Optional, Dict, List, Union, Any
+from deltacat.storage import (
+    Delta,
+    DeltaLocator,
+    interface as unimplemented_deltacat_storage,
+)
+
+
+class MergeOnReadParams(dict):
+    """
+    This class represents the parameters passed to merge-on-read (deltacat/compute/merge_on_read).
+    """
+
+    @staticmethod
+    def of(params: Optional[Dict]) -> MergeOnReadParams:
+        params = {} if params is None else params
+
+        result = MergeOnReadParams(params)
+        assert result.deltas is not None, "deltas is a required arg"
+
+        result.deltacat_storage = params.get(
+            "deltacat_storage", unimplemented_deltacat_storage
+        )
+        result.reader_kwargs = params.get("reader_kwargs", {})
+        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
+
+        return result
+
+    @property
+    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
+        """
+        The list of deltas to compact in-memory.
+        """
+        return self["deltas"]
+
+    @deltas.setter
+    def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
+        self["deltas"] = to_set
+
+    @property
+    def reader_kwargs(self) -> Dict[Any, Any]:
+        """
+        The keyword arguments to be passed to the reader.
+        """
+        return self["reader_kwargs"]
+
+    @reader_kwargs.setter
+    def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
+        self["reader_kwargs"] = kwargs
+
+    @property
+    def deltacat_storage(self) -> unimplemented_deltacat_storage:
+        return self["deltacat_storage"]
+
+    @deltacat_storage.setter
+    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
+        self["deltacat_storage"] = storage
+
+    @property
+    def deltacat_storage_kwargs(self) -> dict:
+        return self["deltacat_storage_kwargs"]
+
+    @deltacat_storage_kwargs.setter
+    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
+        self["deltacat_storage_kwargs"] = kwargs
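
A hedged usage sketch tying MergeOnReadParams to the Daft merge entry point introduced earlier in this diff; the empty deltas list is a placeholder for Delta or DeltaLocator objects fetched from a real storage implementation.

from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
from deltacat.compute.merge_on_read.daft import merge as daft_merge

deltas = []  # placeholder: real Delta or DeltaLocator objects from the storage layer

params = MergeOnReadParams.of(
    {
        "deltas": deltas,               # required key
        "reader_kwargs": {},            # forwarded to the Daft reader
        "deltacat_storage_kwargs": {},  # forwarded to download_delta
        # "deltacat_storage" defaults to the unimplemented interface if omitted
    }
)
df = daft_merge(params)  # a Daft DataFrame, or None for the empty placeholder list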
File without changes
@@ -0,0 +1,42 @@
+from typing import List, Dict, Any, Optional, Union
+from deltacat.storage.model.delta import Delta, DeltaLocator
+from deltacat.storage.model.types import DistributedDataset
+from deltacat.storage import (
+    interface as unimplemented_deltacat_storage,
+)
+from deltacat.types.media import TableType, StorageType, DistributedDatasetType
+
+
+def create_df_from_all_deltas(
+    deltas: List[Union[Delta, DeltaLocator]],
+    table_type: TableType,
+    distributed_dataset_type: DistributedDatasetType,
+    reader_kwargs: Optional[Dict[Any, Any]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
+    *args,
+    **kwargs
+) -> List[DistributedDataset]:  # type: ignore
+    """
+    This method creates a distributed dataset for each delta and returns their references.
+    """
+
+    if reader_kwargs is None:
+        reader_kwargs = {}
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
+    df_list = []
+
+    for delta in deltas:
+        df = deltacat_storage.download_delta(
+            delta_like=delta,
+            table_type=table_type,
+            distributed_dataset_type=distributed_dataset_type,
+            storage_type=StorageType.DISTRIBUTED,
+            **reader_kwargs,
+            **deltacat_storage_kwargs
+        )
+        df_list.append(df)
+
+    return df_list
@@ -23,7 +23,12 @@ from deltacat.storage import (
     SortKey,
     PartitionLocator,
 )
-from deltacat.types.media import ContentType, StorageType, TableType
+from deltacat.types.media import (
+    ContentType,
+    StorageType,
+    TableType,
+    DistributedDatasetType,
+)
 from deltacat.utils.common import ReadKwargsProvider


@@ -178,9 +183,10 @@ def download_delta(
     columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
     *args,
     **kwargs
-) -> Union[LocalDataset, DistributedDataset]:
+) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     """
     Download the given delta or delta locator into either a list of
     tables resident in the local node's memory, or into a dataset distributed
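
A hedged sketch of how the widened download_delta signature is exercised; it mirrors the call made by the new merge-on-read utilities elsewhere in this diff. Here `deltacat_storage` stands in for any module implementing deltacat.storage.interface, and `delta` for a Delta or DeltaLocator obtained from that storage layer (both are placeholders, not defined here).

from deltacat.types.media import DistributedDatasetType, StorageType, TableType

df = deltacat_storage.download_delta(
    delta_like=delta,
    table_type=TableType.PYARROW,
    storage_type=StorageType.DISTRIBUTED,
    distributed_dataset_type=DistributedDatasetType.DAFT,  # new parameter in this release
)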
@@ -205,6 +211,8 @@ def download_delta_manifest_entry(
     given delta or delta locator. If a delta is provided with a non-empty
     manifest, then the entry is downloaded from this manifest. Otherwise, the
     manifest is first retrieved then the given entry index downloaded.
+
+    NOTE: The entry will be downloaded in the current node's memory.
     """
     raise NotImplementedError("download_delta_manifest_entry not implemented")

@@ -1,24 +1,16 @@
 from enum import Enum
-from typing import List, Union, Any
+from typing import List, Union

 from pyarrow.parquet import ParquetFile
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pkg_resources
-from ray.data._internal.arrow_block import ArrowRow
 from ray.data.dataset import Dataset
+from daft import DataFrame as DaftDataFrame

 LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
 LocalDataset = List[LocalTable]
-# Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
-# and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
-ray_version = pkg_resources.parse_version(pkg_resources.get_distribution("ray").version)
-change_version = pkg_resources.parse_version("2.5.0")
-if ray_version < change_version:
-    DistributedDataset = Dataset[Union[ArrowRow, np.ndarray, Any]]
-else:
-    DistributedDataset = Dataset
+DistributedDataset = Union[Dataset, DaftDataFrame]


 class DeltaType(str, Enum):
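
With DistributedDataset now a plain Union, downstream code that needs backend-specific behavior can branch on the concrete type. A minimal sketch follows; the helper name is hypothetical, count_rows() matches the Daft API used in the new catalog tests, and count() the Ray Datasets API.

from ray.data.dataset import Dataset
from daft import DataFrame as DaftDataFrame
from deltacat.storage.model.types import DistributedDataset

def distributed_row_count(ds: DistributedDataset) -> int:
    # Daft DataFrames expose count_rows(), Ray Datasets expose count().
    if isinstance(ds, DaftDataFrame):
        return ds.count_rows()
    if isinstance(ds, Dataset):
        return ds.count()
    raise TypeError(f"Unsupported distributed dataset type: {type(ds)}")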
File without changes
@@ -0,0 +1,98 @@
+import unittest
+import sqlite3
+import ray
+import os
+import deltacat.tests.local_deltacat_storage as ds
+from deltacat.utils.common import current_time_ms
+from deltacat.tests.test_utils.pyarrow import (
+    create_delta_from_csv_file,
+    commit_delta_to_partition,
+)
+from deltacat.types.media import DistributedDatasetType, ContentType
+from deltacat.catalog import default_catalog_impl as dc
+
+
+class TestReadTable(unittest.TestCase):
+    READ_TABLE_NAMESPACE = "catalog_read_table_namespace"
+    LOCAL_CATALOG_NAME = "local_catalog"
+    DB_FILE_PATH = f"{current_time_ms()}.db"
+    SAMPLE_FILE_PATH = "deltacat/tests/catalog/data/sample_table.csv"
+
+    @classmethod
+    def setUpClass(cls):
+        ray.init(local_mode=True, ignore_reinit_error=True)
+
+        con = sqlite3.connect(cls.DB_FILE_PATH)
+        cur = con.cursor()
+        cls.kwargs = {
+            ds.SQLITE_CON_ARG: con,
+            ds.SQLITE_CUR_ARG: cur,
+            "supported_content_types": [ContentType.CSV],
+        }
+        cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
+
+        super().setUpClass()
+
+    @classmethod
+    def doClassCleanups(cls) -> None:
+        os.remove(cls.DB_FILE_PATH)
+
+    def test_daft_distributed_read_sanity(self):
+        # setup
+        READ_TABLE_TABLE_NAME = "test_read_table"
+        create_delta_from_csv_file(
+            self.READ_TABLE_NAMESPACE,
+            [self.SAMPLE_FILE_PATH],
+            table_name=READ_TABLE_TABLE_NAME,
+            **self.kwargs,
+        )
+
+        dc.initialize(ds=ds)
+        df = dc.read_table(
+            table=READ_TABLE_TABLE_NAME,
+            namespace=self.READ_TABLE_NAMESPACE,
+            catalog=self.LOCAL_CATALOG_NAME,
+            distributed_dataset_type=DistributedDatasetType.DAFT,
+            deltacat_storage_kwargs=self.kwargs,
+        )
+
+        # verify
+        self.assertEqual(df.count_rows(), 6)
+        self.assertEqual(df.column_names, ["pk", "value"])
+
+    def test_daft_distributed_read_multiple_deltas(self):
+        # setup
+        READ_TABLE_TABLE_NAME = "test_read_table_2"
+        delta = create_delta_from_csv_file(
+            self.READ_TABLE_NAMESPACE,
+            [self.SAMPLE_FILE_PATH],
+            table_name=READ_TABLE_TABLE_NAME,
+            **self.kwargs,
+        )
+
+        partition = ds.get_partition(
+            delta.stream_locator, delta.partition_values, **self.kwargs
+        )
+
+        commit_delta_to_partition(
+            partition=partition, file_paths=[self.SAMPLE_FILE_PATH], **self.kwargs
+        )
+
+        # action
+        dc.initialize(ds=ds)
+        df = dc.read_table(
+            table=READ_TABLE_TABLE_NAME,
+            namespace=self.READ_TABLE_NAMESPACE,
+            catalog=self.LOCAL_CATALOG_NAME,
+            distributed_dataset_type=DistributedDatasetType.DAFT,
+            merge_on_read=False,
+            deltacat_storage_kwargs=self.kwargs,
+        )
+
+        # verify
+        self.assertEqual(
+            df.count_rows(),
+            12,
+            "we expect twice as many rows as merge on read is disabled",
+        )
+        self.assertEqual(df.column_names, ["pk", "value"])
@@ -442,6 +442,33 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         drop_duplicates=True,
         skip_enabled_compact_partition_drivers=None,
     ),
+    "12-incremental-decimal-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+                pa.array([i for i in range(20)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
+                pa.array([3, 7, 11, 15, 19]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
 }

 REBASE_THEN_INCREMENTAL_TEST_CASES = {
@@ -1091,6 +1118,104 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
     ),
     "14-rebase-then-empty-incremental-delta": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=None,
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=3,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "15-rebase-then-incremental-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "16-rebase-then-empty-incremental-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
         primary_keys={"pk_col_1"},
         sort_keys=[
             SortKey.of(key_name="sk_col_1"),
@@ -1137,9 +1262,9 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
     ),
 }

-
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)

+
 REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
     REBASE_THEN_INCREMENTAL_TEST_CASES
 )
@@ -7,6 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Set
 from boto3.resources.base import ServiceResource
 import pyarrow as pa
 from pytest_benchmark.fixture import BenchmarkFixture
+from deltacat.types.media import StorageType

 from deltacat.tests.compute.test_util_common import (
     get_rcf,
@@ -269,7 +270,9 @@ def test_compact_partition_incremental(
         **compaction_audit_obj
     )

-    tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_compacted_table = pa.concat_tables(tables)
     sorting_cols: List[Any] = [(val, "ascending") for val in primary_keys]
     # the compacted table may contain multiple files and chunks