deltacat 0.1.18b1__py3-none-any.whl → 0.1.18b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +62 -25
  3. deltacat/compute/compactor/model/delta_annotated.py +1 -1
  4. deltacat/compute/compactor/model/materialize_result.py +16 -2
  5. deltacat/compute/compactor/model/repartition_result.py +6 -0
  6. deltacat/compute/compactor/model/round_completion_info.py +8 -0
  7. deltacat/compute/compactor/repartition_session.py +174 -0
  8. deltacat/compute/compactor/steps/materialize.py +116 -27
  9. deltacat/compute/compactor/steps/repartition.py +210 -0
  10. deltacat/compute/compactor/utils/io.py +131 -49
  11. deltacat/compute/compactor/utils/round_completion_file.py +14 -16
  12. deltacat/constants.py +2 -0
  13. deltacat/storage/interface.py +1 -1
  14. deltacat/storage/model/types.py +10 -2
  15. deltacat/tests/compactor/utils/__init__.py +0 -0
  16. deltacat/tests/compactor/utils/test_io.py +69 -0
  17. deltacat/tests/test_repartition.py +193 -0
  18. deltacat/tests/test_utils/__init__.py +0 -0
  19. deltacat/tests/test_utils/constants.py +7 -0
  20. deltacat/tests/utils/test_resources.py +36 -0
  21. deltacat/utils/ray_utils/concurrency.py +2 -0
  22. deltacat/utils/resources.py +72 -0
  23. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/METADATA +2 -5
  24. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/RECORD +28 -18
  25. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/WHEEL +1 -1
  26. /deltacat/{utils/profiling.py → tests/compactor/__init__.py} +0 -0
  27. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/LICENSE +0 -0
  28. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/repartition.py ADDED
@@ -0,0 +1,210 @@
+ import importlib
+ import logging
+ from contextlib import nullcontext
+ import pyarrow.compute as pc
+ import pyarrow as pa
+ from typing import List, Optional
+ from deltacat.types.media import StorageType, ContentType
+ import ray
+ from deltacat import logs
+ from deltacat.compute.compactor import DeltaAnnotated
+ from deltacat.compute.compactor.model.repartition_result import RepartitionResult
+ from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.storage import Partition
+ from deltacat.utils.ray_utils.runtime import (
+     get_current_ray_task_id,
+     get_current_ray_worker_id,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.storage import Delta
+ from enum import Enum
+
+ if importlib.util.find_spec("memray"):
+     import memray
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+ """
+ Similar to Spark (https://sparkbyexamples.com/spark/spark-partitioning-understanding/), partitioning
+ localizes data and reduces shuffling across network nodes; because that shuffling is a major component
+ of a transformation's cost, it also reduces the time to completion.
+ DeltaCAT with Ray can support different partitioning strategies to reduce data movement, either across the network or between compute and storage.
+ Note that the term "partition" here differs from the catalog's notion of a partition.
+ Types of partition:
+ Range partition: assigns rows to partitions based on column values falling within a given range, e.g., repartition(column="last_updated", ranges=['2023-01-01', '2023-02-01', '2023-03-01']) splits the data into 4 files.
+ Hash partition: attempts to spread the data evenly across partitions based on a key, e.g., repartition(column="last_updated", num_partitions=10) splits the data evenly into 10 files.
+ """
+
+
+ class RepartitionType(str, Enum):
+     RANGE = "range"
+     HASH = "hash"
+
+
+ def generate_unique_name(base_name: str, existing_names: List[str]):
+     counter = 1
+     while base_name + str(counter) in existing_names:
+         counter += 1
+     return base_name + str(counter)
+
+
+ def repartition_range(
+     tables: List[pa.Table],
+     destination_partition: Partition,
+     repartition_args: dict,
+     max_records_per_output_file: int,
+     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ):
+     """
+     Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
+
+     Args:
+         tables (List[pa.Table]): List of tables to be repartitioned.
+         destination_partition (Partition): The partition to store the repartitioned tables.
+         repartition_args (dict): Arguments for repartitioning. Must include a "column" key for the column to partition
+             on and a "ranges" key for a list of partition range values.
+         max_records_per_output_file (int): Maximum number of records per output file.
+         repartitioned_file_content_type (ContentType, optional): The content type of the repartitioned files.
+             Defaults to ContentType.PARQUET.
+         deltacat_storage (unimplemented): Storage implementation used to store the repartitioned data.
+
+     Raises:
+         ValueError: If no partition ranges are specified or if the column to partition on does not exist in the tables.
+
+     Returns:
+         RepartitionResult: Contains a list of the stored deltas for each partition range.
+     Examples:
+         repartition_args['ranges'] = [x, y, z]: the tables will be repartitioned into 4 files, i.e., (-inf, x], (x, y], (y, z], (z, inf)
+
+     Note:
+         The function assumes that the tables all share the same schema. If the column to partition on does not exist
+         in the tables, an error will be raised. For each partition range, a new file is created. This could result in
+         more output files than input files.
+     """
+     column: str = repartition_args["column"]
+     partition_ranges: List = repartition_args["ranges"]
+     if len(partition_ranges) == 0:
+         raise ValueError("No partition ranges specified")
+     # check if the column exists in the table
+     # TODO: design a better way to handle the case when the column does not exist in the table, e.g., backfill + repartition by stream position + file id
+     if not all(column in table.column_names for table in tables):
+         raise ValueError(f"Column {column} does not exist in the table")
+     partition_ranges.sort()
+     partition_ranges = [-float("Inf")] + partition_ranges + [float("Inf")]
+     partitioned_tables_list = [[] for _ in range(len(partition_ranges) - 1)]
+
+     total_record_count = 0
+     col_name_int64 = f"{column}_int64"
+     col_name_int64 = generate_unique_name(col_name_int64, tables[0].schema.names)
+     for table in tables:
+         total_record_count += len(table)
+         table_new = table.add_column(
+             0,
+             pa.field(col_name_int64, pa.int64()),
+             pc.cast(table[column], pa.int64()),
+         )
+         # Iterate over pairs of values in partition_ranges
+         for i, (lower_limit, upper_limit) in enumerate(
+             zip(partition_ranges[:-1], partition_ranges[1:]), start=0
+         ):
+             # Add the table filtered by the lower and upper limits to partitioned_tables_list
+             partitioned_tables_list[i].append(
+                 table_new.filter(
+                     (pc.field(col_name_int64) > pc.scalar(lower_limit))
+                     & (pc.field(col_name_int64) <= pc.scalar(upper_limit))
+                 )
+             )
+     partition_table_length = 0
+     # After re-grouping the tables by the specified ranges, concat and stage the tables in each group
+     partition_deltas: List[Delta] = []
+     for partition_tables in partitioned_tables_list:
+         if len(partition_tables) > 0:
+             partition_table: pa.Table = pa.concat_tables(partition_tables)
+             if len(partition_table) > 0:
+                 partition_table_length += len(partition_table)
+                 partition_delta: Delta = deltacat_storage.stage_delta(
+                     partition_table,
+                     destination_partition,
+                     max_records_per_entry=max_records_per_output_file,
+                     content_type=repartitioned_file_content_type,
+                 )
+                 partition_deltas.append(partition_delta)
+
+     assert (
+         partition_table_length == total_record_count
+     ), f"Repartitioned table should have the same number of records {partition_table_length} as the original table {total_record_count}"
+     return RepartitionResult(
+         range_deltas=partition_deltas,
+     )
+
+
+ def _timed_repartition(
+     annotated_delta: DeltaAnnotated,
+     destination_partition: Partition,
+     repartition_type: RepartitionType,
+     repartition_args: dict,
+     max_records_per_output_file: int,
+     enable_profiler: bool,
+     read_kwargs_provider: Optional[ReadKwargsProvider],
+     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> RepartitionResult:
+     task_id = get_current_ray_task_id()
+     worker_id = get_current_ray_worker_id()
+     with memray.Tracker(
+         f"repartition_{worker_id}_{task_id}.bin"
+     ) if enable_profiler else nullcontext():
+         tables: List[pa.Table] = deltacat_storage.download_delta(
+             annotated_delta,
+             storage_type=StorageType.LOCAL,
+             file_reader_kwargs_provider=read_kwargs_provider,
+         )
+         if repartition_type == RepartitionType.RANGE:
+             return repartition_range(
+                 tables=tables,
+                 destination_partition=destination_partition,
+                 repartition_args=repartition_args,
+                 max_records_per_output_file=max_records_per_output_file,
+                 repartitioned_file_content_type=repartitioned_file_content_type,
+                 deltacat_storage=deltacat_storage,
+             )
+         else:
+             raise NotImplementedError(
+                 f"Repartition type {repartition_type} is not supported."
+             )
+
+
+ @ray.remote
+ def repartition(
+     annotated_delta: DeltaAnnotated,
+     destination_partition: Partition,
+     repartition_type: RepartitionType,
+     repartition_args: dict,
+     max_records_per_output_file: int,
+     enable_profiler: bool,
+     metrics_config: Optional[MetricsConfig],
+     read_kwargs_provider: Optional[ReadKwargsProvider],
+     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> RepartitionResult:
+     logger.info(f"Starting repartition task...")
+     repartition_result, duration = timed_invocation(
+         func=_timed_repartition,
+         annotated_delta=annotated_delta,
+         destination_partition=destination_partition,
+         repartition_type=repartition_type,
+         repartition_args=repartition_args,
+         max_records_per_output_file=max_records_per_output_file,
+         enable_profiler=enable_profiler,
+         read_kwargs_provider=read_kwargs_provider,
+         repartitioned_file_content_type=repartitioned_file_content_type,
+         deltacat_storage=deltacat_storage,
+     )
+     if metrics_config:
+         emit_timer_metrics(
+             metrics_name="repartition", value=duration, metrics_config=metrics_config
+         )
+     return repartition_result
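
To make the range split concrete, here is a minimal, self-contained sketch of the bucketing step that repartition_range performs, using a plain pyarrow table instead of deltas downloaded from DeltaCAT storage. The table and range values are hypothetical, and it assumes a pyarrow version whose Table.filter accepts compute expressions, as the module above does.

# Minimal sketch of the range-bucketing step used by repartition_range,
# with a hypothetical in-memory table instead of downloaded deltas.
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"last_updated": [5, 15, 25, 35]})  # hypothetical data
ranges = [10, 20, 30]
bounds = [-float("Inf")] + sorted(ranges) + [float("Inf")]

buckets = []
for lower, upper in zip(bounds[:-1], bounds[1:]):
    # Each bucket keeps rows with lower < value <= upper, mirroring the
    # (-inf, x], (x, y], (y, z], (z, inf) split described in the docstring.
    buckets.append(
        table.filter(
            (pc.field("last_updated") > pc.scalar(lower))
            & (pc.field("last_updated") <= pc.scalar(upper))
        )
    )

# No rows are lost or duplicated across buckets.
assert sum(len(b) for b in buckets) == len(table)
print([len(b) for b in buckets])  # -> [1, 1, 1, 1]
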
deltacat/compute/compactor/utils/io.py CHANGED
@@ -1,7 +1,11 @@
  import logging
  import math
  from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+ from deltacat.constants import (
+     PYARROW_INFLATION_MULTIPLIER,
+     BYTES_PER_MEBIBYTE,
+     MEMORY_TO_HASH_BUCKET_COUNT_RATIO,
+ )

  from deltacat.storage import (
      PartitionLocator,
@@ -84,54 +88,6 @@ def discover_deltas(
      return input_deltas, previous_last_stream_position_compacted


- def _discover_deltas(
-     source_partition_locator: PartitionLocator,
-     start_position_exclusive: Optional[int],
-     end_position_inclusive: int,
-     deltacat_storage=unimplemented_deltacat_storage,
-     **kwargs,
- ) -> List[Delta]:
-     stream_locator = source_partition_locator.stream_locator
-     namespace = stream_locator.namespace
-     table_name = stream_locator.table_name
-     table_version = stream_locator.table_version
-     partition_values = source_partition_locator.partition_values
-     deltas_list_result = deltacat_storage.list_deltas(
-         namespace=namespace,
-         table_name=table_name,
-         partition_values=partition_values,
-         table_version=table_version,
-         first_stream_position=start_position_exclusive,
-         last_stream_position=end_position_inclusive,
-         ascending_order=True,
-         include_manifest=True,
-         **kwargs,
-     )
-     deltas = deltas_list_result.all_items()
-     if not deltas:
-         raise RuntimeError(
-             f"Unexpected Error: Couldn't find any deltas to "
-             f"compact in delta stream position range "
-             f"('{start_position_exclusive}', "
-             f"'{end_position_inclusive}']. Source partition: "
-             f"{source_partition_locator}"
-         )
-     if start_position_exclusive == deltas[0].stream_position:
-         first_delta = deltas.pop(0)
-         logger.info(
-             f"Removed exclusive start delta w/ expected stream "
-             f"position '{start_position_exclusive}' from deltas to "
-             f"compact: {first_delta}"
-         )
-     logger.info(
-         f"Count of deltas to compact in delta stream "
-         f"position range ('{start_position_exclusive}', "
-         f"'{end_position_inclusive}']: {len(deltas)}. Source "
-         f"partition: '{source_partition_locator}'"
-     )
-     return deltas
-
-
  def limit_input_deltas(
      input_deltas: List[Delta],
      cluster_resources: Dict[str, float],
@@ -285,3 +241,129 @@ def limit_input_deltas(
      logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")

      return rebatched_da_list, hash_bucket_count, high_watermark, require_multiple_rounds
+
+
+ def fit_input_deltas(
+     input_deltas: List[Delta],
+     cluster_resources: Dict[str, float],
+     hash_bucket_count: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
+     """
+     This method tries to fit all the input deltas into the existing cluster. Contrary to
+     'limit_input_deltas', it will not fail if the current cluster cannot run the compaction reliably.
+     It is the responsibility of the caller to ensure that they pass enough resources for the job to execute.
+
+     Note: There is a possibility that an individual file could be very large, which makes the deltas non-uniform.
+     In such scenarios, it is advisable to allocate multiple vCPUs to the tasks to ensure parallelism.
+
+     Args:
+         input_deltas: The input deltas to be normalized.
+         cluster_resources: Total available resources in the cluster.
+         hash_bucket_count: The hash bucket count.
+         deltacat_storage: An implementation of the DeltaCAT storage interface.
+
+     Returns:
+         Tuple of a list of annotated deltas, the recommended hash bucket count, the high watermark,
+         and whether multiple rounds are required (which is always False).
+     """
+     worker_cpus = int(cluster_resources["CPU"])
+     total_memory = float(cluster_resources["memory"])
+     high_watermark = HighWatermark()
+     annotated_input_da_list = []
+     delta_bytes = 0
+     total_files = 0
+
+     if not input_deltas:
+         raise AssertionError("No input deltas found!")
+
+     for delta in input_deltas:
+         manifest_entries = delta.manifest.entries
+         position = delta.stream_position
+
+         for entry in manifest_entries:
+             delta_bytes += entry.meta.content_length
+
+         total_files += len(manifest_entries)
+
+         high_watermark.set(
+             delta.locator.partition_locator,
+             max(position, high_watermark.get(delta.locator.partition_locator)),
+         )
+         delta_annotated = DeltaAnnotated.of(delta)
+         annotated_input_da_list.append(delta_annotated)
+
+     # We assume that the cluster is capable of distributing all tasks
+     # correctly. Hence, the estimated in-memory size is proportional to
+     # the on-disk size.
+     def estimate_size(content_length):
+         return (content_length * 1.0 / delta_bytes) * total_memory
+
+     # Assuming each CPU consumes an equal amount of memory
+     min_delta_bytes = total_memory / worker_cpus
+     rebatched_da_list = DeltaAnnotated.rebatch(
+         annotated_deltas=annotated_input_da_list,
+         min_delta_bytes=min_delta_bytes,
+         estimation_function=estimate_size,
+     )
+
+     # Recommended hash buckets based on the experiments performed
+     # using S3 input for optimal throughput.
+     if hash_bucket_count is None:
+         hash_bucket_count = int(
+             math.ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO)
+         )
+
+     logger.info(
+         f"Input delta bytes: {delta_bytes}, Total files: {total_files}, The worker_cpus: {worker_cpus}, "
+         f" total_memory: {total_memory}, and hash_bucket_count: {hash_bucket_count}"
+     )
+     return rebatched_da_list, hash_bucket_count, high_watermark, False
+
+
+ def _discover_deltas(
+     source_partition_locator: PartitionLocator,
+     start_position_exclusive: Optional[int],
+     end_position_inclusive: int,
+     deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
+ ) -> List[Delta]:
+     stream_locator = source_partition_locator.stream_locator
+     namespace = stream_locator.namespace
+     table_name = stream_locator.table_name
+     table_version = stream_locator.table_version
+     partition_values = source_partition_locator.partition_values
+     deltas_list_result = deltacat_storage.list_deltas(
+         namespace=namespace,
+         table_name=table_name,
+         partition_values=partition_values,
+         table_version=table_version,
+         first_stream_position=start_position_exclusive,
+         last_stream_position=end_position_inclusive,
+         ascending_order=True,
+         include_manifest=True,
+         **kwargs,
+     )
+     deltas = deltas_list_result.all_items()
+     if not deltas:
+         raise RuntimeError(
+             f"Unexpected Error: Couldn't find any deltas to "
+             f"compact in delta stream position range "
+             f"('{start_position_exclusive}', "
+             f"'{end_position_inclusive}']. Source partition: "
+             f"{source_partition_locator}"
+         )
+     if start_position_exclusive == deltas[0].stream_position:
+         first_delta = deltas.pop(0)
+         logger.info(
+             f"Removed exclusive start delta w/ expected stream "
+             f"position '{start_position_exclusive}' from deltas to "
+             f"compact: {first_delta}"
+         )
+     logger.info(
+         f"Count of deltas to compact in delta stream "
+         f"position range ('{start_position_exclusive}', "
+         f"'{end_position_inclusive}']: {len(deltas)}. Source "
+         f"partition: '{source_partition_locator}'"
+     )
+     return deltas
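
As a worked illustration of the sizing logic in fit_input_deltas above, the sketch below reproduces the proportional memory estimate, the per-task minimum delta size, and the hash bucket count formula using hypothetical cluster numbers; the two constants mirror the definitions in deltacat.constants.

# Worked sketch of the fit_input_deltas sizing math with hypothetical inputs.
import math

BYTES_PER_TEBIBYTE = 2**40
MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE  # as added in deltacat/constants.py

total_memory = 1 * BYTES_PER_TEBIBYTE   # hypothetical total cluster memory
worker_cpus = 64                        # hypothetical vCPU count
delta_bytes = 200 * 2**30               # hypothetical total on-disk bytes across all manifest entries

def estimate_size(content_length: int) -> float:
    # In-memory estimate: this entry's share of total on-disk bytes, scaled to total cluster memory.
    return (content_length / delta_bytes) * total_memory

min_delta_bytes = total_memory / worker_cpus  # target estimated size per rebatched delta (memory per CPU)
hash_bucket_count = int(math.ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO))

print(estimate_size(2**30))  # ~5.12 GiB estimated in-memory size for a 1 GiB file
print(min_delta_bytes)       # 17179869184.0 bytes (16 GiB per task)
print(hash_bucket_count)     # ceil(1 / 0.0512) = 20
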
deltacat/compute/compactor/utils/round_completion_file.py CHANGED
@@ -4,6 +4,8 @@ import logging
  from deltacat import logs
  from deltacat.compute.compactor import RoundCompletionInfo
  from deltacat.storage import PartitionLocator
+ from deltacat.aws import s3u as s3_utils
+ from typing import Optional

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -20,8 +22,6 @@ def read_round_completion_file(
      bucket: str, source_partition_locator: PartitionLocator
  ) -> RoundCompletionInfo:

-     from deltacat.aws import s3u as s3_utils
-
      round_completion_file_url = get_round_completion_file_s3_url(
          bucket,
          source_partition_locator,
@@ -37,21 +37,19 @@ def read_round_completion_file(


  def write_round_completion_file(
-     bucket: str,
-     source_partition_locator: PartitionLocator,
+     bucket: Optional[str],
+     source_partition_locator: Optional[PartitionLocator],
      round_completion_info: RoundCompletionInfo,
+     completion_file_s3_url: str = None,
  ) -> str:

-     from deltacat.aws import s3u as s3_utils
-
      logger.info(f"writing round completion file contents: {round_completion_info}")
-     round_completion_file_s3_url = get_round_completion_file_s3_url(
-         bucket,
-         source_partition_locator,
-     )
-     logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
-     s3_utils.upload(
-         round_completion_file_s3_url, str(json.dumps(round_completion_info))
-     )
-     logger.info(f"round completion file written to: {round_completion_file_s3_url}")
-     return round_completion_file_s3_url
+     if completion_file_s3_url is None:
+         completion_file_s3_url = get_round_completion_file_s3_url(
+             bucket,
+             source_partition_locator,
+         )
+     logger.info(f"writing round completion file to: {completion_file_s3_url}")
+     s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+     logger.info(f"round completion file written to: {completion_file_s3_url}")
+     return completion_file_s3_url
deltacat/constants.py CHANGED
@@ -52,3 +52,5 @@ PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6
  PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG = {
      "retries": {"max_attempts": 25, "mode": "standard"}
  }
+
+ MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
deltacat/storage/interface.py CHANGED
@@ -443,7 +443,7 @@ def get_partition(


  def stage_delta(
-     data: Union[LocalTable, LocalDataset, DistributedDataset],
+     data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
      partition: Partition,
      delta_type: DeltaType = DeltaType.UPSERT,
      max_records_per_entry: Optional[int] = None,
deltacat/storage/model/types.py CHANGED
@@ -1,15 +1,23 @@
  from enum import Enum
- from typing import Any, List, Union
+ from typing import List, Union, Any

  import numpy as np
  import pandas as pd
  import pyarrow as pa
+ import pkg_resources
  from ray.data._internal.arrow_block import ArrowRow
  from ray.data.dataset import Dataset

  LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray]
  LocalDataset = List[LocalTable]
- DistributedDataset = Dataset[Union[ArrowRow, np.ndarray, Any]]
+ # Starting with Ray 2.5.0, Dataset follows strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode)
+ # and the generic annotation is removed, so a version check determines whether to use the old or the new definition.
+ ray_version = pkg_resources.parse_version(pkg_resources.get_distribution("ray").version)
+ change_version = pkg_resources.parse_version("2.5.0")
+ if ray_version < change_version:
+     DistributedDataset = Dataset[Union[ArrowRow, np.ndarray, Any]]
+ else:
+     DistributedDataset = Dataset


  class DeltaType(str, Enum):
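
A small standalone sketch of the version-comparison pattern behind the DistributedDataset gate above; it evaluates sample version strings rather than the installed Ray distribution, so it runs without Ray present.

# Sketch of the pkg_resources version gate, evaluated on sample version strings.
import pkg_resources

change_version = pkg_resources.parse_version("2.5.0")
for installed in ["2.4.0", "2.5.0", "2.6.3"]:
    uses_generic = pkg_resources.parse_version(installed) < change_version
    # Pre-2.5.0: Dataset[Union[ArrowRow, np.ndarray, Any]]; 2.5.0+: plain Dataset.
    print(installed, "generic Dataset[...]" if uses_generic else "plain Dataset")
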
deltacat/tests/compactor/utils/test_io.py ADDED
@@ -0,0 +1,69 @@
+ import unittest
+ from unittest import mock
+ from deltacat.tests.test_utils.constants import TEST_DELTA
+
+
+ class TestFitInputDeltas(unittest.TestCase):
+     @classmethod
+     def setUpClass(cls):
+         cls.module_patcher = mock.patch.dict("sys.modules", {"ray": mock.MagicMock()})
+         cls.module_patcher.start()
+
+         super().setUpClass()
+
+     def test_sanity(self):
+         from deltacat.compute.compactor.utils import io
+
+         (
+             delta_list,
+             hash_bucket_count,
+             high_watermark,
+             require_multiple_rounds,
+         ) = io.fit_input_deltas([TEST_DELTA], {"CPU": 1, "memory": 20000000}, None)
+
+         self.assertIsNotNone(hash_bucket_count)
+         self.assertTrue(1, len(delta_list))
+         self.assertIsNotNone(high_watermark)
+         self.assertFalse(require_multiple_rounds)
+
+     def test_when_hash_bucket_count_overridden(self):
+         from deltacat.compute.compactor.utils import io
+
+         (
+             delta_list,
+             hash_bucket_count,
+             high_watermark,
+             require_multiple_rounds,
+         ) = io.fit_input_deltas([TEST_DELTA], {"CPU": 1, "memory": 20000000}, 20)
+
+         self.assertEqual(20, hash_bucket_count)
+         self.assertEqual(1, len(delta_list))
+         self.assertIsNotNone(high_watermark)
+         self.assertFalse(require_multiple_rounds)
+
+     def test_when_not_enough_memory_splits_manifest_entries(self):
+         from deltacat.compute.compactor.utils import io
+
+         (
+             delta_list,
+             hash_bucket_count,
+             high_watermark,
+             require_multiple_rounds,
+         ) = io.fit_input_deltas([TEST_DELTA], {"CPU": 2, "memory": 10}, 20)
+
+         self.assertIsNotNone(hash_bucket_count)
+         self.assertTrue(2, len(delta_list))
+         self.assertIsNotNone(high_watermark)
+         self.assertFalse(require_multiple_rounds)
+
+     def test_when_no_input_deltas(self):
+         from deltacat.compute.compactor.utils import io
+
+         with self.assertRaises(AssertionError):
+             io.fit_input_deltas([], {"CPU": 100, "memory": 20000.0}, None)
+
+     def test_when_cpu_resources_is_not_passed(self):
+         from deltacat.compute.compactor.utils import io
+
+         with self.assertRaises(KeyError):
+             io.fit_input_deltas([], {}, None)
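
The setUpClass above stubs out ray through sys.modules before the compactor io module is imported inside each test. A minimal standalone sketch of that patching pattern, using a hypothetical module name, looks like this:

# Sketch of stubbing a heavy dependency via sys.modules before import,
# mirroring the setUpClass pattern above; "some_heavy_dependency" is hypothetical.
import unittest
from unittest import mock


class TestWithStubbedDependency(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Any subsequent "import some_heavy_dependency" resolves to this MagicMock.
        cls.module_patcher = mock.patch.dict(
            "sys.modules", {"some_heavy_dependency": mock.MagicMock()}
        )
        cls.module_patcher.start()
        super().setUpClass()

    @classmethod
    def tearDownClass(cls):
        # Restore sys.modules so other tests see the real import machinery.
        cls.module_patcher.stop()
        super().tearDownClass()

    def test_import_uses_stub(self):
        import some_heavy_dependency

        self.assertIsInstance(some_heavy_dependency, mock.MagicMock)


if __name__ == "__main__":
    unittest.main()
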