deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (80)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +2 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +16 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  20. deltacat/compute/compactor_v2/constants.py +34 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  30. deltacat/compute/compactor_v2/utils/io.py +149 -0
  31. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  32. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  33. deltacat/compute/metastats/meta_stats.py +4 -2
  34. deltacat/compute/metastats/stats.py +1 -0
  35. deltacat/compute/metastats/utils/io.py +4 -0
  36. deltacat/compute/stats/utils/io.py +20 -5
  37. deltacat/exceptions.py +4 -0
  38. deltacat/io/memcached_object_store.py +37 -14
  39. deltacat/logs.py +4 -3
  40. deltacat/storage/interface.py +8 -1
  41. deltacat/storage/model/types.py +2 -1
  42. deltacat/tests/aws/test_clients.py +16 -3
  43. deltacat/tests/compute/__init__.py +0 -0
  44. deltacat/tests/compute/common.py +96 -0
  45. deltacat/tests/compute/compactor/__init__.py +0 -0
  46. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  47. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  48. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  49. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  50. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  51. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  52. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  53. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  54. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  55. deltacat/tests/compute/testcases.py +390 -0
  56. deltacat/tests/io/test_memcached_object_store.py +5 -4
  57. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  58. deltacat/tests/test_utils/pyarrow.py +32 -0
  59. deltacat/tests/test_utils/utils.py +13 -0
  60. deltacat/tests/utils/data/__init__.py +0 -0
  61. deltacat/tests/utils/test_daft.py +76 -0
  62. deltacat/tests/utils/test_pyarrow.py +133 -0
  63. deltacat/tests/utils/test_resources.py +23 -20
  64. deltacat/types/media.py +1 -0
  65. deltacat/types/partial_download.py +82 -0
  66. deltacat/types/tables.py +1 -0
  67. deltacat/utils/arguments.py +26 -0
  68. deltacat/utils/daft.py +87 -0
  69. deltacat/utils/placement.py +20 -3
  70. deltacat/utils/pyarrow.py +213 -1
  71. deltacat/utils/ray_utils/concurrency.py +26 -1
  72. deltacat/utils/resources.py +72 -1
  73. deltacat/utils/s3fs.py +21 -0
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
  76. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  77. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  78. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  80. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ from typing import Dict, List, Optional, Any
+ from deltacat.utils.metrics import MetricsConfig
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.compute.compactor import DeltaAnnotated
+
+
+ class HashBucketInput(Dict):
+     @staticmethod
+     def of(
+         annotated_delta: DeltaAnnotated,
+         primary_keys: List[str],
+         num_hash_buckets: int,
+         num_hash_groups: int,
+         enable_profiler: Optional[bool] = False,
+         metrics_config: Optional[MetricsConfig] = None,
+         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+         object_store: Optional[IObjectStore] = None,
+         deltacat_storage=unimplemented_deltacat_storage,
+         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> HashBucketInput:
+
+         result = HashBucketInput()
+         result["annotated_delta"] = annotated_delta
+         result["primary_keys"] = primary_keys
+         result["num_hash_buckets"] = num_hash_buckets
+         result["num_hash_groups"] = num_hash_groups
+         result["enable_profiler"] = enable_profiler
+         result["metrics_config"] = metrics_config
+         result["read_kwargs_provider"] = read_kwargs_provider
+         result["object_store"] = object_store
+         result["deltacat_storage"] = deltacat_storage
+         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+
+         return result
+
+     @property
+     def annotated_delta(self) -> DeltaAnnotated:
+         return self["annotated_delta"]
+
+     @property
+     def primary_keys(self) -> List[str]:
+         return self["primary_keys"]
+
+     @property
+     def num_hash_buckets(self) -> int:
+         return self["num_hash_buckets"]
+
+     @property
+     def num_hash_groups(self) -> int:
+         return self["num_hash_groups"]
+
+     @property
+     def enable_profiler(self) -> Optional[bool]:
+         return self.get("enable_profiler")
+
+     @property
+     def metrics_config(self) -> Optional[MetricsConfig]:
+         return self.get("metrics_config")
+
+     @property
+     def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+         return self.get("read_kwargs_provider")
+
+     @property
+     def object_store(self) -> Optional[IObjectStore]:
+         return self.get("object_store")
+
+     @property
+     def deltacat_storage(self) -> unimplemented_deltacat_storage:
+         return self.get("deltacat_storage")
+
+     @property
+     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
+         return self.get("deltacat_storage_kwargs")
@@ -0,0 +1,12 @@
+ from typing import NamedTuple
+
+ import numpy as np
+
+
+ class HashBucketResult(NamedTuple):
+     hash_bucket_group_to_obj_id_tuple: np.ndarray
+     hb_size_bytes: np.int64
+     hb_record_count: np.int64
+     peak_memory_usage_bytes: np.double
+     telemetry_time_in_seconds: np.double
+     task_completed_at: np.double
@@ -0,0 +1,127 @@
+ from __future__ import annotations
+
+ from ray.types import ObjectRef
+ from typing import Dict, List, Optional, Any
+ from deltacat.utils.metrics import MetricsConfig
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.storage import (
+     Partition,
+     SortKey,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.types.media import ContentType
+ from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+
+
+ class MergeInput(Dict):
+     @staticmethod
+     def of(
+         dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
+         write_to_partition: Partition,
+         compacted_file_content_type: ContentType,
+         primary_keys: List[str],
+         hash_group_index: int,
+         num_hash_groups: int,
+         sort_keys: Optional[List[SortKey]] = None,
+         merge_task_index: Optional[int] = 0,
+         max_records_per_output_file: Optional[int] = 4_000_000,
+         enable_profiler: Optional[bool] = False,
+         metrics_config: Optional[MetricsConfig] = None,
+         s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+         round_completion_info: Optional[RoundCompletionInfo] = None,
+         object_store: Optional[IObjectStore] = None,
+         deltacat_storage=unimplemented_deltacat_storage,
+         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> MergeInput:
+
+         result = MergeInput()
+         result["dfe_groups_refs"] = dfe_groups_refs
+         result["write_to_partition"] = write_to_partition
+         result["compacted_file_content_type"] = compacted_file_content_type
+         result["primary_keys"] = primary_keys
+         result["hash_group_index"] = hash_group_index
+         result["num_hash_groups"] = num_hash_groups
+         result["sort_keys"] = sort_keys
+         result["merge_task_index"] = merge_task_index
+         result["max_records_per_output_file"] = max_records_per_output_file
+         result["enable_profiler"] = enable_profiler
+         result["metrics_config"] = metrics_config
+         result["s3_table_writer_kwargs"] = s3_table_writer_kwargs or {}
+         result["read_kwargs_provider"] = read_kwargs_provider
+         result["round_completion_info"] = round_completion_info
+         result["object_store"] = object_store
+         result["deltacat_storage"] = deltacat_storage
+         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+
+         return result
+
+     @property
+     def dfe_groups_refs(self) -> List[ObjectRef[DeltaFileEnvelopeGroups]]:
+         return self["dfe_groups_refs"]
+
+     @property
+     def write_to_partition(self) -> Partition:
+         return self["write_to_partition"]
+
+     @property
+     def compacted_file_content_type(self) -> ContentType:
+         return self["compacted_file_content_type"]
+
+     @property
+     def primary_keys(self) -> List[str]:
+         return self["primary_keys"]
+
+     @property
+     def hash_group_index(self) -> int:
+         return self["hash_group_index"]
+
+     @property
+     def num_hash_groups(self) -> int:
+         return self["num_hash_groups"]
+
+     @property
+     def sort_keys(self) -> Optional[List[SortKey]]:
+         return self.get("sort_keys")
+
+     @property
+     def merge_task_index(self) -> int:
+         return self.get("merge_task_index")
+
+     @property
+     def max_records_per_output_file(self) -> int:
+         return self.get("max_records_per_output_file")
+
+     @property
+     def enable_profiler(self) -> bool:
+         return self.get("enable_profiler")
+
+     @property
+     def metrics_config(self) -> Optional[MetricsConfig]:
+         return self.get("metrics_config")
+
+     @property
+     def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
+         return self.get("s3_table_writer_kwargs")
+
+     @property
+     def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+         return self.get("read_kwargs_provider")
+
+     @property
+     def round_completion_info(self) -> Optional[RoundCompletionInfo]:
+         return self.get("round_completion_info")
+
+     @property
+     def object_store(self) -> Optional[IObjectStore]:
+         return self.get("object_store")
+
+     @property
+     def deltacat_storage(self) -> unimplemented_deltacat_storage:
+         return self["deltacat_storage"]
+
+     @property
+     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
+         return self.get("deltacat_storage_kwargs")
@@ -0,0 +1,12 @@
+ from typing import NamedTuple, List
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+
+ import numpy as np
+
+
+ class MergeResult(NamedTuple):
+     materialize_results: List[MaterializeResult]
+     deduped_record_count: np.int64
+     peak_memory_usage_bytes: np.double
+     telemetry_time_in_seconds: np.double
+     task_completed_at: np.double
File without changes
@@ -0,0 +1,203 @@
+ import importlib
+ import logging
+ import time
+ from contextlib import nullcontext
+ from typing import List, Optional, Tuple
+ from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+ import numpy as np
+ import pyarrow as pa
+ import ray
+ from deltacat import logs
+ from deltacat.compute.compactor import (
+     DeltaAnnotated,
+     DeltaFileEnvelope,
+ )
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+ from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+ from deltacat.compute.compactor_v2.utils.primary_key_index import (
+     group_hash_bucket_indices,
+     group_by_pk_hash_bucket,
+ )
+ from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.types.media import StorageType
+ from deltacat.utils.ray_utils.runtime import (
+     get_current_ray_task_id,
+     get_current_ray_worker_id,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.metrics import emit_timer_metrics
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+
+ if importlib.util.find_spec("memray"):
+     import memray
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _read_delta_file_envelopes(
+     annotated_delta: DeltaAnnotated,
+     read_kwargs_provider: Optional[ReadKwargsProvider],
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[dict] = None,
+ ) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
+
+     tables = deltacat_storage.download_delta(
+         annotated_delta,
+         max_parallelism=1,
+         file_reader_kwargs_provider=read_kwargs_provider,
+         storage_type=StorageType.LOCAL,
+         **deltacat_storage_kwargs,
+     )
+     annotations = annotated_delta.annotations
+     assert len(tables) == len(annotations), (
+         f"Unexpected Error: Length of downloaded delta manifest tables "
+         f"({len(tables)}) doesn't match the length of delta manifest "
+         f"annotations ({len(annotations)})."
+     )
+     if not tables:
+         return None, 0, 0
+
+     delta_stream_position = annotations[0].annotation_stream_position
+     delta_type = annotations[0].annotation_delta_type
+
+     for annotation in annotations:
+         assert annotation.annotation_stream_position == delta_stream_position, (
+             f"Annotation stream position does not match - {annotation.annotation_stream_position} "
+             f"!= {delta_stream_position}"
+         )
+         assert annotation.annotation_delta_type == delta_type, (
+             f"Annotation delta type does not match - {annotation.annotation_delta_type} "
+             f"!= {delta_type}"
+         )
+
+     delta_file_envelopes = []
+     table = pa.concat_tables(tables)
+     total_record_count = len(table)
+     total_size_bytes = int(table.nbytes)
+
+     delta_file = DeltaFileEnvelope.of(
+         stream_position=delta_stream_position,
+         delta_type=delta_type,
+         table=table,
+     )
+     delta_file_envelopes.append(delta_file)
+     return delta_file_envelopes, total_record_count, total_size_bytes
+
+
+ def _group_file_records_by_pk_hash_bucket(
+     annotated_delta: DeltaAnnotated,
+     num_hash_buckets: int,
+     primary_keys: List[str],
+     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[dict] = None,
+ ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
+     # read input parquet s3 objects into a list of delta file envelopes
+     (
+         delta_file_envelopes,
+         total_record_count,
+         total_size_bytes,
+     ) = _read_delta_file_envelopes(
+         annotated_delta,
+         read_kwargs_provider,
+         deltacat_storage,
+         deltacat_storage_kwargs,
+     )
+
+     if delta_file_envelopes is None:
+         return None, 0, 0
+
+     logger.info(f"Read all delta file envelopes: {len(delta_file_envelopes)}")
+
+     # group the data by primary key hash value
+     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
+     for dfe in delta_file_envelopes:
+         logger.info("Grouping by pk hash bucket")
+         start = time.monotonic()
+         hash_bucket_to_table = group_by_pk_hash_bucket(
+             dfe.table,
+             num_hash_buckets,
+             primary_keys,
+         )
+         group_end = time.monotonic()
+         logger.info(f"Grouping took: {group_end - start}")
+         for hb, table in enumerate(hash_bucket_to_table):
+             if table:
+                 if hb_to_delta_file_envelopes[hb] is None:
+                     hb_to_delta_file_envelopes[hb] = []
+                 hb_to_delta_file_envelopes[hb].append(
+                     DeltaFileEnvelope.of(
+                         stream_position=dfe.stream_position,
+                         file_index=dfe.file_index,
+                         delta_type=dfe.delta_type,
+                         table=table,
+                     )
+                 )
+     return hb_to_delta_file_envelopes, total_record_count, total_size_bytes
+
+
+ def _timed_hash_bucket(input: HashBucketInput) -> HashBucketResult:
+     task_id = get_current_ray_task_id()
+     worker_id = get_current_ray_worker_id()
+     with memray.Tracker(
+         f"hash_bucket_{worker_id}_{task_id}.bin"
+     ) if input.enable_profiler else nullcontext():
+         (
+             delta_file_envelope_groups,
+             total_record_count,
+             total_size_bytes,
+         ) = _group_file_records_by_pk_hash_bucket(
+             annotated_delta=input.annotated_delta,
+             num_hash_buckets=input.num_hash_buckets,
+             primary_keys=input.primary_keys,
+             read_kwargs_provider=input.read_kwargs_provider,
+             deltacat_storage=input.deltacat_storage,
+             deltacat_storage_kwargs=input.deltacat_storage_kwargs,
+         )
+         hash_bucket_group_to_obj_id_tuple = group_hash_bucket_indices(
+             hash_bucket_object_groups=delta_file_envelope_groups,
+             num_buckets=input.num_hash_buckets,
+             num_groups=input.num_hash_groups,
+             object_store=input.object_store,
+         )
+
+         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+         return HashBucketResult(
+             hash_bucket_group_to_obj_id_tuple,
+             np.int64(total_size_bytes),
+             np.int64(total_record_count),
+             np.double(peak_memory_usage_bytes),
+             np.double(0.0),
+             np.double(time.time()),
+         )
+
+
+ @ray.remote
+ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
+
+     logger.info("Starting hash bucket task...")
+     hash_bucket_result, duration = timed_invocation(
+         func=_timed_hash_bucket, input=input
+     )
+
+     emit_metrics_time = 0.0
+     if input.metrics_config:
+         emit_result, latency = timed_invocation(
+             func=emit_timer_metrics,
+             metrics_name="hash_bucket",
+             value=duration,
+             metrics_config=input.metrics_config,
+         )
+         emit_metrics_time = latency
+
+     logger.info("Finished hash bucket task...")
+     return HashBucketResult(
+         hash_bucket_result[0],
+         hash_bucket_result[1],
+         hash_bucket_result[2],
+         hash_bucket_result[3],
+         np.double(emit_metrics_time),
+         hash_bucket_result[5],
+     )
@@ -0,0 +1,41 @@
+ import logging
+ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+ import numpy as np
+ import ray
+ from deltacat import logs
+ from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.metrics import emit_timer_metrics
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _timed_merge(input: MergeInput) -> MergeResult:
+     # TODO: Implementation goes here
+     pass
+
+
+ @ray.remote
+ def merge(input: MergeInput) -> MergeResult:
+
+     logger.info("Starting merge task...")
+     merge_result, duration = timed_invocation(func=_timed_merge, input=input)
+
+     emit_metrics_time = 0.0
+     if input.metrics_config:
+         emit_result, latency = timed_invocation(
+             func=emit_timer_metrics,
+             metrics_name="merge",
+             value=duration,
+             metrics_config=input.metrics_config,
+         )
+         emit_metrics_time = latency
+
+     logger.info("Finished merge task...")
+     return MergeResult(
+         merge_result[0],
+         merge_result[1],
+         merge_result[2],
+         np.double(emit_metrics_time),
+         merge_result[4],
+     )
File without changes
@@ -0,0 +1,37 @@
+ from deltacat.storage import (
+     Delta,
+     interface as unimplemented_deltacat_storage,
+ )
+ from typing import Dict, Optional, Any
+ from deltacat.types.media import TableType
+ from deltacat.types.media import ContentType
+ from deltacat.types.partial_download import PartialParquetParameters
+
+
+ def append_content_type_params(
+     delta: Delta,
+     entry_index: int,
+     deltacat_storage: unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+ ):
+
+     entry = delta.manifest.entries[entry_index]
+
+     if entry.meta.content_type == ContentType.PARQUET:
+         # Set partial download content type parameters to allow
+         # row group level rebatching for parquet.
+         pq_file = deltacat_storage.download_delta_manifest_entry(
+             delta.locator,
+             entry_index=entry_index,
+             table_type=TableType.PYARROW_PARQUET,
+             **deltacat_storage_kwargs,
+         )
+
+         if not entry.meta.content_type_parameters:
+             entry.meta.content_type_parameters = []
+
+         entry.meta.content_type_parameters.append(
+             PartialParquetParameters.of(pq_metadata=pq_file.metadata)
+         )
+
+     return entry
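
The net effect is that each parquet manifest entry carries its row-group metadata forward, so later rebatching can split work below file granularity. A rough standalone illustration of the metadata being captured, using only pyarrow (the path and schema are made up):

import pyarrow as pa
import pyarrow.parquet as pq
from deltacat.types.partial_download import PartialParquetParameters

# Write a small parquet file, then read back just its footer metadata;
# PartialParquetParameters is built from this same FileMetaData object.
pq.write_table(pa.table({"customer_id": [1, 2, 3]}), "/tmp/example.parquet")
metadata = pq.read_metadata("/tmp/example.parquet")
params = PartialParquetParameters.of(pq_metadata=metadata)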
@@ -0,0 +1,149 @@
+ import logging
+ import functools
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+ from deltacat.storage import (
+     PartitionLocator,
+     Delta,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat import logs
+ from deltacat.compute.compactor.utils import io as io_v1
+ from deltacat.compute.compactor import DeltaAnnotated
+ from typing import Dict, List, Optional, Any
+ from deltacat.compute.compactor_v2.constants import (
+     MIN_FILES_IN_BATCH,
+     MIN_DELTA_BYTES_IN_BATCH,
+ )
+ from deltacat.compute.compactor.model.compaction_session_audit_info import (
+     CompactionSessionAuditInfo,
+ )
+ from deltacat.compute.compactor_v2.utils.task_options import (
+     estimate_manifest_entry_size_bytes,
+ )
+ from deltacat.compute.compactor_v2.utils.content_type_params import (
+     append_content_type_params,
+ )
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def discover_deltas(
+     source_partition_locator: PartitionLocator,
+     last_stream_position_to_compact: int,
+     rebase_source_partition_locator: Optional[PartitionLocator] = None,
+     rebase_source_partition_high_watermark: Optional[int] = None,
+     rcf_high_watermark: Optional[int] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+     list_deltas_kwargs: Optional[Dict[str, Any]] = {},
+ ) -> List[Delta]:
+
+     previous_compacted_high_watermark = (
+         rebase_source_partition_high_watermark or rcf_high_watermark
+     )
+
+     delta_source_partition_locator = (
+         rebase_source_partition_locator or source_partition_locator
+     )
+
+     result = []
+
+     delta_source_incremental_deltas = io_v1._discover_deltas(
+         delta_source_partition_locator,
+         previous_compacted_high_watermark,
+         last_stream_position_to_compact,
+         deltacat_storage,
+         deltacat_storage_kwargs,
+         list_deltas_kwargs,
+     )
+
+     result.extend(delta_source_incremental_deltas)
+
+     logger.info(
+         f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
+         f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
+     )
+
+     if rebase_source_partition_locator:
+         previous_compacted_deltas = io_v1._discover_deltas(
+             source_partition_locator,
+             None,
+             None,
+             deltacat_storage,
+             deltacat_storage_kwargs,
+             list_deltas_kwargs,
+         )
+
+         result.extend(previous_compacted_deltas)
+
+         logger.info(
+             f"Length of input deltas from previous compacted table is {len(previous_compacted_deltas)}"
+             f" from ({None}, {None}]"
+         )
+
+     return result
+
+
+ def create_uniform_input_deltas(
+     input_deltas: List[Delta],
+     hash_bucket_count: int,
+     compaction_audit: CompactionSessionAuditInfo,
+     min_delta_bytes: Optional[float] = MIN_DELTA_BYTES_IN_BATCH,
+     min_file_counts: Optional[float] = MIN_FILES_IN_BATCH,
+     previous_inflation: Optional[float] = PYARROW_INFLATION_MULTIPLIER,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+ ) -> List[DeltaAnnotated]:
+
+     delta_bytes = 0
+     delta_manifest_entries = 0
+     estimated_da_bytes = 0
+     input_da_list = []
+
+     for delta in input_deltas:
+         manifest_entries = delta.manifest.entries
+         delta_manifest_entries += len(manifest_entries)
+         for entry_index in range(len(manifest_entries)):
+             entry = append_content_type_params(
+                 delta=delta,
+                 entry_index=entry_index,
+                 deltacat_storage=deltacat_storage,
+                 deltacat_storage_kwargs=deltacat_storage_kwargs,
+             )
+
+             delta_bytes += entry.meta.content_length
+             estimated_da_bytes += estimate_manifest_entry_size_bytes(
+                 entry=entry, previous_inflation=previous_inflation
+             )
+
+         delta_annotated = DeltaAnnotated.of(delta)
+         input_da_list.append(delta_annotated)
+
+     logger.info(f"Input deltas to compact this round: {len(input_da_list)}")
+     logger.info(f"Input delta bytes to compact: {delta_bytes}")
+     logger.info(f"Input delta files to compact: {delta_manifest_entries}")
+
+     if not input_da_list:
+         raise RuntimeError("No input deltas to compact!")
+
+     size_estimation_function = functools.partial(
+         estimate_manifest_entry_size_bytes, previous_inflation=previous_inflation
+     )
+
+     rebatched_da_list = DeltaAnnotated.rebatch(
+         input_da_list,
+         min_delta_bytes=min_delta_bytes,
+         min_file_counts=min_file_counts,
+         estimation_function=size_estimation_function,
+     )
+
+     compaction_audit.set_input_size_bytes(delta_bytes)
+     compaction_audit.set_input_file_count(delta_manifest_entries)
+     compaction_audit.set_estimated_in_memory_size_bytes_during_discovery(
+         estimated_da_bytes
+     )
+
+     logger.info(f"Hash bucket count: {hash_bucket_count}")
+     logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")
+
+     return rebatched_da_list
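
Taken together, a driver would discover the deltas to compact and then rebatch them into uniform work units. A sketch under assumed inputs (`source_locator`, `storage`, `last_position`, and `audit` are placeholders, not values from this diff):

from deltacat.compute.compactor_v2.utils.io import (
    discover_deltas,
    create_uniform_input_deltas,
)

def plan_compaction_inputs(source_locator, storage, last_position, audit):
    deltas = discover_deltas(
        source_partition_locator=source_locator,
        last_stream_position_to_compact=last_position,
        deltacat_storage=storage,
    )
    # Batch small deltas together so each hash_bucket task receives a
    # roughly uniform amount of input.
    return create_uniform_input_deltas(
        input_deltas=deltas,
        hash_bucket_count=1024,  # assumed bucket count
        compaction_audit=audit,  # a CompactionSessionAuditInfo instance
        deltacat_storage=storage,
    )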