deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/model/merge_input.py
@@ -0,0 +1,143 @@
+ from __future__ import annotations
+
+ from ray.types import ObjectRef
+ from typing import Dict, List, Optional, Any
+ from deltacat.utils.metrics import MetricsConfig
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.storage import (
+     Partition,
+     SortKey,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.compute.compactor_v2.constants import (
+     DROP_DUPLICATES,
+     MAX_RECORDS_PER_COMPACTED_FILE,
+ )
+ from deltacat.types.media import ContentType
+ from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+
+
+ class MergeInput(Dict):
+     @staticmethod
+     def of(
+         dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
+         write_to_partition: Partition,
+         compacted_file_content_type: ContentType,
+         primary_keys: List[str],
+         hash_group_index: int,
+         num_hash_groups: int,
+         hash_bucket_count: int,
+         drop_duplicates: Optional[bool] = DROP_DUPLICATES,
+         sort_keys: Optional[List[SortKey]] = None,
+         merge_task_index: Optional[int] = 0,
+         max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
+         enable_profiler: Optional[bool] = False,
+         metrics_config: Optional[MetricsConfig] = None,
+         s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+         round_completion_info: Optional[RoundCompletionInfo] = None,
+         object_store: Optional[IObjectStore] = None,
+         deltacat_storage=unimplemented_deltacat_storage,
+         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> MergeInput:
+
+         result = MergeInput()
+         result["dfe_groups_refs"] = dfe_groups_refs
+         result["write_to_partition"] = write_to_partition
+         result["compacted_file_content_type"] = compacted_file_content_type
+         result["primary_keys"] = primary_keys
+         result["hash_group_index"] = hash_group_index
+         result["num_hash_groups"] = num_hash_groups
+         result["hash_bucket_count"] = hash_bucket_count
+         result["drop_duplicates"] = drop_duplicates
+         result["sort_keys"] = sort_keys
+         result["merge_task_index"] = merge_task_index
+         result["max_records_per_output_file"] = max_records_per_output_file
+         result["enable_profiler"] = enable_profiler
+         result["metrics_config"] = metrics_config
+         result["s3_table_writer_kwargs"] = s3_table_writer_kwargs or {}
+         result["read_kwargs_provider"] = read_kwargs_provider
+         result["round_completion_info"] = round_completion_info
+         result["object_store"] = object_store
+         result["deltacat_storage"] = deltacat_storage
+         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+
+         return result
+
+     @property
+     def dfe_groups_refs(self) -> List[ObjectRef[DeltaFileEnvelopeGroups]]:
+         return self["dfe_groups_refs"]
+
+     @property
+     def write_to_partition(self) -> Partition:
+         return self["write_to_partition"]
+
+     @property
+     def compacted_file_content_type(self) -> ContentType:
+         return self["compacted_file_content_type"]
+
+     @property
+     def primary_keys(self) -> List[str]:
+         return self["primary_keys"]
+
+     @property
+     def hash_group_index(self) -> int:
+         return self["hash_group_index"]
+
+     @property
+     def num_hash_groups(self) -> int:
+         return self["num_hash_groups"]
+
+     @property
+     def hash_bucket_count(self) -> int:
+         return self["hash_bucket_count"]
+
+     @property
+     def drop_duplicates(self) -> int:
+         return self["drop_duplicates"]
+
+     @property
+     def sort_keys(self) -> Optional[List[SortKey]]:
+         return self.get("sort_keys")
+
+     @property
+     def merge_task_index(self) -> int:
+         return self.get("merge_task_index")
+
+     @property
+     def max_records_per_output_file(self) -> int:
+         return self.get("max_records_per_output_file")
+
+     @property
+     def enable_profiler(self) -> bool:
+         return self.get("enable_profiler")
+
+     @property
+     def metrics_config(self) -> Optional[MetricsConfig]:
+         return self.get("metrics_config")
+
+     @property
+     def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
+         return self.get("s3_table_writer_kwargs")
+
+     @property
+     def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+         return self.get("read_kwargs_provider")
+
+     @property
+     def round_completion_info(self) -> Optional[RoundCompletionInfo]:
+         return self.get("round_completion_info")
+
+     @property
+     def object_store(self) -> Optional[IObjectStore]:
+         return self.get("object_store")
+
+     @property
+     def deltacat_storage(self) -> unimplemented_deltacat_storage:
+         return self["deltacat_storage"]
+
+     @property
+     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
+         return self.get("deltacat_storage_kwargs")
deltacat/compute/compactor_v2/model/merge_result.py
@@ -0,0 +1,12 @@
+ from typing import NamedTuple, List
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+
+ import numpy as np
+
+
+ class MergeResult(NamedTuple):
+     materialize_results: List[MaterializeResult]
+     deduped_record_count: np.int64
+     peak_memory_usage_bytes: np.double
+     telemetry_time_in_seconds: np.double
+     task_completed_at: np.double
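A hedged aggregation sketch (not from the package): since each merge task is expected to return one MergeResult, a driver could roll up telemetry across a hypothetical list merge_results roughly like this.

    import numpy as np

    # merge_results: hypothetical list of MergeResult instances returned by merge tasks.
    total_deduped = np.int64(sum(int(r.deduped_record_count) for r in merge_results))
    peak_memory_bytes = max(float(r.peak_memory_usage_bytes) for r in merge_results)
    all_materialize_results = [m for r in merge_results for m in r.materialize_results]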
deltacat/compute/compactor_v2/steps/__init__.py: File without changes
deltacat/compute/compactor_v2/steps/hash_bucket.py
@@ -0,0 +1,203 @@
+ import importlib
+ import logging
+ import time
+ from contextlib import nullcontext
+ from typing import List, Optional, Tuple
+ from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+ import numpy as np
+ import pyarrow as pa
+ import ray
+ from deltacat import logs
+ from deltacat.compute.compactor import (
+     DeltaAnnotated,
+     DeltaFileEnvelope,
+ )
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+ from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+ from deltacat.compute.compactor_v2.utils.primary_key_index import (
+     group_hash_bucket_indices,
+     group_by_pk_hash_bucket,
+ )
+ from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.types.media import StorageType
+ from deltacat.utils.ray_utils.runtime import (
+     get_current_ray_task_id,
+     get_current_ray_worker_id,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.metrics import emit_timer_metrics
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+
+ if importlib.util.find_spec("memray"):
+     import memray
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _read_delta_file_envelopes(
+     annotated_delta: DeltaAnnotated,
+     read_kwargs_provider: Optional[ReadKwargsProvider],
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[dict] = None,
+ ) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
+
+     tables = deltacat_storage.download_delta(
+         annotated_delta,
+         max_parallelism=1,
+         file_reader_kwargs_provider=read_kwargs_provider,
+         storage_type=StorageType.LOCAL,
+         **deltacat_storage_kwargs,
+     )
+     annotations = annotated_delta.annotations
+     assert (
+         len(tables) == len(annotations),
+         f"Unexpected Error: Length of downloaded delta manifest tables "
+         f"({len(tables)}) doesn't match the length of delta manifest "
+         f"annotations ({len(annotations)}).",
+     )
+     if not tables:
+         return None, 0, 0
+
+     delta_stream_position = annotations[0].annotation_stream_position
+     delta_type = annotations[0].annotation_delta_type
+
+     for annotation in annotations:
+         assert annotation.annotation_stream_position == delta_stream_position, (
+             f"Annotation stream position does not match - {annotation.annotation_stream_position} "
+             f"!= {delta_stream_position}"
+         )
+         assert annotation.annotation_delta_type == delta_type, (
+             f"Annotation delta type does not match - {annotation.annotation_delta_type} "
+             f"!= {delta_type}"
+         )
+
+     delta_file_envelopes = []
+     table = pa.concat_tables(tables)
+     total_record_count = len(table)
+     total_size_bytes = int(table.nbytes)
+
+     delta_file = DeltaFileEnvelope.of(
+         stream_position=delta_stream_position,
+         delta_type=delta_type,
+         table=table,
+     )
+     delta_file_envelopes.append(delta_file)
+     return delta_file_envelopes, total_record_count, total_size_bytes
+
+
+ def _group_file_records_by_pk_hash_bucket(
+     annotated_delta: DeltaAnnotated,
+     num_hash_buckets: int,
+     primary_keys: List[str],
+     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[dict] = None,
+ ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
+     # read input parquet s3 objects into a list of delta file envelopes
+     (
+         delta_file_envelopes,
+         total_record_count,
+         total_size_bytes,
+     ) = _read_delta_file_envelopes(
+         annotated_delta,
+         read_kwargs_provider,
+         deltacat_storage,
+         deltacat_storage_kwargs,
+     )
+
+     if delta_file_envelopes is None:
+         return None, 0, 0
+
+     logger.info(f"Read all delta file envelopes: {len(delta_file_envelopes)}")
+
+     # group the data by primary key hash value
+     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
+     for dfe in delta_file_envelopes:
+         logger.info("Grouping by pk hash bucket")
+         start = time.monotonic()
+         hash_bucket_to_table = group_by_pk_hash_bucket(
+             dfe.table,
+             num_hash_buckets,
+             primary_keys,
+         )
+         group_end = time.monotonic()
+         logger.info(f"Grouping took: {group_end - start}")
+         for hb, table in enumerate(hash_bucket_to_table):
+             if table:
+                 if hb_to_delta_file_envelopes[hb] is None:
+                     hb_to_delta_file_envelopes[hb] = []
+                 hb_to_delta_file_envelopes[hb].append(
+                     DeltaFileEnvelope.of(
+                         stream_position=dfe.stream_position,
+                         file_index=dfe.file_index,
+                         delta_type=dfe.delta_type,
+                         table=table,
+                     )
+                 )
+     return hb_to_delta_file_envelopes, total_record_count, total_size_bytes
+
+
+ def _timed_hash_bucket(input: HashBucketInput):
+     task_id = get_current_ray_task_id()
+     worker_id = get_current_ray_worker_id()
+     with memray.Tracker(
+         f"hash_bucket_{worker_id}_{task_id}.bin"
+     ) if input.enable_profiler else nullcontext():
+         (
+             delta_file_envelope_groups,
+             total_record_count,
+             total_size_bytes,
+         ) = _group_file_records_by_pk_hash_bucket(
+             annotated_delta=input.annotated_delta,
+             num_hash_buckets=input.num_hash_buckets,
+             primary_keys=input.primary_keys,
+             read_kwargs_provider=input.read_kwargs_provider,
+             deltacat_storage=input.deltacat_storage,
+             deltacat_storage_kwargs=input.deltacat_storage_kwargs,
+         )
+         hash_bucket_group_to_obj_id_tuple = group_hash_bucket_indices(
+             hash_bucket_object_groups=delta_file_envelope_groups,
+             num_buckets=input.num_hash_buckets,
+             num_groups=input.num_hash_groups,
+             object_store=input.object_store,
+         )
+
+         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+         return HashBucketResult(
+             hash_bucket_group_to_obj_id_tuple,
+             np.int64(total_size_bytes),
+             np.int64(total_record_count),
+             np.double(peak_memory_usage_bytes),
+             np.double(0.0),
+             np.double(time.time()),
+         )
+
+
+ @ray.remote
+ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
+
+     logger.info(f"Starting hash bucket task...")
+     hash_bucket_result, duration = timed_invocation(
+         func=_timed_hash_bucket, input=input
+     )
+
+     emit_metrics_time = 0.0
+     if input.metrics_config:
+         emit_result, latency = timed_invocation(
+             func=emit_timer_metrics,
+             metrics_name="hash_bucket",
+             value=duration,
+             metrics_config=input.metrics_config,
+         )
+         emit_metrics_time = latency
+
+     logger.info(f"Finished hash bucket task...")
+     return HashBucketResult(
+         hash_bucket_result[0],
+         hash_bucket_result[1],
+         hash_bucket_result[2],
+         hash_bucket_result[3],
+         np.double(emit_metrics_time),
+         hash_bucket_result[5],
+     )
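For orientation only (an assumption, not code from the diff): the @ray.remote hash_bucket task above would typically be fanned out once per annotated delta and gathered with ray.get. Here hb_inputs is a hypothetical list of HashBucketInput objects, one per delta; HashBucketInput construction is elided because its fields are defined in hash_bucket_input.py, which also changed in this release.

    import ray
    from deltacat.compute.compactor_v2.steps.hash_bucket import hash_bucket

    # hb_inputs: one HashBucketInput per annotated delta (hypothetical).
    pending = [hash_bucket.remote(hb_input) for hb_input in hb_inputs]
    results = ray.get(pending)  # each element is a HashBucketResult

    # Index 2 holds the record count, matching the positional construction above.
    total_records = sum(int(r[2]) for r in results)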