deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/dedupe.py
@@ -1,7 +1,5 @@
  import logging
- import time
  from collections import defaultdict
- from itertools import repeat
  from typing import Any, Dict, List, Optional, Tuple
 
  import numpy as np
@@ -10,7 +8,6 @@ import pyarrow.compute as pc
  import ray
  from ray import cloudpickle
  from ray.types import ObjectRef
- from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
  from deltacat import logs
  from deltacat.compute.compactor import (
@@ -25,15 +22,7 @@ from deltacat.compute.compactor import (
  from deltacat.compute.compactor.utils import primary_key_index as pki
  from deltacat.compute.compactor.utils import system_columns as sc
  from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
- from deltacat.storage import DeltaType
- from deltacat.compute.compactor import SortKey, SortOrder, \
- RoundCompletionInfo, PrimaryKeyIndexVersionLocator, DeltaFileEnvelope, \
- DeltaFileLocator, PyArrowWriteResult
- from deltacat.compute.compactor.utils import system_columns as sc, \
- primary_key_index as pki
  from deltacat.utils.performance import timed_invocation
-
- from typing import Any, Dict, List, Optional, Tuple
  from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -46,18 +35,21 @@ DedupeTaskIndexWithObjectId = Tuple[DedupeTaskIndex, PickledObjectRef]
  DedupeResult = Tuple[
  Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId],
  List[ObjectRef[DeltaFileLocatorToRecords]],
- PyArrowWriteResult
+ PyArrowWriteResult,
  ]
 
 
  def _union_primary_key_indices(
- s3_bucket: str,
- round_completion_info: RoundCompletionInfo,
- hash_bucket_index: int,
- df_envelopes_list: List[List[DeltaFileEnvelope]]) -> pa.Table:
-
- logger.info(f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
- f"{len(df_envelopes_list)} delta file envelope lists...")
+ s3_bucket: str,
+ round_completion_info: RoundCompletionInfo,
+ hash_bucket_index: int,
+ df_envelopes_list: List[List[DeltaFileEnvelope]],
+ ) -> pa.Table:
+
+ logger.info(
+ f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
+ f"{len(df_envelopes_list)} delta file envelope lists..."
+ )
  # read compacted input parquet files first
  # (which implicitly have older stream positions than deltas)
  hb_tables = []
@@ -67,12 +59,16 @@ def _union_primary_key_indices(
  hash_bucket_index,
  round_completion_info.primary_key_index_version_locator,
  # Enforce consistent column ordering by reading from a schema, to prevent schema mismatch errors
- file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(schema=get_minimal_hb_schema())
+ file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+ schema=get_minimal_hb_schema()
+ ),
  )
  if tables:
  prior_pk_index_table = pa.concat_tables(tables)
- logger.info(f"Number of records in prior primary index for hash bucket"
- f" {hash_bucket_index}: {prior_pk_index_table.num_rows}")
+ logger.info(
+ f"Number of records in prior primary index for hash bucket"
+ f" {hash_bucket_index}: {prior_pk_index_table.num_rows}"
+ )
  hb_tables.append(prior_pk_index_table)
 
  # sort by delta file stream position now instead of sorting every row later
@@ -87,43 +83,53 @@ def _union_primary_key_indices(
 
  hb_table = pa.concat_tables(hb_tables)
 
- logger.info(f"Total records in hash bucket {hash_bucket_index} is {hb_table.num_rows}")
+ logger.info(
+ f"Total records in hash bucket {hash_bucket_index} is {hb_table.num_rows}"
+ )
  return hb_table
 
 
  def _drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
  value_to_last_row_idx = {}
- row_idx = 0
- pk_op_chunk_iter = zip(
- sc.pk_hash_column(table).iterchunks(),
- sc.delta_type_column(table).iterchunks(),
+
+ pk_hash_np = sc.pk_hash_column_np(table)
+ op_type_np = sc.delta_type_column_np(table)
+
+ assert len(pk_hash_np) == len(op_type_np), (
+ f"Primary key digest column length ({len(pk_hash_np)}) doesn't "
+ f"match delta type column length ({len(op_type_np)})."
  )
- for (pk_chunk, op_chunk) in pk_op_chunk_iter:
- pk_op_val_iter = zip(
- pk_chunk.to_numpy(zero_copy_only=False),
- op_chunk.to_numpy(zero_copy_only=False),
- )
- for (pk_val, op_val) in pk_op_val_iter:
- # operation type is True for `UPSERT` and False for `DELETE`
- if op_val:
- # UPSERT this row
- value_to_last_row_idx[pk_val] = row_idx
- else:
- # DELETE this row
- value_to_last_row_idx.pop(pk_val, None)
- row_idx += 1
+
+ # TODO(raghumdani): move the dedupe to C++ using arrow methods or similar.
+ row_idx = 0
+ pk_op_val_iter = zip(pk_hash_np, op_type_np)
+ for (pk_val, op_val) in pk_op_val_iter:
+
+ # operation type is True for `UPSERT` and False for `DELETE`
+ if op_val:
+ # UPSERT this row
+ value_to_last_row_idx[pk_val] = row_idx
+ else:
+ # DELETE this row
+ value_to_last_row_idx.pop(pk_val, None)
+
+ row_idx += 1
+
  return table.take(list(value_to_last_row_idx.values()))
 
 
  def _write_new_primary_key_index(
- s3_bucket: str,
- new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
- max_rows_per_index_file: int,
- dedupe_task_index: int,
- deduped_tables: List[Tuple[int, pa.Table]]) -> PyArrowWriteResult:
-
- logger.info(f"[Dedupe task index {dedupe_task_index}] Writing new deduped primary key index: "
- f"{new_primary_key_index_version_locator}")
+ s3_bucket: str,
+ new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+ max_rows_per_index_file: int,
+ dedupe_task_index: int,
+ deduped_tables: List[Tuple[int, pa.Table]],
+ ) -> PyArrowWriteResult:
+
+ logger.info(
+ f"[Dedupe task index {dedupe_task_index}] Writing new deduped primary key index: "
+ f"{new_primary_key_index_version_locator}"
+ )
 
  pki_results = []
  for hb_index, table in deduped_tables:
@@ -137,36 +143,43 @@ def _write_new_primary_key_index(
  pki_results.append(hb_pki_result)
 
  result = PyArrowWriteResult.union(pki_results)
- logger.info(f"[Dedupe task index {dedupe_task_index}] Wrote new deduped primary key index: "
- f"{new_primary_key_index_version_locator}. Result: {result}")
+ logger.info(
+ f"[Dedupe task index {dedupe_task_index}] Wrote new deduped primary key index: "
+ f"{new_primary_key_index_version_locator}. Result: {result}"
+ )
  return result
 
 
  def delta_file_locator_to_mat_bucket_index(
- df_locator: DeltaFileLocator,
- materialize_bucket_count: int) -> int:
+ df_locator: DeltaFileLocator, materialize_bucket_count: int
+ ) -> int:
  digest = df_locator.digest()
  return int.from_bytes(digest, "big") % materialize_bucket_count
 
+
  @ray.remote(num_returns=3)
  def dedupe(
- compaction_artifact_s3_bucket: str,
- round_completion_info: Optional[RoundCompletionInfo],
- new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
- object_ids: List[Any],
- sort_keys: List[SortKey],
- max_records_per_index_file: int,
- num_materialize_buckets: int,
- dedupe_task_index: int,
- delete_old_primary_key_index: bool) -> DedupeResult:
+ compaction_artifact_s3_bucket: str,
+ round_completion_info: Optional[RoundCompletionInfo],
+ new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+ object_ids: List[Any],
+ sort_keys: List[SortKey],
+ max_records_per_index_file: int,
+ num_materialize_buckets: int,
+ dedupe_task_index: int,
+ delete_old_primary_key_index: bool,
+ ) -> DedupeResult:
 
  logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
  # TODO (pdames): mitigate risk of running out of memory here in cases of
  # severe skew of primary key updates in deltas
  src_file_records_obj_refs = [
- cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
- logger.info(f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
- f"groups for {len(src_file_records_obj_refs)} object refs...")
+ cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
+ ]
+ logger.info(
+ f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
+ f"groups for {len(src_file_records_obj_refs)} object refs..."
+ )
 
  delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
  hb_index_to_delta_file_envelopes_list = defaultdict(list)
@@ -176,8 +189,10 @@ def dedupe(
  hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
  src_file_id_to_row_indices = defaultdict(list)
  deduped_tables = []
- logger.info(f"[Dedupe task {dedupe_task_index}] Running {len(hb_index_to_delta_file_envelopes_list)} "
- f"dedupe rounds...")
+ logger.info(
+ f"[Dedupe task {dedupe_task_index}] Running {len(hb_index_to_delta_file_envelopes_list)} "
+ f"dedupe rounds..."
+ )
  for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
  logger.info(f"{dedupe_task_index}: union primary keys for hb_index: {hb_idx}")
 
@@ -186,32 +201,39 @@ def dedupe(
  s3_bucket=compaction_artifact_s3_bucket,
  round_completion_info=round_completion_info,
  hash_bucket_index=hb_idx,
- df_envelopes_list=dfe_list)
- logger.info(f"[Dedupe {dedupe_task_index}] Dedupe round input "
- f"record count: {len(table)}, took {union_time}s")
+ df_envelopes_list=dfe_list,
+ )
+ logger.info(
+ f"[Dedupe {dedupe_task_index}] Dedupe round input "
+ f"record count: {len(table)}, took {union_time}s"
+ )
 
  # sort by sort keys
  if len(sort_keys):
  # TODO (pdames): convert to O(N) dedupe w/ sort keys
- sort_keys.extend([
- SortKey.of(
- sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
- SortOrder.ASCENDING
- ),
- SortKey.of(
- sc._ORDERED_FILE_IDX_COLUMN_NAME,
- SortOrder.ASCENDING
- ),
- ])
+ sort_keys.extend(
+ [
+ SortKey.of(
+ sc._PARTITION_STREAM_POSITION_COLUMN_NAME, SortOrder.ASCENDING
+ ),
+ SortKey.of(sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING),
+ ]
+ )
  table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
 
  # drop duplicates by primary key hash column
- logger.info(f"[Dedupe task index {dedupe_task_index}] Dropping duplicates for {hb_idx}")
+ logger.info(
+ f"[Dedupe task index {dedupe_task_index}] Dropping duplicates for {hb_idx}"
+ )
 
- table, drop_time = timed_invocation(func=_drop_duplicates_by_primary_key_hash, table=table)
+ table, drop_time = timed_invocation(
+ func=_drop_duplicates_by_primary_key_hash, table=table
+ )
 
- logger.info(f"[Dedupe task index {dedupe_task_index}] Dedupe round output "
- f"record count: {len(table)}, took: {drop_time}s")
+ logger.info(
+ f"[Dedupe task index {dedupe_task_index}] Dedupe round output "
+ f"record count: {len(table)}, took: {drop_time}s"
+ )
 
  deduped_tables.append((hb_idx, table))
 
@@ -230,7 +252,9 @@ def dedupe(
 
  logger.info(f"Finished all dedupe rounds...")
  mat_bucket_to_src_file_record_count = defaultdict(dict)
- mat_bucket_to_src_file_records: Dict[MaterializeBucketIndex, DeltaFileLocatorToRecords] = defaultdict(dict)
+ mat_bucket_to_src_file_records: Dict[
+ MaterializeBucketIndex, DeltaFileLocatorToRecords
+ ] = defaultdict(dict)
  for src_dfl, src_row_indices in src_file_id_to_row_indices.items():
  mat_bucket = delta_file_locator_to_mat_bucket_index(
  src_dfl,
@@ -239,10 +263,11 @@ def dedupe(
  mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
  src_row_indices,
  )
- mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = \
- len(src_row_indices)
+ mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(src_row_indices)
 
- mat_bucket_to_dd_idx_obj_id: Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId] = {}
+ mat_bucket_to_dd_idx_obj_id: Dict[
+ MaterializeBucketIndex, DedupeTaskIndexWithObjectId
+ ] = {}
  src_file_records_obj_refs: List[ObjectRef[DeltaFileLocatorToRecords]] = []
  for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
  object_ref = ray.put(src_file_records)
@@ -254,15 +279,17 @@ def dedupe(
  )
  del object_ref
  del pickled_object_ref
- logger.info(f"Count of materialize buckets with object refs: "
- f"{len(mat_bucket_to_dd_idx_obj_id)}")
+ logger.info(
+ f"Count of materialize buckets with object refs: "
+ f"{len(mat_bucket_to_dd_idx_obj_id)}"
+ )
 
  write_pki_result: PyArrowWriteResult = _write_new_primary_key_index(
  compaction_artifact_s3_bucket,
  new_primary_key_index_version_locator,
  max_records_per_index_file,
  dedupe_task_index,
- deduped_tables
+ deduped_tables,
  )
 
  if delete_old_primary_key_index:
@@ -271,6 +298,4 @@ def dedupe(
  round_completion_info.primary_key_index_version_locator,
  )
  logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
- return mat_bucket_to_dd_idx_obj_id, \
- src_file_records_obj_refs, \
- write_pki_result
+ return mat_bucket_to_dd_idx_obj_id, src_file_records_obj_refs, write_pki_result
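The most substantive change above is in _drop_duplicates_by_primary_key_hash, which now walks plain column arrays instead of iterating PyArrow chunks. Below is a minimal, self-contained sketch of that last-write-wins dedupe, written against placeholder column names ("pk_hash", "is_upsert") rather than DeltaCat's internal system columns; it illustrates the technique and is not the library's API.

import pyarrow as pa


def drop_duplicates_last_write_wins(table: pa.Table) -> pa.Table:
    # placeholder column names; DeltaCat stores these as internal system columns
    pk_hashes = table.column("pk_hash").to_pylist()
    is_upsert = table.column("is_upsert").to_pylist()

    # map each primary key hash to the row index of its latest UPSERT;
    # a DELETE removes the key so no row for that key survives
    last_row_idx = {}
    for row_idx, (pk, upsert) in enumerate(zip(pk_hashes, is_upsert)):
        if upsert:
            last_row_idx[pk] = row_idx
        else:
            last_row_idx.pop(pk, None)

    # rows are assumed to be pre-sorted by stream position and file index,
    # so "last occurrence" means "most recent write"
    return table.take(list(last_row_idx.values()))


if __name__ == "__main__":
    t = pa.table(
        {
            "pk_hash": ["a", "b", "a", "b"],
            "is_upsert": [True, True, True, False],  # final op on "b" is a DELETE
            "value": [1, 2, 3, 4],
        }
    )
    print(drop_duplicates_last_write_wins(t).to_pydict())
    # -> {'pk_hash': ['a'], 'is_upsert': [True], 'value': [3]}

The dict keeps only the surviving row index per key, which is why dedupe() sorts by partition stream position and ordered file index before calling this function.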
deltacat/compute/compactor/steps/hash_bucket.py
@@ -1,37 +1,37 @@
- import ray
- import pyarrow as pa
- import numpy as np
  import logging
-
- from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
  from itertools import chain
+ from typing import Generator, List, Optional, Tuple
+
+ import numpy as np
+ import pyarrow as pa
+ import ray
+ from ray.types import ObjectRef
 
  from deltacat import logs
- from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, \
- SortKey
- from deltacat.compute.compactor.utils.primary_key_index import \
- group_hash_bucket_indices, group_record_indices_by_hash_bucket
+ from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, SortKey
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+ from deltacat.compute.compactor.utils import system_columns as sc
+ from deltacat.compute.compactor.utils.primary_key_index import (
+ group_hash_bucket_indices,
+ group_record_indices_by_hash_bucket,
+ )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import StorageType
  from deltacat.utils.common import sha1_digest
- from deltacat.compute.compactor.utils import system_columns as sc
-
- from typing import List, Optional, Generator, Tuple
-
- from ray.types import ObjectRef
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- _PK_BYTES_DELIMITER = b'L6kl7u5f'
+ _PK_BYTES_DELIMITER = b"L6kl7u5f"
 
  HashBucketGroupToObjectId = np.ndarray
- HashBucketResult = Tuple[HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]]
+ HashBucketResult = Tuple[
+ HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]
+ ]
 
 
  def _group_by_pk_hash_bucket(
- table: pa.Table,
- num_buckets: int,
- primary_keys: List[str]) -> np.ndarray:
+ table: pa.Table, num_buckets: int, primary_keys: List[str]
+ ) -> np.ndarray:
 
  # generate the primary key digest column
  all_pk_column_fields = []
@@ -66,19 +66,17 @@ def _hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
  for field_index in range(len(all_column_fields[0])):
  bytes_to_join = []
  for column_fields in all_column_fields:
- bytes_to_join.append(
- bytes(str(column_fields[field_index]), "utf-8")
- )
+ bytes_to_join.append(bytes(str(column_fields[field_index]), "utf-8"))
  yield sha1_digest(_PK_BYTES_DELIMITER.join(bytes_to_join))
 
 
  def _group_file_records_by_pk_hash_bucket(
- annotated_delta: DeltaAnnotated,
- num_hash_buckets: int,
- primary_keys: List[str],
- sort_key_names: List[str],
- deltacat_storage=unimplemented_deltacat_storage) \
- -> Optional[DeltaFileEnvelopeGroups]:
+ annotated_delta: DeltaAnnotated,
+ num_hash_buckets: int,
+ primary_keys: List[str],
+ sort_key_names: List[str],
+ deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Optional[DeltaFileEnvelopeGroups]:
 
  # read input parquet s3 objects into a list of delta file envelopes
  delta_file_envelopes = _read_delta_file_envelopes(
@@ -104,18 +102,18 @@ def _group_file_records_by_pk_hash_bucket(
  hb_to_delta_file_envelopes[hb] = []
  hb_to_delta_file_envelopes[hb].append(
  DeltaFileEnvelope.of(
- dfe.stream_position,
- dfe.file_index,
- dfe.delta_type,
- table))
+ dfe.stream_position, dfe.file_index, dfe.delta_type, table
+ )
+ )
  return hb_to_delta_file_envelopes
 
+
  def _read_delta_file_envelopes(
- annotated_delta: DeltaAnnotated,
- primary_keys: List[str],
- sort_key_names: List[str],
- deltacat_storage=unimplemented_deltacat_storage) \
- -> Optional[List[DeltaFileEnvelope]]:
+ annotated_delta: DeltaAnnotated,
+ primary_keys: List[str],
+ sort_key_names: List[str],
+ deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Optional[List[DeltaFileEnvelope]]:
 
  columns_to_read = list(chain(primary_keys, sort_key_names))
  tables = deltacat_storage.download_delta(
@@ -125,10 +123,12 @@ def _read_delta_file_envelopes(
  storage_type=StorageType.LOCAL,
  )
  annotations = annotated_delta.annotations
- assert(len(tables) == len(annotations),
- f"Unexpected Error: Length of downloaded delta manifest tables "
- f"({len(tables)}) doesn't match the length of delta manifest "
- f"annotations ({len(annotations)}).")
+ assert (
+ len(tables) == len(annotations),
+ f"Unexpected Error: Length of downloaded delta manifest tables "
+ f"({len(tables)}) doesn't match the length of delta manifest "
+ f"annotations ({len(annotations)}).",
+ )
  if not tables:
  return None
 
@@ -146,12 +146,13 @@ def _read_delta_file_envelopes(
 
  @ray.remote(num_returns=2)
  def hash_bucket(
- annotated_delta: DeltaAnnotated,
- primary_keys: List[str],
- sort_keys: List[SortKey],
- num_buckets: int,
- num_groups: int,
- deltacat_storage=unimplemented_deltacat_storage) -> HashBucketResult:
+ annotated_delta: DeltaAnnotated,
+ primary_keys: List[str],
+ sort_keys: List[SortKey],
+ num_buckets: int,
+ num_groups: int,
+ deltacat_storage=unimplemented_deltacat_storage,
+ ) -> HashBucketResult:
 
  logger.info(f"Starting hash bucket task...")
  sort_key_names = [key.key_name for key in sort_keys]
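For context on the hashing in hash_bucket.py above: each row's primary key values are stringified, joined with _PK_BYTES_DELIMITER, and SHA-1 digested (_hash_pk_bytes_generator). The sketch below reproduces that per-row digest and, as an assumption, maps it to a bucket with int.from_bytes(digest, "big") % num_buckets, mirroring delta_file_locator_to_mat_bucket_index from dedupe.py; the real record-to-bucket grouping lives in deltacat.compute.compactor.utils.primary_key_index and is not shown in this diff.

import hashlib
from typing import Iterable

# same delimiter constant as in hash_bucket.py
_PK_BYTES_DELIMITER = b"L6kl7u5f"


def sha1_digest(data: bytes) -> bytes:
    # stand-in for deltacat.utils.common.sha1_digest
    return hashlib.sha1(data).digest()


def pk_digest(pk_values: Iterable) -> bytes:
    # one digest per row, built from all primary key columns of that row
    joined = _PK_BYTES_DELIMITER.join(
        bytes(str(value), "utf-8") for value in pk_values
    )
    return sha1_digest(joined)


def pk_hash_bucket(pk_values: Iterable, num_buckets: int) -> int:
    # assumed bucket assignment: big-endian integer of the digest, modulo bucket count
    return int.from_bytes(pk_digest(pk_values), "big") % num_buckets


if __name__ == "__main__":
    # two rows with a hypothetical composite primary key (customer_id, order_id)
    print(pk_hash_bucket(("cust-1", 42), num_buckets=16))
    print(pk_hash_bucket(("cust-2", 7), num_buckets=16))

Because the digest depends only on primary key values, every delta file routes updates for the same key to the same hash bucket, which is what lets each dedupe task resolve that key independently.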