deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions exactly as they appear in the public registry.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -8,14 +8,8 @@ import pyarrow as pa
 import ray
 import s3fs
 from ray import cloudpickle
-from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
 from ray.types import ObjectRef
 
-from deltacat.storage import Manifest, PartitionLocator
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
-from deltacat.compute.compactor import PyArrowWriteResult, \
-    RoundCompletionInfo, PrimaryKeyIndexMeta, PrimaryKeyIndexLocator, \
-    PrimaryKeyIndexVersionMeta, PrimaryKeyIndexVersionLocator
 from deltacat import logs
 from deltacat.aws import s3u
 from deltacat.compute.compactor import (
@@ -30,29 +24,31 @@ from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
 from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
 from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
 from deltacat.storage import Manifest, PartitionLocator
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
-from deltacat.utils.ray_utils.concurrency import (
-    invoke_parallel
-)
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def rehash(
-        options_provider: Callable[[int, Any], Dict[str, Any]],
-        s3_bucket: str,
-        source_partition_locator: PartitionLocator,
-        old_rci: RoundCompletionInfo,
-        new_hash_bucket_count: int,
-        hash_bucket_index_group_count: int,
-        records_per_primary_key_index_file: int,
-        delete_old_primary_key_index: bool) -> RoundCompletionInfo:
-
-    logger.info(f"Rehashing primary key index. Old round completion info: "
-                f"{old_rci}. New hash bucket count: {new_hash_bucket_count}")
+    options_provider: Callable[[int, Any], Dict[str, Any]],
+    s3_bucket: str,
+    source_partition_locator: PartitionLocator,
+    old_rci: RoundCompletionInfo,
+    new_hash_bucket_count: int,
+    hash_bucket_index_group_count: int,
+    records_per_primary_key_index_file: int,
+    delete_old_primary_key_index: bool,
+) -> RoundCompletionInfo:
+
+    logger.info(
+        f"Rehashing primary key index. Old round completion info: "
+        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
+    )
 
     # collect old primary key index information
     old_pki_version_locator = old_rci.primary_key_index_version_locator
@@ -60,10 +56,12 @@ def rehash(
     old_pki_meta = old_pkiv_meta.primary_key_index_meta
     old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
     if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(f"Primary key index rehash failed. Old hash bucket "
-                         f"count ({new_hash_bucket_count}) is "
-                         f"equal to new hash bucket count. Partition: "
-                         f"{old_compacted_partition_locator}.")
+        raise ValueError(
+            f"Primary key index rehash failed. Old hash bucket "
+            f"count ({new_hash_bucket_count}) is "
+            f"equal to new hash bucket count. Partition: "
+            f"{old_compacted_partition_locator}."
+        )
 
     # generate a new unique primary key index version locator to rehash into
     new_pki_meta = PrimaryKeyIndexMeta.of(
@@ -78,7 +76,8 @@ def rehash(
         new_hash_bucket_count,
     )
     rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta)
+        new_pki_version_meta
+    )
 
     # launch a rehash task for each bucket of the old primary key index version
     old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
@@ -124,6 +123,7 @@ def rehash(
         PyArrowWriteResult.union(pki_stats),
         old_rci.sort_keys_bit_width,
         rehashed_pki_version_locator,
+        old_rci.rebase_source_partition_locator,
     )
     rcf.write_round_completion_file(
         s3_bucket,
@@ -136,41 +136,48 @@ def rehash(
         s3_bucket,
         old_pki_version_locator,
     )
-    logger.info(f"Rehashed primary key index. New round completion info: "
-                f"{round_completion_info}.")
+    logger.info(
+        f"Rehashed primary key index. New round completion info: "
+        f"{round_completion_info}."
+    )
     return round_completion_info
 
 
 def download_hash_bucket_entries(
-        s3_bucket: str,
-        hash_bucket_index: int,
-        primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-        file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-        -> List[pa.Table]:
-
-    pk_index_manifest_s3_url = primary_key_index_version_locator\
-        .get_pkiv_hb_index_manifest_s3_url(
+    s3_bucket: str,
+    hash_bucket_index: int,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> List[pa.Table]:
+
+    pk_index_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
             s3_bucket,
             hash_bucket_index,
         )
+    )
     result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(f"Downloading primary key index hash bucket manifest entries: "
-                f"{pk_index_manifest_s3_url}. Primary key index version "
-                f"locator: {primary_key_index_version_locator}")
+    logger.info(
+        f"Downloading primary key index hash bucket manifest entries: "
+        f"{pk_index_manifest_s3_url}. Primary key index version "
+        f"locator: {primary_key_index_version_locator}"
+    )
     pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(pk_index_manifest,
-                                           file_reader_kwargs_provider=file_reader_kwargs_provider)
+    tables = s3u.download_manifest_entries(
+        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
+    )
     if not tables:
         logger.warning(
             f"Primary key index manifest is empty at: "
             f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}")
+            f"locator: {primary_key_index_version_locator}"
+        )
     return tables
 
 
 def delete_primary_key_index_version(
-        s3_bucket: str,
-        pki_version_locator: PrimaryKeyIndexVersionLocator) -> None:
+    s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
+) -> None:
 
     logger.info(f"Deleting primary key index: {pki_version_locator}")
     s3u.delete_files_by_prefix(
@@ -181,8 +188,8 @@ def delete_primary_key_index_version(
 
 
 def group_record_indices_by_hash_bucket(
-        pki_table: pa.Table,
-        num_buckets: int) -> np.ndarray:
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
 
     hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
     record_index = 0
@@ -196,11 +203,10 @@ def group_record_indices_by_hash_bucket(
 
 
 def group_hash_bucket_indices(
-        hash_bucket_object_groups: np.ndarray,
-        num_buckets: int,
-        num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
-        Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
+    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
     """
 
     object_refs = []
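
The one-line docstring above is terse; the grouping rule itself is visible in the next hunk: each hash bucket index is assigned to a group by hb_index % num_groups. A small self-contained illustration of that mapping, using made-up counts of 8 buckets across 3 groups rather than anything from deltacat:

    # Illustration only: mirrors the hb_index % num_groups rule shown in the next hunk.
    num_buckets = 8
    num_groups = 3

    groups = {}
    for hb_index in range(num_buckets):
        hb_group = hb_index % num_groups
        groups.setdefault(hb_group, []).append(hb_index)

    print(groups)  # {0: [0, 3, 6], 1: [1, 4, 7], 2: [2, 5]}
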
@@ -214,8 +220,7 @@ def group_hash_bucket_indices(
         if obj:
             hb_group = hb_index % num_groups
             if hb_group_to_object[hb_group] is None:
-                hb_group_to_object[hb_group] = np.empty(
-                    [num_buckets], dtype="object")
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
             hb_group_to_object[hb_group][hb_index] = obj
 
     for hb_group, obj in enumerate(hb_group_to_object):
@@ -225,21 +230,19 @@ def group_hash_bucket_indices(
             pickled_obj_ref = cloudpickle.dumps(obj_ref)
             object_refs.append(pickled_obj_ref)
             hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
+            # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
+            # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
+            # (e.g., if the ObjectRef is deserialized by a non-Ray process).
+            # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
+            # The object now has a permanent reference and the data can't be freed from Ray’s object store.
+            # Manually deleting the untrackable object references offsets these permanent references and
+            # helps to allow these objects to be garbage collected normally.
             del obj_ref
             del pickled_obj_ref
     return hash_bucket_group_to_obj_id, object_refs
 
 
-def pk_digest_to_hash_bucket_index(
-        digest,
-        num_buckets: int) -> int:
+def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
     """
     Deterministically get the hash bucket a particular digest belongs to
     based on number of total hash buckets.
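
The NOTE in the hunk above describes a real Ray pitfall: once an ObjectRef is serialized out-of-band with cloudpickle, Ray's distributed reference counter can no longer see that copy, so the local names are deleted explicitly after the pickled bytes are stored. A minimal standalone sketch of the same pattern, with a made-up payload and no deltacat code:

    import ray
    from ray import cloudpickle

    ray.init(ignore_reinit_error=True)

    serialized_refs = []

    obj_ref = ray.put({"some": "payload"})        # object lives in Ray's object store
    pickled_obj_ref = cloudpickle.dumps(obj_ref)  # out-of-band copy Ray cannot track
    serialized_refs.append(pickled_obj_ref)       # keep only the serialized bytes

    # Per the NOTE above: delete the local names once the serialized bytes are stored,
    # so the untrackable pickled copy is not compounded by lingering local references.
    del obj_ref
    del pickled_obj_ref

    # A receiver (possibly a non-Ray process) later recovers the ObjectRef.
    restored_ref = cloudpickle.loads(serialized_refs[0])
    print(ray.get(restored_ref))  # {'some': 'payload'}
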
@@ -249,11 +252,12 @@ def pk_digest_to_hash_bucket_index(
 
 
 def write_primary_key_index_files(
-        table: pa.Table,
-        primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-        s3_bucket: str,
-        hb_index: int,
-        records_per_index_file: int) -> PyArrowWriteResult:
+    table: pa.Table,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    s3_bucket: str,
+    hb_index: int,
+    records_per_index_file: int,
+) -> PyArrowWriteResult:
     """
     Writes primary key index files for the given hash bucket index out to the
     specified S3 bucket at the path identified by the given primary key index
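
The body of pk_digest_to_hash_bucket_index is not included in these hunks; only its signature and docstring appear. Purely as an illustration of the kind of deterministic digest-to-bucket mapping the docstring describes, and not necessarily deltacat's actual implementation, a digest can be folded into a bucket range like this:

    def digest_to_bucket(digest: bytes, num_buckets: int) -> int:
        # Hypothetical helper: interpret the digest bytes as an integer and
        # reduce it modulo the total bucket count.
        return int.from_bytes(digest, byteorder="big") % num_buckets

    assert digest_to_bucket(b"\x00\x01", 4) == 1  # 1 % 4
    assert digest_to_bucket(b"\x01\x00", 4) == 0  # 256 % 4
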
@@ -262,19 +266,24 @@ def write_primary_key_index_files(
 
     TODO(raghumdani): Support writing primary key index to any data catalog
     """
-    logger.info(f"Writing primary key index files for hash bucket {hb_index}. "
-                f"Primary key index version locator: "
-                f"{primary_key_index_version_locator}.")
+    logger.info(
+        f"Writing primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}."
+    )
     s3_file_system = s3fs.S3FileSystem(
         anon=False,
         s3_additional_kwargs={
             "ContentType": ContentType.PARQUET.value,
             "ContentEncoding": ContentEncoding.IDENTITY.value,
         },
-        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
+        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
+    )
+    pkiv_hb_index_s3_url_base = (
+        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
+            s3_bucket, hb_index
+        )
     )
-    pkiv_hb_index_s3_url_base = primary_key_index_version_locator\
-        .get_pkiv_hb_index_s3_url_base(s3_bucket, hb_index)
     manifest_entries = s3u.upload_sliced_table(
         table,
         pkiv_hb_index_s3_url_base,
@@ -284,19 +293,21 @@ def write_primary_key_index_files(
         get_table_slicer(table),
     )
     manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url = primary_key_index_version_locator\
-        .get_pkiv_hb_index_manifest_s3_url(s3_bucket, hb_index)
-    s3u.upload(
-        pkiv_hb_index_s3_manifest_s3_url,
-        str(json.dumps(manifest))
+    pkiv_hb_index_s3_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
+            s3_bucket, hb_index
+        )
     )
+    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
     result = PyArrowWriteResult.of(
         len(manifest_entries),
         table.nbytes,
         manifest.meta.content_length,
         len(table),
     )
-    logger.info(f"Wrote primary key index files for hash bucket {hb_index}. "
-                f"Primary key index version locator: "
-                f"{primary_key_index_version_locator}. Result: {result}")
+    logger.info(
+        f"Wrote primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}. Result: {result}"
+    )
     return result
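
The hunks above configure an s3fs.S3FileSystem with Parquet content headers and a boto3 config before handing it to s3u.upload_sliced_table. A minimal standalone sketch of the same idea, writing a single PyArrow table through such a filesystem; the bucket, key, content-type strings, and retry config below are placeholders standing in for deltacat's ContentType/ContentEncoding enums and PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG:

    import pyarrow as pa
    import pyarrow.parquet as papq
    import s3fs

    # Placeholder destination; in the diff the URL base comes from the
    # primary key index version locator.
    s3_url = "s3://my-example-bucket/pki/hb-0/part-0.parquet"

    s3_file_system = s3fs.S3FileSystem(
        anon=False,
        s3_additional_kwargs={
            "ContentType": "application/parquet",  # stand-in for ContentType.PARQUET.value
            "ContentEncoding": "identity",  # stand-in for ContentEncoding.IDENTITY.value
        },
        config_kwargs={"retries": {"max_attempts": 5, "mode": "standard"}},
    )

    table = pa.table({"pk_hash": ["a1", "b2"], "record_idx": [0, 1]})
    with s3_file_system.open(s3_url, "wb") as f:
        papq.write_table(table, f)
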
@@ -1,35 +1,35 @@
-import logging
 import json
+import logging
 
-from deltacat.storage import PartitionLocator
-from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat import logs
+from deltacat.compute.compactor import RoundCompletionInfo
+from deltacat.storage import PartitionLocator
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def get_round_completion_file_s3_url(
-        bucket: str,
-        source_partition_locator: PartitionLocator,
-        pki_root_path: str) -> str:
+    bucket: str, source_partition_locator: PartitionLocator, pki_root_path: str
+) -> str:
 
     base_url = source_partition_locator.path(f"s3://{bucket}")
     return f"{base_url}/{pki_root_path}.json"
 
 
 def read_round_completion_file(
-        bucket: str,
-        source_partition_locator: PartitionLocator,
-        primary_key_index_root_path: str) -> RoundCompletionInfo:
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    primary_key_index_root_path: str,
+) -> RoundCompletionInfo:
 
     from deltacat.aws import s3u as s3_utils
+
     round_completion_file_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
         primary_key_index_root_path,
     )
-    logger.info(
-        f"reading round completion file from: {round_completion_file_url}")
+    logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
     result = s3_utils.download(round_completion_file_url, False)
     if result:
@@ -40,24 +40,23 @@ def read_round_completion_file(
 
 
 def write_round_completion_file(
-        bucket: str,
-        source_partition_locator: PartitionLocator,
-        primary_key_index_root_path: str,
-        round_completion_info: RoundCompletionInfo):
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    primary_key_index_root_path: str,
+    round_completion_info: RoundCompletionInfo,
+) -> str:
 
     from deltacat.aws import s3u as s3_utils
-    logger.info(
-        f"writing round completion file contents: {round_completion_info}")
+
+    logger.info(f"writing round completion file contents: {round_completion_info}")
     round_completion_file_s3_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
         primary_key_index_root_path,
     )
-    logger.info(
-        f"writing round completion file to: {round_completion_file_s3_url}")
+    logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
     s3_utils.upload(
-        round_completion_file_s3_url,
-        str(json.dumps(round_completion_info))
+        round_completion_file_s3_url, str(json.dumps(round_completion_info))
     )
-    logger.info(
-        f"round completion file written to: {round_completion_file_s3_url}")
+    logger.info(f"round completion file written to: {round_completion_file_s3_url}")
+    return round_completion_file_s3_url
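
Going by the function names and the file list, the two hunks above appear to come from deltacat/compute/compactor/utils/round_completion_file.py: a small JSON state file is written under the partition's S3 path, and write_round_completion_file now returns the URL it wrote to. The URL construction itself is simple; a runnable sketch with a stub locator (the stub's path layout is invented, real PartitionLocator paths are built from namespace/table/stream/partition identifiers):

    from dataclasses import dataclass

    @dataclass
    class FakePartitionLocator:
        # Stub standing in for deltacat.storage.PartitionLocator.
        partition_path: str

        def path(self, prefix: str) -> str:
            return f"{prefix}/{self.partition_path}"

    def get_round_completion_file_s3_url(bucket, source_partition_locator, pki_root_path):
        # Restated from the hunk above for a self-contained example.
        base_url = source_partition_locator.path(f"s3://{bucket}")
        return f"{base_url}/{pki_root_path}.json"

    locator = FakePartitionLocator("my-namespace/my-table/1/some-partition")
    print(get_round_completion_file_s3_url("my-example-bucket", locator, "pki-root-1234"))
    # s3://my-example-bucket/my-namespace/my-table/1/some-partition/pki-root-1234.json
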
@@ -1,10 +1,11 @@
-import pyarrow as pa
-import numpy as np
 from itertools import repeat
 from typing import Union
 
-from deltacat.storage import DeltaType
+import numpy as np
+import pyarrow as pa
+
 from deltacat.compute.compactor import DeltaFileEnvelope
+from deltacat.storage import DeltaType
 
 _SYS_COL_UUID = "4000f124-dfbd-48c6-885b-7b22621a6d41"
 
@@ -65,10 +66,7 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
 
 
 def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
-    return pa.array(
-        obj,
-        _PK_HASH_COLUMN_TYPE
-    )
+    return pa.array(obj, _PK_HASH_COLUMN_TYPE)
 
 
 def pk_hash_column_np(table: pa.Table) -> np.ndarray:
@@ -79,6 +77,10 @@ def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_PK_HASH_COLUMN_NAME]
 
 
+def delta_type_column_np(table: pa.Table) -> np.ndarray:
+    return table[_DELTA_TYPE_COLUMN_NAME].to_numpy()
+
+
 def delta_type_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_DELTA_TYPE_COLUMN_NAME]
 
@@ -101,8 +103,7 @@ def stream_position_column_np(table: pa.Table) -> np.ndarray:
     return table[_PARTITION_STREAM_POSITION_COLUMN_NAME].to_numpy()
 
 
-def get_file_index_column_array(obj) \
-        -> Union[pa.Array, pa.ChunkedArray]:
+def get_file_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_FILE_IDX_COLUMN_TYPE,
@@ -113,8 +114,7 @@ def file_index_column_np(table: pa.Table) -> np.ndarray:
     return table[_ORDERED_FILE_IDX_COLUMN_NAME].to_numpy()
 
 
-def get_record_index_column_array(obj) -> \
-        Union[pa.Array, pa.ChunkedArray]:
+def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_RECORD_IDX_COLUMN_TYPE,
@@ -144,7 +144,8 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
 
 
 def project_delta_file_metadata_on_table(
-        delta_file_envelope: DeltaFileEnvelope) -> pa.Table:
+    delta_file_envelope: DeltaFileEnvelope,
+) -> pa.Table:
 
     table = delta_file_envelope.table
 
@@ -181,42 +182,33 @@ def project_delta_file_metadata_on_table(
     return table
 
 
-def append_stream_position_column(
-        table: pa.Table,
-        stream_positions):
+def append_stream_position_column(table: pa.Table, stream_positions):
 
     table = table.append_column(
         _PARTITION_STREAM_POSITION_COLUMN_FIELD,
-        get_stream_position_column_array(stream_positions)
+        get_stream_position_column_array(stream_positions),
     )
     return table
 
 
-def append_file_idx_column(
-        table: pa.Table,
-        ordered_file_indices):
+def append_file_idx_column(table: pa.Table, ordered_file_indices):
 
     table = table.append_column(
         _ORDERED_FILE_IDX_COLUMN_FIELD,
-        get_file_index_column_array(ordered_file_indices)
+        get_file_index_column_array(ordered_file_indices),
     )
     return table
 
 
-def append_pk_hash_column(
-        table: pa.Table,
-        pk_hashes) -> pa.Table:
+def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
 
     table = table.append_column(
-        _PK_HASH_COLUMN_FIELD,
-        get_pk_hash_column_array(pk_hashes)
+        _PK_HASH_COLUMN_FIELD, get_pk_hash_column_array(pk_hashes)
     )
     return table
 
 
-def append_record_idx_col(
-        table: pa.Table,
-        ordered_record_indices) -> pa.Table:
+def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
     table = table.append_column(
         _ORDERED_RECORD_IDX_COLUMN_FIELD,
@@ -225,9 +217,7 @@ def append_record_idx_col(
     return table
 
 
-def append_dedupe_task_idx_col(
-        table: pa.Table,
-        dedupe_task_indices) -> pa.Table:
+def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
 
     table = table.append_column(
         _DEDUPE_TASK_IDX_COLUMN_FIELD,
@@ -244,9 +234,7 @@ def delta_type_from_field(delta_type_field: bool) -> DeltaType:
     return DeltaType.UPSERT if delta_type_field else DeltaType.DELETE
 
 
-def append_delta_type_col(
-        table: pa.Table,
-        delta_types) -> pa.Table:
+def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
 
     table = table.append_column(
         _DELTA_TYPE_COLUMN_FIELD,
@@ -255,9 +243,7 @@ def append_delta_type_col(
     return table
 
 
-def append_is_source_col(
-        table: pa.Table,
-        booleans) -> pa.Table:
+def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
 
     table = table.append_column(
         _IS_SOURCE_COLUMN_FIELD,
@@ -267,11 +253,13 @@ def append_is_source_col(
 
 
 def get_minimal_hb_schema() -> pa.schema:
-    return pa.schema([
-        _PK_HASH_COLUMN_FIELD,
-        _ORDERED_RECORD_IDX_COLUMN_FIELD,
-        _ORDERED_FILE_IDX_COLUMN_FIELD,
-        _PARTITION_STREAM_POSITION_COLUMN_FIELD,
-        _DELTA_TYPE_COLUMN_FIELD,
-        _IS_SOURCE_COLUMN_FIELD
-    ])
+    return pa.schema(
+        [
+            _PK_HASH_COLUMN_FIELD,
+            _ORDERED_RECORD_IDX_COLUMN_FIELD,
+            _ORDERED_FILE_IDX_COLUMN_FIELD,
+            _PARTITION_STREAM_POSITION_COLUMN_FIELD,
+            _DELTA_TYPE_COLUMN_FIELD,
+            _IS_SOURCE_COLUMN_FIELD,
+        ]
+    )
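
The append_* helpers above all follow one pattern: build a typed array for a system column and attach it with pa.Table.append_column. A minimal standalone sketch of that pattern with an invented column name and type (the real field definitions such as _PK_HASH_COLUMN_FIELD are declared earlier in the module and are not shown in this diff):

    import pyarrow as pa

    # Hypothetical system-column field; deltacat defines its own names and types.
    _EXAMPLE_COLUMN_FIELD = pa.field("_example_sys_col", pa.int64())

    def append_example_col(table: pa.Table, values) -> pa.Table:
        # Same shape as append_pk_hash_column / append_delta_type_col above:
        # append a typed array as a new column and return the new table.
        return table.append_column(
            _EXAMPLE_COLUMN_FIELD,
            pa.array(values, _EXAMPLE_COLUMN_FIELD.type),
        )

    table = pa.table({"id": [1, 2, 3]})
    table = append_example_col(table, [10, 20, 30])
    print(table.column_names)  # ['id', '_example_sys_col']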