deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/io.py
@@ -1,24 +1,23 @@
  import logging
- import time
  import math
- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+ from typing import Dict, List, Optional, Tuple

- from deltacat.storage import PartitionLocator, Delta, \
- interface as unimplemented_deltacat_storage
  from deltacat import logs
  from deltacat.compute.compactor import DeltaAnnotated
-
- from typing import Dict, List, Optional, Tuple
+ from deltacat.compute.stats.models.delta_stats import DeltaStats
+ from deltacat.constants import BYTES_PER_MEBIBYTE, PYARROW_INFLATION_MULTIPLIER
+ from deltacat.storage import Delta, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def discover_deltas(
- source_partition_locator: PartitionLocator,
- start_position_exclusive: Optional[int],
- end_position_inclusive: int,
- deltacat_storage=unimplemented_deltacat_storage) -> List[Delta]:
+ source_partition_locator: PartitionLocator,
+ start_position_exclusive: Optional[int],
+ end_position_inclusive: int,
+ deltacat_storage=unimplemented_deltacat_storage,
+ ) -> List[Delta]:

  stream_locator = source_partition_locator.stream_locator
  namespace = stream_locator.namespace
@@ -36,32 +35,38 @@ def discover_deltas(
  )
  deltas = deltas_list_result.all_items()
  if not deltas:
- raise RuntimeError(f"Unexpected Error: Couldn't find any deltas to "
- f"compact in delta stream position range "
- f"('{start_position_exclusive}', "
- f"'{end_position_inclusive}']. Source partition: "
- f"{source_partition_locator}")
+ raise RuntimeError(
+ f"Unexpected Error: Couldn't find any deltas to "
+ f"compact in delta stream position range "
+ f"('{start_position_exclusive}', "
+ f"'{end_position_inclusive}']. Source partition: "
+ f"{source_partition_locator}"
+ )
  if start_position_exclusive:
  first_delta = deltas.pop(0)
- logger.info(f"Removed exclusive start delta w/ expected stream "
- f"position '{start_position_exclusive}' from deltas to "
- f"compact: {first_delta}")
- logger.info(f"Count of deltas to compact in delta stream "
- f"position range ('{start_position_exclusive}', "
- f"'{end_position_inclusive}']: {len(deltas)}. Source "
- f"partition: '{source_partition_locator}'")
+ logger.info(
+ f"Removed exclusive start delta w/ expected stream "
+ f"position '{start_position_exclusive}' from deltas to "
+ f"compact: {first_delta}"
+ )
+ logger.info(
+ f"Count of deltas to compact in delta stream "
+ f"position range ('{start_position_exclusive}', "
+ f"'{end_position_inclusive}']: {len(deltas)}. Source "
+ f"partition: '{source_partition_locator}'"
+ )
  return deltas


  def limit_input_deltas(
- input_deltas: List[Delta],
- cluster_resources: Dict[str, float],
- hash_bucket_count: int,
- min_pk_index_pa_bytes: int,
- user_hash_bucket_chunk_size: int,
- input_deltas_stats: Dict[int, DeltaStats],
- deltacat_storage=unimplemented_deltacat_storage) \
- -> Tuple[List[DeltaAnnotated], int, int]:
+ input_deltas: List[Delta],
+ cluster_resources: Dict[str, float],
+ hash_bucket_count: int,
+ min_pk_index_pa_bytes: int,
+ user_hash_bucket_chunk_size: int,
+ input_deltas_stats: Dict[int, DeltaStats],
+ deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[List[DeltaAnnotated], int, int]:

  # TODO (pdames): when row counts are available in metadata, use them
  # instead of bytes - memory consumption depends more on number of
@@ -78,9 +83,10 @@ def limit_input_deltas(
  # )
  if min_pk_index_pa_bytes > 0:
  required_heap_mem_for_dedupe = worker_obj_store_mem - min_pk_index_pa_bytes
- assert required_heap_mem_for_dedupe > 0, \
- f"Not enough required memory available to re-batch input deltas" \
+ assert required_heap_mem_for_dedupe > 0, (
+ f"Not enough required memory available to re-batch input deltas"
  f"and initiate the dedupe step."
+ )
  # Size of batched deltas must also be reduced to have enough space for primary
  # key index files (from earlier compaction rounds) in the dedupe step, since
  # they will be loaded into worker heap memory.
@@ -88,8 +94,7 @@

  logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
  worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
- logger.info(f"Worker object store memory/task: "
- f"{worker_obj_store_mem_per_task}")
+ logger.info(f"Worker object store memory/task: " f"{worker_obj_store_mem_per_task}")
  worker_task_mem = cluster_resources["memory"]
  logger.info(f"Total worker memory: {worker_task_mem}")
  # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
@@ -105,8 +110,10 @@
  if input_deltas_stats is None:
  input_deltas_stats = {}

- input_deltas_stats = {int(stream_pos): DeltaStats(delta_stats)
- for stream_pos, delta_stats in input_deltas_stats.items()}
+ input_deltas_stats = {
+ int(stream_pos): DeltaStats(delta_stats)
+ for stream_pos, delta_stats in input_deltas_stats.items()
+ }
  for delta in input_deltas:
  manifest = deltacat_storage.get_delta_manifest(delta)
  delta.manifest = manifest
@@ -118,7 +125,8 @@
  # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
  logger.warning(
  f"Stats are missing for delta stream position {delta.stream_position}, "
- f"materialized delta may not fit in per-task object store memory.")
+ f"materialized delta may not fit in per-task object store memory."
+ )
  manifest_entries = delta.manifest.entries
  delta_manifest_entries += len(manifest_entries)
  for entry in manifest_entries:
@@ -130,13 +138,13 @@
  logger.info(
  f"Input deltas limited to "
  f"{len(limited_input_da_list)} by object store mem "
- f"({delta_bytes_pyarrow} > {worker_obj_store_mem})")
+ f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+ )
  break
  delta_annotated = DeltaAnnotated.of(delta)
  limited_input_da_list.append(delta_annotated)

- logger.info(f"Input deltas to compact this round: "
- f"{len(limited_input_da_list)}")
+ logger.info(f"Input deltas to compact this round: " f"{len(limited_input_da_list)}")
  logger.info(f"Input delta bytes to compact: {delta_bytes}")
  logger.info(f"Input delta files to compact: {delta_manifest_entries}")
  logger.info(f"Latest input delta stream position: {latest_stream_position}")
@@ -146,10 +154,12 @@

  # TODO (pdames): determine min hash buckets from size of all deltas
  # (not just deltas for this round)
- min_hash_bucket_count = int(max(
- math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
- min(worker_cpus, 256),
- ))
+ min_hash_bucket_count = int(
+ max(
+ math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
+ min(worker_cpus, 256),
+ )
+ )
  logger.info(f"Minimum recommended hash buckets: {min_hash_bucket_count}")

  if hash_bucket_count is None:
@@ -168,7 +178,8 @@
  f"resolve this problem either specify a larger number of hash "
  f"buckets when running compaction, omit a custom hash bucket "
  f"count when running compaction, or provision workers with more "
- f"task memory per CPU.")
+ f"task memory per CPU."
+ )

  hash_bucket_chunk_size = user_hash_bucket_chunk_size
  max_hash_bucket_chunk_size = math.ceil(
@@ -185,7 +196,8 @@
  f"specify a smaller hash bucket chunk size when running "
  f"compaction, omit a custom hash bucket chunk size when running "
  f"compaction, or provision workers with more task and object "
- f"store memory per CPU.")
+ f"store memory per CPU."
+ )
  elif not hash_bucket_chunk_size:
  hash_bucket_chunk_size_load_balanced = max(
  math.ceil(max(delta_bytes, delta_bytes_pyarrow) / worker_cpus),
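Note on the sizing logic above: limit_input_deltas derives a minimum hash bucket count from the estimated in-memory (pyarrow) size of the input deltas and the per-task share of worker object store memory. The standalone sketch below reproduces just that calculation; it is not part of the package, and worker_cpus, worker_obj_store_mem, and delta_bytes_pyarrow are made-up inputs.

    import math

    # Hypothetical cluster inputs (not taken from the diff above).
    worker_cpus = 64
    worker_obj_store_mem = 256 * 1024**3   # bytes of object store memory across workers
    delta_bytes_pyarrow = 512 * 1024**3    # estimated pyarrow size of the input deltas

    worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus

    # Enough buckets that each task's share of the deltas fits in its object
    # store slice, but never fewer than min(worker_cpus, 256).
    min_hash_bucket_count = int(
        max(
            math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
            min(worker_cpus, 256),
        )
    )
    print(min_hash_bucket_count)  # 128 for the hypothetical numbers above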
deltacat/compute/compactor/utils/primary_key_index.py
@@ -1,48 +1,54 @@
- import logging
  import json
- import ray
- import pyarrow as pa
- import numpy as np
- import s3fs
+ import logging
  from collections import defaultdict
+ from typing import Any, Callable, Dict, List, Optional, Tuple

- from deltacat.utils.common import ReadKwargsProvider
+ import numpy as np
+ import pyarrow as pa
+ import ray
+ import s3fs
  from ray import cloudpickle
+ from ray.types import ObjectRef

- from deltacat.storage import Manifest, PartitionLocator
- from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
- round_robin_options_provider
- from deltacat.compute.compactor import PyArrowWriteResult, \
- RoundCompletionInfo, PrimaryKeyIndexMeta, PrimaryKeyIndexLocator, \
- PrimaryKeyIndexVersionMeta, PrimaryKeyIndexVersionLocator
+ from deltacat import logs
+ from deltacat.aws import s3u
+ from deltacat.compute.compactor import (
+ PrimaryKeyIndexLocator,
+ PrimaryKeyIndexMeta,
+ PrimaryKeyIndexVersionLocator,
+ PrimaryKeyIndexVersionMeta,
+ PyArrowWriteResult,
+ RoundCompletionInfo,
+ )
+ from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
+ from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
  from deltacat.compute.compactor.utils import round_completion_file as rcf
  from deltacat.compute.compactor.utils import system_columns as sc
- from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb, \
- rewrite_index as ri
- from deltacat.types.tables import get_table_writer, get_table_slicer
- from deltacat.types.media import ContentType, ContentEncoding
- from deltacat.aws import s3u
- from deltacat import logs
-
- from typing import Any, Callable, Dict, List, Optional, Tuple
-
- from ray.types import ObjectRef
+ from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
+ from deltacat.storage import Manifest, PartitionLocator
+ from deltacat.types.media import ContentEncoding, ContentType
+ from deltacat.types.tables import get_table_slicer, get_table_writer
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.utils.ray_utils.concurrency import invoke_parallel

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def rehash(
- options_provider: Callable[[int, Any], Dict[str, Any]],
- s3_bucket: str,
- source_partition_locator: PartitionLocator,
- old_rci: RoundCompletionInfo,
- new_hash_bucket_count: int,
- hash_bucket_index_group_count: int,
- records_per_primary_key_index_file: int,
- delete_old_primary_key_index: bool) -> RoundCompletionInfo:
-
- logger.info(f"Rehashing primary key index. Old round completion info: "
- f"{old_rci}. New hash bucket count: {new_hash_bucket_count}")
+ options_provider: Callable[[int, Any], Dict[str, Any]],
+ s3_bucket: str,
+ source_partition_locator: PartitionLocator,
+ old_rci: RoundCompletionInfo,
+ new_hash_bucket_count: int,
+ hash_bucket_index_group_count: int,
+ records_per_primary_key_index_file: int,
+ delete_old_primary_key_index: bool,
+ ) -> RoundCompletionInfo:
+
+ logger.info(
+ f"Rehashing primary key index. Old round completion info: "
+ f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
+ )

  # collect old primary key index information
  old_pki_version_locator = old_rci.primary_key_index_version_locator
@@ -50,10 +56,12 @@ def rehash(
  old_pki_meta = old_pkiv_meta.primary_key_index_meta
  old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
  if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
- raise ValueError(f"Primary key index rehash failed. Old hash bucket "
- f"count ({new_hash_bucket_count}) is "
- f"equal to new hash bucket count. Partition: "
- f"{old_compacted_partition_locator}.")
+ raise ValueError(
+ f"Primary key index rehash failed. Old hash bucket "
+ f"count ({new_hash_bucket_count}) is "
+ f"equal to new hash bucket count. Partition: "
+ f"{old_compacted_partition_locator}."
+ )

  # generate a new unique primary key index version locator to rehash into
  new_pki_meta = PrimaryKeyIndexMeta.of(
@@ -68,7 +76,8 @@ def rehash(
  new_hash_bucket_count,
  )
  rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
- new_pki_version_meta)
+ new_pki_version_meta
+ )

  # launch a rehash task for each bucket of the old primary key index version
  old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
@@ -114,6 +123,7 @@ def rehash(
  PyArrowWriteResult.union(pki_stats),
  old_rci.sort_keys_bit_width,
  rehashed_pki_version_locator,
+ old_rci.rebase_source_partition_locator,
  )
  rcf.write_round_completion_file(
  s3_bucket,
@@ -126,41 +136,48 @@ def rehash(
  s3_bucket,
  old_pki_version_locator,
  )
- logger.info(f"Rehashed primary key index. New round completion info: "
- f"{round_completion_info}.")
+ logger.info(
+ f"Rehashed primary key index. New round completion info: "
+ f"{round_completion_info}."
+ )
  return round_completion_info


  def download_hash_bucket_entries(
- s3_bucket: str,
- hash_bucket_index: int,
- primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
- file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
- -> List[pa.Table]:
-
- pk_index_manifest_s3_url = primary_key_index_version_locator\
- .get_pkiv_hb_index_manifest_s3_url(
+ s3_bucket: str,
+ hash_bucket_index: int,
+ primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+ file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ ) -> List[pa.Table]:
+
+ pk_index_manifest_s3_url = (
+ primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
  s3_bucket,
  hash_bucket_index,
  )
+ )
  result = s3u.download(pk_index_manifest_s3_url, False)
- logger.info(f"Downloading primary key index hash bucket manifest entries: "
- f"{pk_index_manifest_s3_url}. Primary key index version "
- f"locator: {primary_key_index_version_locator}")
+ logger.info(
+ f"Downloading primary key index hash bucket manifest entries: "
+ f"{pk_index_manifest_s3_url}. Primary key index version "
+ f"locator: {primary_key_index_version_locator}"
+ )
  pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
- tables = s3u.download_manifest_entries(pk_index_manifest,
- file_reader_kwargs_provider=file_reader_kwargs_provider)
+ tables = s3u.download_manifest_entries(
+ pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
+ )
  if not tables:
  logger.warning(
  f"Primary key index manifest is empty at: "
  f"{pk_index_manifest_s3_url}. Primary key index version "
- f"locator: {primary_key_index_version_locator}")
+ f"locator: {primary_key_index_version_locator}"
+ )
  return tables


  def delete_primary_key_index_version(
- s3_bucket: str,
- pki_version_locator: PrimaryKeyIndexVersionLocator) -> None:
+ s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
+ ) -> None:

  logger.info(f"Deleting primary key index: {pki_version_locator}")
  s3u.delete_files_by_prefix(
@@ -171,8 +188,8 @@ def delete_primary_key_index_version(


  def group_record_indices_by_hash_bucket(
- pki_table: pa.Table,
- num_buckets: int) -> np.ndarray:
+ pki_table: pa.Table, num_buckets: int
+ ) -> np.ndarray:

  hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
  record_index = 0
@@ -186,9 +203,11 @@ def group_record_indices_by_hash_bucket(


  def group_hash_bucket_indices(
- hash_bucket_object_groups: np.ndarray,
- num_buckets: int,
- num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+ hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+ ) -> Tuple[np.ndarray, List[ObjectRef]]:
+ """
+ Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
+ """

  object_refs = []
  hash_bucket_group_to_obj_id = np.empty([num_groups], dtype="object")
@@ -201,50 +220,70 @@ def group_hash_bucket_indices(
  if obj:
  hb_group = hb_index % num_groups
  if hb_group_to_object[hb_group] is None:
- hb_group_to_object[hb_group] = np.empty(
- [num_buckets], dtype="object")
+ hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
  hb_group_to_object[hb_group][hb_index] = obj

  for hb_group, obj in enumerate(hb_group_to_object):
- if obj is not None:
- obj_ref = ray.put(obj)
- object_refs.append(obj_ref)
- hash_bucket_group_to_obj_id[hb_group] = cloudpickle.dumps(obj_ref)
-
+ if obj is None:
+ continue
+ obj_ref = ray.put(obj)
+ pickled_obj_ref = cloudpickle.dumps(obj_ref)
+ object_refs.append(pickled_obj_ref)
+ hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
+ # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
+ # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
+ # (e.g., if the ObjectRef is deserialized by a non-Ray process).
+ # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
+ # The object now has a permanent reference and the data can't be freed from Ray's object store.
+ # Manually deleting the untrackable object references offsets these permanent references and
+ # helps to allow these objects to be garbage collected normally.
+ del obj_ref
+ del pickled_obj_ref
  return hash_bucket_group_to_obj_id, object_refs


- def pk_digest_to_hash_bucket_index(
- digest,
- num_buckets: int) -> int:
+ def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
+ """
+ Deterministically get the hash bucket a particular digest belongs to
+ based on number of total hash buckets.
+ """

  return int.from_bytes(digest, "big") % num_buckets


  def write_primary_key_index_files(
- table: pa.Table,
- primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
- s3_bucket: str,
- hb_index: int,
- records_per_index_file: int) -> PyArrowWriteResult:
+ table: pa.Table,
+ primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+ s3_bucket: str,
+ hb_index: int,
+ records_per_index_file: int,
+ ) -> PyArrowWriteResult:
  """
  Writes primary key index files for the given hash bucket index out to the
  specified S3 bucket at the path identified by the given primary key index
  version locator. Output is written as 1 or more Parquet files with the
  given maximum number of records per file.
+
+ TODO(raghumdani): Support writing primary key index to any data catalog
  """
- logger.info(f"Writing primary key index files for hash bucket {hb_index}. "
- f"Primary key index version locator: "
- f"{primary_key_index_version_locator}.")
+ logger.info(
+ f"Writing primary key index files for hash bucket {hb_index}. "
+ f"Primary key index version locator: "
+ f"{primary_key_index_version_locator}."
+ )
  s3_file_system = s3fs.S3FileSystem(
  anon=False,
  s3_additional_kwargs={
  "ContentType": ContentType.PARQUET.value,
  "ContentEncoding": ContentEncoding.IDENTITY.value,
- }
+ },
+ config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
+ )
+ pkiv_hb_index_s3_url_base = (
+ primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
+ s3_bucket, hb_index
+ )
  )
- pkiv_hb_index_s3_url_base = primary_key_index_version_locator\
- .get_pkiv_hb_index_s3_url_base(s3_bucket, hb_index)
  manifest_entries = s3u.upload_sliced_table(
  table,
  pkiv_hb_index_s3_url_base,
@@ -254,19 +293,21 @@ def write_primary_key_index_files(
  get_table_slicer(table),
  )
  manifest = Manifest.of(manifest_entries)
- pkiv_hb_index_s3_manifest_s3_url = primary_key_index_version_locator\
- .get_pkiv_hb_index_manifest_s3_url(s3_bucket, hb_index)
- s3u.upload(
- pkiv_hb_index_s3_manifest_s3_url,
- str(json.dumps(manifest))
+ pkiv_hb_index_s3_manifest_s3_url = (
+ primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
+ s3_bucket, hb_index
+ )
  )
+ s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
  result = PyArrowWriteResult.of(
  len(manifest_entries),
  table.nbytes,
  manifest.meta.content_length,
  len(table),
  )
- logger.info(f"Wrote primary key index files for hash bucket {hb_index}. "
- f"Primary key index version locator: "
- f"{primary_key_index_version_locator}. Result: {result}")
+ logger.info(
+ f"Wrote primary key index files for hash bucket {hb_index}. "
+ f"Primary key index version locator: "
+ f"{primary_key_index_version_locator}. Result: {result}"
+ )
  return result
1
- import logging
2
1
  import json
2
+ import logging
3
3
 
4
- from deltacat.storage import PartitionLocator
5
- from deltacat.compute.compactor import RoundCompletionInfo
6
4
  from deltacat import logs
5
+ from deltacat.compute.compactor import RoundCompletionInfo
6
+ from deltacat.storage import PartitionLocator
7
7
 
8
8
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
9
9
 
10
10
 
11
11
  def get_round_completion_file_s3_url(
12
- bucket: str,
13
- source_partition_locator: PartitionLocator,
14
- pki_root_path: str) -> str:
12
+ bucket: str, source_partition_locator: PartitionLocator, pki_root_path: str
13
+ ) -> str:
15
14
 
16
15
  base_url = source_partition_locator.path(f"s3://{bucket}")
17
16
  return f"{base_url}/{pki_root_path}.json"
18
17
 
19
18
 
20
19
  def read_round_completion_file(
21
- bucket: str,
22
- source_partition_locator: PartitionLocator,
23
- primary_key_index_root_path: str) -> RoundCompletionInfo:
20
+ bucket: str,
21
+ source_partition_locator: PartitionLocator,
22
+ primary_key_index_root_path: str,
23
+ ) -> RoundCompletionInfo:
24
24
 
25
25
  from deltacat.aws import s3u as s3_utils
26
+
26
27
  round_completion_file_url = get_round_completion_file_s3_url(
27
28
  bucket,
28
29
  source_partition_locator,
29
30
  primary_key_index_root_path,
30
31
  )
31
- logger.info(
32
- f"reading round completion file from: {round_completion_file_url}")
32
+ logger.info(f"reading round completion file from: {round_completion_file_url}")
33
33
  round_completion_info = None
34
34
  result = s3_utils.download(round_completion_file_url, False)
35
35
  if result:
@@ -40,24 +40,23 @@ def read_round_completion_file(


  def write_round_completion_file(
- bucket: str,
- source_partition_locator: PartitionLocator,
- primary_key_index_root_path: str,
- round_completion_info: RoundCompletionInfo):
+ bucket: str,
+ source_partition_locator: PartitionLocator,
+ primary_key_index_root_path: str,
+ round_completion_info: RoundCompletionInfo,
+ ) -> str:

  from deltacat.aws import s3u as s3_utils
- logger.info(
- f"writing round completion file contents: {round_completion_info}")
+
+ logger.info(f"writing round completion file contents: {round_completion_info}")
  round_completion_file_s3_url = get_round_completion_file_s3_url(
  bucket,
  source_partition_locator,
  primary_key_index_root_path,
  )
- logger.info(
- f"writing round completion file to: {round_completion_file_s3_url}")
+ logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
  s3_utils.upload(
- round_completion_file_s3_url,
- str(json.dumps(round_completion_info))
+ round_completion_file_s3_url, str(json.dumps(round_completion_info))
  )
- logger.info(
- f"round completion file written to: {round_completion_file_s3_url}")
+ logger.info(f"round completion file written to: {round_completion_file_s3_url}")
+ return round_completion_file_s3_url
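For orientation, the round completion file URL built by get_round_completion_file_s3_url above is just the partition's S3 path with the primary key index root path appended as a .json key, and as of this diff write_round_completion_file also returns that URL. A rough illustration with hypothetical values follows; the partition_path string stands in for whatever PartitionLocator.path(f"s3://{bucket}") would return.

    # Hypothetical values; deltacat's PartitionLocator.path() normally produces
    # the partition_path portion of the URL.
    bucket = "my-compaction-bucket"
    partition_path = "my_namespace/my_table/1/stream-uuid/0"
    pki_root_path = "pki-root-uuid"

    base_url = f"s3://{bucket}/{partition_path}"
    round_completion_file_s3_url = f"{base_url}/{pki_root_path}.json"
    # -> s3://my-compaction-bucket/my_namespace/my_table/1/stream-uuid/0/pki-root-uuid.json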