deltacat 0.1.18b11__py3-none-any.whl → 0.1.18b13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "0.1.18b11"
+__version__ = "0.1.18b13"
 
 
 __all__ = [
deltacat/compute/compactor/repartition_session.py CHANGED
@@ -144,7 +144,9 @@ def repartition(
     logger.info(f"repartition {repar_end - repar_start} seconds")
     logger.info(f"Got {len(ordered_deltas)} task results.")
     # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
-    merged_delta = Delta.merge_deltas(ordered_deltas)
+    merged_delta = Delta.merge_deltas(
+        ordered_deltas, stream_position=last_stream_position_to_compact
+    )
    compacted_delta = deltacat_storage.commit_delta(
        merged_delta, properties=kwargs.get("properties", {})
    )
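
Note on the repartition_session.py change above: a minimal sketch of what passing stream_position to Delta.merge_deltas accomplishes, assuming the argument simply pins the merged delta to last_stream_position_to_compact instead of letting it inherit a position from the per-task deltas. The SimpleDelta class and merge helper below are illustrative stand-ins, not DeltaCat APIs:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class SimpleDelta:
        # Hypothetical stand-in for deltacat.storage.Delta, for illustration only.
        manifest_entries: List[str]
        stream_position: Optional[int] = None

    def merge(deltas: List[SimpleDelta], stream_position: Optional[int] = None) -> SimpleDelta:
        # Mirrors the intent of Delta.merge_deltas(ordered_deltas, stream_position=...):
        # concatenate the inputs and, when provided, stamp the result with an explicit
        # stream position instead of deriving one from the inputs.
        entries = [e for d in deltas for e in d.manifest_entries]
        return SimpleDelta(entries, stream_position)

    merged = merge(
        [SimpleDelta(["cold1"]), SimpleDelta(["hot1"])],
        stream_position=42,  # e.g., last_stream_position_to_compact
    )
    print(merged.stream_position)  # 42
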
deltacat/compute/compactor/steps/repartition.py CHANGED
@@ -2,6 +2,7 @@ import importlib
 import logging
 from contextlib import nullcontext
 import pyarrow.compute as pc
+from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
 from typing import List, Optional
 from deltacat.types.media import StorageType, ContentType
@@ -93,7 +94,9 @@ def repartition_range(
     if not all(column in table.column_names for table in tables):
         raise ValueError(f"Column {column} does not exist in the table")
     partition_ranges.sort()
-    partition_ranges = [-float("Inf")] + partition_ranges + [float("Inf")]
+    partition_ranges = (
+        [SIGNED_INT64_MIN_VALUE] + partition_ranges + [SIGNED_INT64_MAX_VALUE]
+    )
     partitioned_tables_list = [[] for _ in range(len(partition_ranges) - 1)]
 
     total_record_count = 0
@@ -106,6 +109,7 @@ def repartition_range(
             pa.field(col_name_int64, pa.int64()),
             pc.cast(table[column], pa.int64()),
         )
+        null_row_table = table_new.filter(pc.field(col_name_int64).is_null())
         # Iterate over pairs of values in partition_ranges
         for i, (lower_limit, upper_limit) in enumerate(
             zip(partition_ranges[:-1], partition_ranges[1:]), start=0
@@ -117,12 +121,19 @@ def repartition_range(
                     & (pc.field(col_name_int64) <= pc.scalar(upper_limit))
                 )
             )
+            if i == 0:
+                partitioned_tables_list[i].append(null_row_table)
+
     partition_table_length = 0
     # After re-grouping the tables by specified ranges, for each group, we need concat and stage the tables
     partition_deltas: List[Delta] = []
     for partition_tables in partitioned_tables_list:
         if len(partition_tables) > 0:
-            partition_table: pa.Table = pa.concat_tables(partition_tables)
+            print(f"column to be dropped: {col_name_int64}")
+            partition_table: pa.Table = pa.concat_tables(partition_tables).drop(
+                [col_name_int64]
+            )
+            assert col_name_int64 not in partition_table.schema.names
             if len(partition_table) > 0:
                 partition_table_length += len(partition_table)
                 partition_delta: Delta = deltacat_storage.stage_delta(
@@ -136,6 +147,7 @@ def repartition_range(
     assert (
         partition_table_length == total_record_count
     ), f"Repartitioned table should have the same number of records {partition_table_length} as the original table {total_record_count}"
+
     return RepartitionResult(
         range_deltas=partition_deltas,
     )
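
Taken together, the repartition_range changes above do three things: replace the float Inf sentinels with int64 bounds so the open-ended first and last ranges are expressed in the same type the partition column is cast to, route rows whose partition column is null into the first range bucket instead of silently dropping them, and drop the temporary int64 helper column (asserting it is gone) before staging each delta. A small standalone sketch of the filter and null behavior, using only pyarrow calls that already appear in the diff; the column name and values are made up:

    import pyarrow as pa
    import pyarrow.compute as pc

    SIGNED_INT64_MIN_VALUE = -(2**63)  # mirrors the new deltacat.constants values
    SIGNED_INT64_MAX_VALUE = 2**63 - 1

    table = pa.table({"last_updated": pa.array([None, 1, 5, 9], type=pa.int64())})

    # First range: (int64 min, 5]. A null value never satisfies a range predicate,
    # which is why the new code appends the null-row table to bucket 0 explicitly.
    in_first_range = table.filter(
        (pc.field("last_updated") > pc.scalar(SIGNED_INT64_MIN_VALUE))
        & (pc.field("last_updated") <= pc.scalar(5))
    )
    null_rows = table.filter(pc.field("last_updated").is_null())
    print(len(in_first_range), len(null_rows))  # 2 1
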
deltacat/compute/compactor/utils/primary_key_index.py CHANGED
@@ -1,180 +1,21 @@
-import json
 import logging
-from collections import defaultdict
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import pyarrow as pa
-import ray
-import s3fs
 from ray.types import ObjectRef
 
 from deltacat import logs
 from deltacat.aws import s3u
 from deltacat.compute.compactor import (
-    PrimaryKeyIndexLocator,
-    PrimaryKeyIndexMeta,
     PrimaryKeyIndexVersionLocator,
-    PrimaryKeyIndexVersionMeta,
-    PyArrowWriteResult,
-    RoundCompletionInfo,
 )
-from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
-from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
-from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
-from deltacat.storage import Manifest, PartitionLocator
-from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.types.tables import get_table_slicer, get_table_writer
-from deltacat.utils.common import ReadKwargsProvider
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat.io.object_store import IObjectStore
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def rehash(
-    options_provider: Callable[[int, Any], Dict[str, Any]],
-    s3_bucket: str,
-    source_partition_locator: PartitionLocator,
-    old_rci: RoundCompletionInfo,
-    new_hash_bucket_count: int,
-    hash_bucket_index_group_count: int,
-    records_per_primary_key_index_file: int,
-    delete_old_primary_key_index: bool,
-) -> RoundCompletionInfo:
-
-    logger.info(
-        f"Rehashing primary key index. Old round completion info: "
-        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
-    )
-
-    # collect old primary key index information
-    old_pki_version_locator = old_rci.primary_key_index_version_locator
-    old_pkiv_meta = old_pki_version_locator.primary_key_index_version_meta
-    old_pki_meta = old_pkiv_meta.primary_key_index_meta
-    old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
-    if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-            f"Primary key index rehash failed. Old hash bucket "
-            f"count ({new_hash_bucket_count}) is "
-            f"equal to new hash bucket count. Partition: "
-            f"{old_compacted_partition_locator}."
-        )
-
-    # generate a new unique primary key index version locator to rehash into
-    new_pki_meta = PrimaryKeyIndexMeta.of(
-        old_compacted_partition_locator,
-        old_pki_meta.primary_keys,
-        old_pki_meta.sort_keys,
-        old_pki_meta.primary_key_index_algorithm_version,
-    )
-    new_pki_locator = PrimaryKeyIndexLocator.of(new_pki_meta)
-    new_pki_version_meta = PrimaryKeyIndexVersionMeta.of(
-        new_pki_meta,
-        new_hash_bucket_count,
-    )
-    rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
-    )
-
-    # launch a rehash task for each bucket of the old primary key index version
-    old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
-    hb_tasks_pending = invoke_parallel(
-        items=range(old_hash_bucket_count),
-        ray_task=rb.rehash_bucket,
-        max_parallelism=None,
-        options_provider=options_provider,
-        s3_bucket=s3_bucket,
-        old_pki_version_locator=old_pki_version_locator,
-        num_buckets=new_hash_bucket_count,
-        num_groups=hash_bucket_index_group_count,
-    )
-    logger.info(f"Getting {len(hb_tasks_pending)} rehash bucket results...")
-    hb_results = ray.get([t[0] for t in hb_tasks_pending])
-    logger.info(f"Got {len(hb_results)} rehash bucket results.")
-    all_hash_group_idx_to_obj_id = defaultdict(list)
-    for hash_group_idx_to_obj_id in hb_results:
-        for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
-            if object_id:
-                all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-    hash_group_count = len(all_hash_group_idx_to_obj_id)
-    logger.info(f"Rehash bucket groups created: {hash_group_count}")
-
-    # write primary key index files for each rehashed output bucket
-    pki_stats_promises = invoke_parallel(
-        items=all_hash_group_idx_to_obj_id.values(),
-        ray_task=ri.rewrite_index,
-        max_parallelism=None,
-        options_provider=options_provider,
-        s3_bucket=s3_bucket,
-        new_primary_key_index_version_locator=rehashed_pki_version_locator,
-        max_records_per_index_file=records_per_primary_key_index_file,
-    )
-    logger.info(f"Getting {len(pki_stats_promises)} rewrite index results...")
-    pki_stats = ray.get([t[0] for t in pki_stats_promises])
-    logger.info(f"Got {len(pki_stats)} rewrite index results.")
-
-    round_completion_info = RoundCompletionInfo.of(
-        old_rci.high_watermark,
-        old_rci.compacted_delta_locator,
-        old_rci.compacted_pyarrow_write_result,
-        PyArrowWriteResult.union(pki_stats),
-        old_rci.sort_keys_bit_width,
-        rehashed_pki_version_locator,
-        old_rci.rebase_source_partition_locator,
-    )
-    rcf.write_round_completion_file(
-        s3_bucket,
-        source_partition_locator,
-        new_pki_locator.primary_key_index_root_path,
-        round_completion_info,
-    )
-    if delete_old_primary_key_index:
-        delete_primary_key_index_version(
-            s3_bucket,
-            old_pki_version_locator,
-        )
-    logger.info(
-        f"Rehashed primary key index. New round completion info: "
-        f"{round_completion_info}."
-    )
-    return round_completion_info
-
-
-def download_hash_bucket_entries(
-    s3_bucket: str,
-    hash_bucket_index: int,
-    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> List[pa.Table]:
-
-    pk_index_manifest_s3_url = (
-        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
-            s3_bucket,
-            hash_bucket_index,
-        )
-    )
-    result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-        f"Downloading primary key index hash bucket manifest entries: "
-        f"{pk_index_manifest_s3_url}. Primary key index version "
-        f"locator: {primary_key_index_version_locator}"
-    )
-    pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
-    )
-    if not tables:
-        logger.warning(
-            f"Primary key index manifest is empty at: "
-            f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
-        )
-    return tables
-
-
 def delete_primary_key_index_version(
     s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
 ) -> None:
@@ -243,65 +84,3 @@ def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
     """
 
     return int.from_bytes(digest, "big") % num_buckets
-
-
-def write_primary_key_index_files(
-    table: pa.Table,
-    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    s3_bucket: str,
-    hb_index: int,
-    records_per_index_file: int,
-) -> PyArrowWriteResult:
-    """
-    Writes primary key index files for the given hash bucket index out to the
-    specified S3 bucket at the path identified by the given primary key index
-    version locator. Output is written as 1 or more Parquet files with the
-    given maximum number of records per file.
-
-    TODO(raghumdani): Support writing primary key index to any data catalog
-    """
-    logger.info(
-        f"Writing primary key index files for hash bucket {hb_index}. "
-        f"Primary key index version locator: "
-        f"{primary_key_index_version_locator}."
-    )
-    s3_file_system = s3fs.S3FileSystem(
-        anon=False,
-        s3_additional_kwargs={
-            "ContentType": ContentType.PARQUET.value,
-            "ContentEncoding": ContentEncoding.IDENTITY.value,
-        },
-        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
-    )
-    pkiv_hb_index_s3_url_base = (
-        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
-            s3_bucket, hb_index
-        )
-    )
-    manifest_entries = s3u.upload_sliced_table(
-        table,
-        pkiv_hb_index_s3_url_base,
-        s3_file_system,
-        records_per_index_file,
-        get_table_writer(table),
-        get_table_slicer(table),
-    )
-    manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url = (
-        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
-            s3_bucket, hb_index
-        )
-    )
-    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
-    result = PyArrowWriteResult.of(
-        len(manifest_entries),
-        table.nbytes,
-        manifest.meta.content_length,
-        len(table),
-    )
-    logger.info(
-        f"Wrote primary key index files for hash bucket {hb_index}. "
-        f"Primary key index version locator: "
-        f"{primary_key_index_version_locator}. Result: {result}"
-    )
-    return result
deltacat/constants.py CHANGED
@@ -36,6 +36,9 @@ BYTES_PER_GIBIBYTE = 2**30
 BYTES_PER_TEBIBYTE = 2**40
 BYTES_PER_PEBIBYTE = 2**50
 
+SIGNED_INT64_MIN_VALUE = -(2**63)
+SIGNED_INT64_MAX_VALUE = 2**63 - 1
+
 # Inflation multiplier from snappy-compressed parquet to pyarrow.
 # This should be kept larger than actual average inflation multipliers.
 # Note that this is a very rough guess since actual observed pyarrow
@@ -49,8 +52,4 @@ PYARROW_INFLATION_MULTIPLIER = 2.5
 # Inflation multiplier from snappy-compressed parquet to pyarrow for all columns.
 PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6
 
-PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG = {
-    "retries": {"max_attempts": 25, "mode": "standard"}
-}
-
 MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
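
A brief illustration of why dedicated int64 sentinels were added to constants.py (this is my reading of the change, so treat the motivation as an assumption): float("Inf") has no int64 representation, while the new constants are exactly the bounds of a signed 64-bit integer and therefore of Arrow's int64 type:

    import pyarrow as pa
    import pyarrow.compute as pc

    SIGNED_INT64_MIN_VALUE = -(2**63)
    SIGNED_INT64_MAX_VALUE = 2**63 - 1

    # Both sentinels round-trip through Arrow's int64 type unchanged.
    assert pa.scalar(SIGNED_INT64_MAX_VALUE, type=pa.int64()).as_py() == 2**63 - 1

    # Infinity does not: a (default, safe) cast to int64 is rejected.
    try:
        pc.cast(pa.scalar(float("Inf")), pa.int64())
    except pa.ArrowInvalid as err:
        print(f"float('Inf') cannot serve as an int64 range bound: {err}")
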
deltacat/tests/test_repartition.py CHANGED
@@ -188,6 +188,36 @@ class TestRepartitionRange(unittest.TestCase):
         )
         self.assertEqual(len(result.range_deltas), 2)
 
+    def test_null_rows_are_not_dropped(self):
+        # Add null value to the first table
+        tables_with_null = [
+            pa.table(
+                {
+                    "last_updated": [
+                        None,
+                        1678665487112746,
+                        1678665487112747,
+                        1678665487112748,
+                    ]
+                }
+            ),
+            self.tables[1],
+        ]
+
+        result = repartition_range(
+            tables_with_null,
+            self.destination_partition,
+            self.repartition_args,
+            self.max_records_per_output_file,
+            self.repartitioned_file_content_type,
+            self.deltacat_storage,
+        )
+
+        # Assuming range_deltas is a list of DataFrames,
+        # check that the first DataFrame has the null value in the 'last_updated' column
+        # This may need to be adjusted depending on the actual structure of range_deltas
+        self.assertEqual(len(result.range_deltas), 2)
+
 
 if __name__ == "__main__":
     unittest.main()
deltacat/utils/placement.py CHANGED
@@ -229,8 +229,13 @@ class PlacementGroupManager:
 
     def get_current_node_resource_key(self) -> str:
         # on ec2: address="172.31.34.51:6379"
-        # on manta: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
-        current_node_name = ray.experimental.internal_kv.global_gcs_client.address[:-5]
+        # on AWS Glue for Ray: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
+        (
+            current_node_name,
+            _,
+        ) = ray.experimental.internal_kv.global_gcs_client.address.rsplit(
+            ":", 1
+        )  # using rsplit split on the last occurence of delimiter ":"
         for node in ray.nodes():
             if node["NodeName"] == current_node_name:
                 # Found the node.
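
The placement.py fix replaces a fixed-width slice (address[:-5], which assumes the port is exactly four digits) with rsplit on the final colon, so the colons inside an IPv6 address survive. A quick illustration with made-up addresses:

    addresses = [
        "172.31.34.51:6379",                             # IPv4 + port
        "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379",  # IPv6 + port
    ]
    for address in addresses:
        node_name, port = address.rsplit(":", 1)
        # Only the trailing ":<port>" segment is removed, regardless of how many
        # colons appear earlier in the address.
        print(node_name, port)
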
deltacat-0.1.18b13.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b11
+Version: 0.1.18b13
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
deltacat-0.1.18b13.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
-deltacat/__init__.py,sha256=-pROroKFHbLQAMruWQRdiPV5IEfyY12EgCXKDrSBkbw,1811
-deltacat/constants.py,sha256=oMU8ypqvDBTG54-6MLGWrt9iJKTN-HKsSWxEWnWp77c,1969
+deltacat/__init__.py,sha256=_t2_FxNTDhr42lxts3cV8iHgCrw_PAT3pIx7MHSA5Ro,1811
+deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=x7qem7FLujXf-DzPsNcQ-XYkW3cF3A0YGIbxkcpz0Mw,146
 deltacat/logs.py,sha256=yyve_6Y4bLWAdCOnxFOPrSR9FRXwZuh68_rRoPpmg08,5633
 deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,7 +18,7 @@ deltacat/catalog/model/table_definition.py,sha256=tKrM1mmaQlvxqXrLt3QJVZK5BZfaJn
 deltacat/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/__init__.py,sha256=kmWC-Qnw861k7mPhLH4fQEL6CaMeBql2AipHeFqJ2uI,1127
 deltacat/compute/compactor/compaction_session.py,sha256=21Ai6esOqw9nhXIpbVQteLvROIPeiqpDg1iBsOclais,25946
-deltacat/compute/compactor/repartition_session.py,sha256=t76aZ-bZxqPOjkTfCH3wHXR93DYkwXQxojqUdCdERfQ,6923
+deltacat/compute/compactor/repartition_session.py,sha256=IYBygwvoAGAY6uftZ3C4bAW0VKPfGuKjkdbpr6_FnCo,6986
 deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/model/compact_partition_params.py,sha256=QvjH10IsA8O6ufVzwPz-mcw326BT-Zbs29wFGCcGerA,5677
 deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=TKgFFdd38cplihdMtHja-cBTwk3dflEipc8smWtZlGg,25231
@@ -37,13 +37,10 @@ deltacat/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 deltacat/compute/compactor/steps/dedupe.py,sha256=R6p43mOUWgA1t468FS8JU-Wlrr96tt0ccwa0uytuaRY,10063
 deltacat/compute/compactor/steps/hash_bucket.py,sha256=ZzJQWulSOMve7bDZX7ZRuYAl4bSC4U5SJzPhpeGpKB0,9769
 deltacat/compute/compactor/steps/materialize.py,sha256=mXxKSaPL7iYtqP-eiJlFwi8kuywFmiU5FLS2-DW5314,13964
-deltacat/compute/compactor/steps/repartition.py,sha256=lpvxhiTC27MKqUXPN70H5L-FcLA1-yCCElERQq74Zig,9487
-deltacat/compute/compactor/steps/rehash/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor/steps/rehash/rehash_bucket.py,sha256=yh-sBuUI3hqw2vk_nK9o-KDrgSww4oSvAz2hBxTkv8s,1765
-deltacat/compute/compactor/steps/rehash/rewrite_index.py,sha256=-HVM08pk5ROHEgDP-FVty55-a_0dsGRiSnPlNJw7C6Q,1838
+deltacat/compute/compactor/steps/repartition.py,sha256=EH843SI33fporpxQbeBmEQdvogSYmVYih6hUkxXlZ9w,9953
 deltacat/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/utils/io.py,sha256=itraIfLGUFfVFrW-XHnsEEa9GNIJR4VCnav0LyjHons,16543
-deltacat/compute/compactor/utils/primary_key_index.py,sha256=Y8MBkDMS4N9xgJpuqWcdqpdNbfrfycIABrKlGZwfoRM,11359
+deltacat/compute/compactor/utils/primary_key_index.py,sha256=ldcgWqnwCfnGSmUWpe68zvFO7SfOXCrytLTISQ3KwNY,2866
 deltacat/compute/compactor/utils/round_completion_file.py,sha256=DmZfHeAXlQn0DDdcsIHZROHWfyBCKTY3pNUdHzalqkE,2284
 deltacat/compute/compactor/utils/system_columns.py,sha256=I36NAEGwRegv56ouVLwTCCisyoOupDCbbaxtoFDzYTE,8121
 deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -96,7 +93,7 @@ deltacat/storage/model/table.py,sha256=IOu1ZOrdRkVDB-FOxYMRvnNf5TukIDfbdHWTqHYN_
 deltacat/storage/model/table_version.py,sha256=j57er3zlN0_2kwVMpWZ3iouABO-Kl8_Txi0UWIZ0dtk,7034
 deltacat/storage/model/types.py,sha256=-9yPA5wjZf9jOd-iErf4sN-YD-6fbl2z8m8t1lGa0I0,2061
 deltacat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/test_repartition.py,sha256=xzqdfRzZS-bA1yBdPNxelecTFe2MtON5Lrd-jTGZ4Xk,7245
+deltacat/tests/test_repartition.py,sha256=dzFkmSB9QmrqJWj2JVxhHS-sefiOVljTd0vVoGFS_L0,8265
 deltacat/tests/compactor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compactor/test_compact_partition_params.py,sha256=0h0cXNg-1NslQ98Nld7brD1WHHhzzBZR1x16kUd7MdA,8848
 deltacat/tests/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +120,7 @@ deltacat/utils/metrics.py,sha256=1CHb5f9SXvTeKljjGawK6wmyij0HN9X6ixMiTssbT_w,467
 deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
 deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
 deltacat/utils/performance.py,sha256=rC3CPfroZP3T5TbRNZXB9GRBr0F9i2KUeZYL45JBgCU,610
-deltacat/utils/placement.py,sha256=JE6OsW16VonlMhdH5B2IYuLJxItoYguaKpZNgbpMNLw,11066
+deltacat/utils/placement.py,sha256=6ppSypvmkVH5twN-UdAmDaNLJkBaGnJ2DDMv5NmNv4o,11210
 deltacat/utils/pyarrow.py,sha256=dgAruwOpWYSlnJ5w8iJz_NWpfQoZHA_iG-F7CBDieko,18245
 deltacat/utils/resources.py,sha256=fA53NiJOd5rLMtwvuTnqTyq4g59deD6NCGDbX5yIlg8,2908
 deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -132,8 +129,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=AyL7hpvYjkmsz-KcpYjVgPpNsmu-x8-rl
 deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
-deltacat-0.1.18b11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-0.1.18b11.dist-info/METADATA,sha256=H0JKD8faSlKHvi44zSoCE9cA8IXKzwxydHL49eVt3vI,1558
-deltacat-0.1.18b11.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
-deltacat-0.1.18b11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-0.1.18b11.dist-info/RECORD,,
+deltacat-0.1.18b13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-0.1.18b13.dist-info/METADATA,sha256=BxNxkho94qIqKJWLh4ShgkrA1BPKKQd1so2e3YV8z5U,1558
+deltacat-0.1.18b13.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
+deltacat-0.1.18b13.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-0.1.18b13.dist-info/RECORD,,
deltacat/compute/compactor/steps/rehash/rehash_bucket.py DELETED
@@ -1,57 +0,0 @@
-import logging
-from typing import List, Tuple
-
-import numpy as np
-import pyarrow as pa
-import ray
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
-from deltacat.compute.compactor.utils import primary_key_index as pki
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def group_file_records_by_pk_hash_bucket(
-    pki_table: pa.Table, num_buckets: int
-) -> np.ndarray:
-    # generate the new table for each new hash bucket
-    hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
-        pki_table,
-        num_buckets,
-    )
-    hash_bucket_to_table = np.empty([num_buckets], dtype="object")
-    for hash_bucket, indices in enumerate(hash_bucket_to_indices):
-        if indices:
-            hash_bucket_to_table[hash_bucket] = pki_table.take(indices)
-    return hash_bucket_to_table
-
-
-@ray.remote(num_cpus=1, num_returns=2)
-def rehash_bucket(
-    hash_bucket_index: int,
-    s3_bucket: str,
-    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-    num_buckets: int,
-    num_groups: int,
-) -> Tuple[np.ndarray, List[ObjectRef]]:
-
-    logger.info(f"Starting rehash bucket task...")
-    tables = pki.download_hash_bucket_entries(
-        s3_bucket,
-        hash_bucket_index,
-        old_pki_version_locator,
-    )
-    prior_pk_index_table = pa.concat_tables(tables)
-    hash_bucket_to_table = group_file_records_by_pk_hash_bucket(
-        prior_pk_index_table,
-        num_buckets,
-    )
-    hash_bucket_group_to_obj_id, object_refs = pki.group_hash_bucket_indices(
-        hash_bucket_to_table,
-        num_buckets,
-        num_groups,
-    )
-    logger.info(f"Finished rehash bucket task...")
-    return hash_bucket_group_to_obj_id, object_refs
deltacat/compute/compactor/steps/rehash/rewrite_index.py DELETED
@@ -1,48 +0,0 @@
-import logging
-from collections import defaultdict
-from typing import Any, List, Tuple
-
-import pyarrow as pa
-import ray
-from ray import cloudpickle
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
-from deltacat.compute.compactor.utils import primary_key_index as pki
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-@ray.remote(num_cpus=1, num_returns=2)
-def rewrite_index(
-    object_ids: List[Any],
-    s3_bucket: str,
-    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    max_records_per_index_file: int,
-) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:
-
-    logger.info(f"Starting rewrite primary key index task...")
-    object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
-    logger.info(f"Getting table groups object refs...")
-    table_groups_list = ray.get(object_refs)
-    logger.info(f"Got {len(table_groups_list)} table groups object refs...")
-    hb_index_to_tables = defaultdict(list)
-    for table_groups in table_groups_list:
-        for hb_index, table in enumerate(table_groups):
-            if table is not None:
-                hb_index_to_tables[hb_index].append(table)
-    logger.info(f"Running {len(hb_index_to_tables)} rewrite index rounds...")
-    pki_stats = []
-    for hb_index, tables in hb_index_to_tables.items():
-        table = pa.concat_tables(tables)
-        hb_pki_stats = pki.write_primary_key_index_files(
-            table,
-            new_primary_key_index_version_locator,
-            s3_bucket,
-            hb_index,
-            max_records_per_index_file,
-        )
-        pki_stats.append(hb_pki_stats)
-    logger.info(f"Finished rewrite primary key index task...")
-    return PyArrowWriteResult.union(pki_stats), object_refs