deltacat 1.1.36__py3-none-any.whl → 1.1.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.36"
47
+ __version__ = "1.1.37"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -78,13 +78,25 @@ def _append_table_by_hash_bucket(
78
78
  f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
79
79
  )
80
80
 
81
+ hb_pk_grouped_by = hb_pk_grouped_by.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
81
82
  group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
82
83
  hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
83
84
 
84
85
  result_len = 0
85
86
  for i, group_count in enumerate(group_count_array):
86
87
  hb_idx = hb_group_array[i].as_py()
87
- pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
88
+ group_count_py = group_count.as_py()
89
+ pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count_py)
90
+ assert group_count_py == len(
91
+ pyarrow_table
92
+ ), f"Group count {group_count_py} not equal to {len(pyarrow_table)}"
93
+ all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
94
+ assert (
95
+ len(all_buckets) == 1
96
+ ), f"Only one hash bucket is allowed by found {len(all_buckets)}"
97
+ assert (
98
+ all_buckets[0].as_py() == hb_idx
99
+ ), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
88
100
  pyarrow_table = pyarrow_table.drop(
89
101
  [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
90
102
  )
@@ -141,6 +153,7 @@ def _optimized_group_record_batches_by_hash_bucket(
141
153
  record_batches.append(record_batch)
142
154
 
143
155
  if record_batches:
156
+ print(f"{len(record_batches)} -- END")
144
157
  appended_len, append_latency = timed_invocation(
145
158
  _append_table_by_hash_bucket,
146
159
  pa.Table.from_batches(record_batches),
@@ -170,6 +170,10 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
170
170
  operation_type == OperationType.PYARROW_DOWNLOAD
171
171
  ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
172
172
 
173
+ if not estimate_resources_params.max_files_to_sample:
174
+ # we cannot calculate if we cannot sample
175
+ return None
176
+
173
177
  if not delta.manifest:
174
178
  delta.manifest = deltacat_storage.get_delta_manifest(
175
179
  delta.locator,
@@ -186,10 +190,6 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
186
190
  ),
187
191
  )
188
192
 
189
- if not estimate_resources_params.max_files_to_sample:
190
- # we cannot calculate if we cannot sample
191
- return None
192
-
193
193
  sampled_in_memory_size = 0.0
194
194
  sampled_on_disk_size = 0.0
195
195
  sampled_num_rows = 0
@@ -252,6 +252,10 @@ RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
252
252
  _estimate_resources_required_to_process_delta_using_file_sampling,
253
253
  _estimate_resources_required_to_process_delta_using_previous_inflation,
254
254
  ],
255
+ ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION: [
256
+ _estimate_resources_required_to_process_delta_using_file_sampling,
257
+ _estimate_resources_required_to_process_delta_using_previous_inflation,
258
+ ],
255
259
  }
256
260
 
257
261
 
@@ -23,6 +23,14 @@ class ResourceEstimationMethod(str, Enum):
23
23
  """
24
24
  DEFAULT_V2 = "DEFAULT_V2"
25
25
 
26
+ """
27
+ This approach combines file sampling estimation and inflation based methods
28
+ and runs them in the order specified below:
29
+ 1. FILE_SAMPLING
30
+ 2. PREVIOUS_INFLATION
31
+ """
32
+ FILE_SAMPLING_WITH_PREVIOUS_INFLATION = "FILE_SAMPLING_WITH_PREVIOUS_INFLATION"
33
+
26
34
  """
27
35
  This approach strictly uses previous inflation and average record size to arrive
28
36
  at a resource estimate. It requires users to pass in previous inflation and average
@@ -416,6 +416,29 @@ class TestEstimateResourcesRequiredToProcessDelta:
416
416
  == delta_without_manifest.meta.content_length
417
417
  )
418
418
 
419
+ def test_empty_delta_sampled_when_file_sampling_with_previous_inflation(
420
+ self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
421
+ ):
422
+ params = EstimateResourcesParams.of(
423
+ resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
424
+ max_files_to_sample=2,
425
+ )
426
+
427
+ result = estimate_resources_required_to_process_delta(
428
+ delta=delta_without_manifest,
429
+ operation_type=OperationType.PYARROW_DOWNLOAD,
430
+ deltacat_storage=ds,
431
+ deltacat_storage_kwargs=local_deltacat_storage_kwargs,
432
+ estimate_resources_params=params,
433
+ )
434
+
435
+ assert delta_without_manifest.manifest is not None
436
+ assert result.memory_bytes is not None
437
+ assert (
438
+ result.statistics.on_disk_size_bytes
439
+ == delta_without_manifest.meta.content_length
440
+ )
441
+
419
442
  def test_delta_manifest_parquet_when_file_sampling(
420
443
  self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
421
444
  ):
@@ -437,6 +460,27 @@ class TestEstimateResourcesRequiredToProcessDelta:
437
460
  == parquet_delta_with_manifest.meta.content_length
438
461
  )
439
462
 
463
+ def test_delta_manifest_parquet_when_file_sampling_with_previous_inflation(
464
+ self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
465
+ ):
466
+ params = EstimateResourcesParams.of(
467
+ resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
468
+ max_files_to_sample=2,
469
+ )
470
+
471
+ result = estimate_resources_required_to_process_delta(
472
+ delta=parquet_delta_with_manifest,
473
+ operation_type=OperationType.PYARROW_DOWNLOAD,
474
+ deltacat_storage=ds,
475
+ deltacat_storage_kwargs=local_deltacat_storage_kwargs,
476
+ estimate_resources_params=params,
477
+ )
478
+ assert result.memory_bytes is not None
479
+ assert (
480
+ result.statistics.on_disk_size_bytes
481
+ == parquet_delta_with_manifest.meta.content_length
482
+ )
483
+
440
484
  def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
441
485
  self,
442
486
  local_deltacat_storage_kwargs,
@@ -512,6 +556,28 @@ class TestEstimateResourcesRequiredToProcessDelta:
512
556
  )
513
557
  assert result is None
514
558
 
559
+ def test_delta_manifest_utsv_when_file_sampling_with_previous_inflation_zero_files_to_sample(
560
+ self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
561
+ ):
562
+ previous_inflation = 7
563
+ params = EstimateResourcesParams.of(
564
+ resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
565
+ max_files_to_sample=None,
566
+ previous_inflation=previous_inflation,
567
+ )
568
+
569
+ result = estimate_resources_required_to_process_delta(
570
+ delta=utsv_delta_with_manifest,
571
+ operation_type=OperationType.PYARROW_DOWNLOAD,
572
+ deltacat_storage=ds,
573
+ deltacat_storage_kwargs=local_deltacat_storage_kwargs,
574
+ estimate_resources_params=params,
575
+ )
576
+ assert result is not None
577
+ assert result.memory_bytes == (
578
+ utsv_delta_with_manifest.meta.content_length * previous_inflation
579
+ )
580
+
515
581
  def test_empty_delta_when_default_v2(
516
582
  self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
517
583
  ):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.36
3
+ Version: 1.1.37
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=9vJMHGceWew6atD_3VqKurlBJ3crD5mwAQIgSB1yjNY,1778
1
+ deltacat/__init__.py,sha256=u00X92zHfZJzS08a-2kx3kCLcz40L-THm0HowDiBOiA,1778
2
2
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
3
3
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
4
4
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -76,7 +76,7 @@ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hd
76
76
  deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
77
77
  deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
78
78
  deltacat/compute/compactor_v2/utils/merge.py,sha256=fAzEYwQYH2ia8MLdEFdZFivWHpi6qZu8AyyEK0H0vwE,5363
79
- deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
79
+ deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=Qsn0BQrlBWSLqu4srd-LJUX8BaVqG6Wo1oAros7LYWw,12677
80
80
  deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
81
81
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
82
82
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -85,9 +85,9 @@ deltacat/compute/merge_on_read/model/merge_on_read_params.py,sha256=Q51znagh8PtL
85
85
  deltacat/compute/merge_on_read/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
86
86
  deltacat/compute/merge_on_read/utils/delta.py,sha256=e4BtOHa5XPpUnR4r0HqBKjXckBsTI8qBwdUWwpJfkWQ,1367
87
87
  deltacat/compute/resource_estimation/__init__.py,sha256=4bfBXcq-VAt9JCmjvj3yAmn0lEHVGdGsUCCoMGxjEqA,799
88
- deltacat/compute/resource_estimation/delta.py,sha256=dN64jbUQ8OI1BTz4fYGbulJLWjKjdT-XvwDJNLM__Oo,10583
88
+ deltacat/compute/resource_estimation/delta.py,sha256=zd1ivoA3EzdrjgJYYBXY3wrhwZDlt-Xoqke0e5xz6AY,10815
89
89
  deltacat/compute/resource_estimation/manifest.py,sha256=gSqOyIda-pYq3vRsKFq3IiZvwhV3mMqrWPtsmUH9dD8,13035
90
- deltacat/compute/resource_estimation/model.py,sha256=psyagFXdpLGt8DfDqy7c8DWiuXCacr0Swe5f0M7DdO4,5465
90
+ deltacat/compute/resource_estimation/model.py,sha256=1svgVfhNIAyyVkHy-QXcOzO0UVigbVH8M7xyAlgvCbg,5741
91
91
  deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
92
92
  deltacat/compute/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
93
  deltacat/compute/stats/types.py,sha256=cp0lT8nITTKbnkc03OysRjXfcfXzQml9a4wqCnR6kqs,215
@@ -159,7 +159,7 @@ deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoi
159
159
  deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
160
160
  deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
161
161
  deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
- deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
162
+ deltacat/tests/compute/resource_estimation/test_delta.py,sha256=vbqKwZOxrNtfbuXWz08nUvi_srR4y2aMQmUwLR2jDcs,28446
163
163
  deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
164
164
  deltacat/tests/compute/resource_estimation/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
165
  deltacat/tests/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
212
212
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
213
213
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
214
214
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
215
- deltacat-1.1.36.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
216
- deltacat-1.1.36.dist-info/METADATA,sha256=wIZbEGHnJWq_TBKi0u463p4-PgG9R_0MApw7IIwmnRc,1733
217
- deltacat-1.1.36.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
218
- deltacat-1.1.36.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
219
- deltacat-1.1.36.dist-info/RECORD,,
215
+ deltacat-1.1.37.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
216
+ deltacat-1.1.37.dist-info/METADATA,sha256=iHlaZ9sS-CrQby0kxCrOigl1ZGZKpniwf9LyYbagwzI,1733
217
+ deltacat-1.1.37.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
218
+ deltacat-1.1.37.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
219
+ deltacat-1.1.37.dist-info/RECORD,,