deltacat 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/s3u.py +6 -0
- deltacat/compute/compactor/compaction_session.py +2 -3
- deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -57
- deltacat/compute/compactor/steps/dedupe.py +2 -2
- deltacat/compute/compactor/steps/hash_bucket.py +2 -2
- deltacat/compute/compactor/steps/materialize.py +2 -2
- deltacat/compute/compactor_v2/compaction_session.py +3 -16
- deltacat/compute/compactor_v2/steps/hash_bucket.py +41 -24
- deltacat/compute/compactor_v2/steps/merge.py +38 -21
- deltacat/compute/compactor_v2/utils/primary_key_index.py +2 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +18 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +4 -1
- deltacat/tests/utils/test_resources.py +21 -0
- deltacat/utils/daft.py +2 -0
- deltacat/utils/pyarrow.py +69 -0
- deltacat/utils/resources.py +58 -2
- {deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/METADATA +2 -2
- {deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/RECORD +22 -22
- {deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/WHEEL +1 -1
- {deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/LICENSE +0 -0
- {deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/s3u.py
CHANGED
@@ -383,6 +383,12 @@ def upload_table(
             # s3fs may swallow S3 errors - we were probably throttled
             raise RetryableError(f"Retry table upload to: {s3_url}") from e
         raise NonRetryableError(f"Failed table upload to: {s3_url}") from e
+    except BaseException as e:
+        logger.warn(
+            f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
+            exc_info=True,
+        )
+        raise e
     return manifest_entries
 
 
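Note: the added handler logs any unexpected upload failure with its traceback before re-raising, so errors surface in worker logs. A minimal, self-contained sketch of this log-and-re-raise pattern (upload_fn is a hypothetical stand-in, and logging's warning() is used in place of the deprecated warn alias):

    import logging

    logger = logging.getLogger(__name__)

    def upload_with_logging(upload_fn, s3_url: str, content_type: str):
        try:
            return upload_fn()
        except BaseException as e:
            # exc_info=True attaches the full traceback to the log record
            logger.warning(
                f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
                exc_info=True,
            )
            raise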
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -52,7 +52,7 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
-from deltacat.utils.resources import
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
 
 
 if importlib.util.find_spec("memray"):
@@ -293,7 +293,6 @@ def _execute_compaction_round(
         f"{node_resource_keys}"
     )
 
-    compaction_audit.set_cluster_cpu_max(cluster_cpus)
     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
     round_robin_opt_provider = functools.partial(
@@ -680,7 +679,7 @@ def _execute_compaction_round(
         [m.pyarrow_write_result for m in mat_results]
     )
 
-    session_peak_memory =
+    session_peak_memory = get_current_process_peak_memory_usage_in_bytes()
     compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
         session_peak_memory
     )
deltacat/compute/compactor/model/compaction_session_audit_info.py
CHANGED
@@ -98,14 +98,6 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("hashBucketCount")
 
-    @property
-    def cluster_cpu_max(self) -> float:
-        """
-        Total cluster cpu allocated for the compaction job. If it is autoscaling cluster,
-        max cpu at any time will be reported.
-        """
-        return self.get("clusterCpuMax")
-
     @property
     def compaction_time_in_seconds(self) -> float:
         """
@@ -423,35 +415,6 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("hashBucketProcessedSizeBytes")
 
-    @property
-    def total_cpu_seconds(self) -> float:
-        """
-        Total number of vCPUs provisioned in the cluster weighted over time.
-        """
-        return self.get("totalCPUSeconds")
-
-    @property
-    def used_cpu_seconds(self) -> float:
-        """
-        Total used vCPU in the cluster weighted over time.
-        """
-        return self.get("usedCPUSeconds")
-
-    @property
-    def used_memory_gb_seconds(self) -> float:
-        """
-        The used memory in the cluster weighted over time. This
-        determines opportunities for better memory estimation.
-        """
-        return self.get("usedMemoryGBSeconds")
-
-    @property
-    def total_memory_gb_seconds(self) -> float:
-        """
-        Total memory in the cluster weighted over time in GB.
-        """
-        return self.get("totalMemoryGBSeconds")
-
     @property
     def pyarrow_version(self) -> str:
         """
@@ -510,10 +473,6 @@ class CompactionSessionAuditInfo(dict):
         self["hashBucketCount"] = hash_bucket_count
         return self
 
-    def set_cluster_cpu_max(self, cluster_cpu_max: float) -> CompactionSessionAuditInfo:
-        self["clusterCpuMax"] = cluster_cpu_max
-        return self
-
     def set_compaction_time_in_seconds(
         self, compaction_time_in_seconds: float
     ) -> CompactionSessionAuditInfo:
@@ -778,22 +737,6 @@ class CompactionSessionAuditInfo(dict):
         self["hashBucketProcessedSizeBytes"] = size
         return self
 
-    def set_total_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
-        self["totalCPUSeconds"] = value
-        return self
-
-    def set_used_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
-        self["usedCPUSeconds"] = value
-        return self
-
-    def set_used_memory_gb_seconds(self, value: float) -> CompactionSessionAuditInfo:
-        self["usedMemoryGBSeconds"] = value
-        return self
-
-    def set_total_memory_gb_seconds(self, value: float) -> CompactionSessionAuditInfo:
-        self["totalMemoryGBSeconds"] = value
-        return self
-
     def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
         self["pyarrowVersion"] = value
         return self
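Note: the removed metrics above all follow the class's dict-backed accessor convention: a read-only property over a camelCase key plus a fluent setter that returns self. A minimal sketch of that pattern (illustrative field name, not the full class):

    class AuditInfo(dict):
        @property
        def peak_memory_used_bytes(self) -> float:
            # properties read camelCase keys from the underlying dict
            return self.get("peakMemoryUsedBytes")

        def set_peak_memory_used_bytes(self, value: float) -> "AuditInfo":
            # setters return self so calls can be chained fluently
            self["peakMemoryUsedBytes"] = value
            return self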
deltacat/compute/compactor/steps/dedupe.py
CHANGED
@@ -25,7 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
 from deltacat.io.object_store import IObjectStore
-from deltacat.utils.resources import
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -228,7 +228,7 @@ def _timed_dedupe(
         f"{len(mat_bucket_to_dd_idx_obj_id)}"
     )
 
-    peak_memory_usage_bytes =
+    peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
     return DedupeResult(
         mat_bucket_to_dd_idx_obj_id,
         np.int64(total_deduped_records),
deltacat/compute/compactor/steps/hash_bucket.py
CHANGED
@@ -32,7 +32,7 @@ from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
 from deltacat.io.object_store import IObjectStore
-from deltacat.utils.resources import
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -228,7 +228,7 @@ def _timed_hash_bucket(
         delta_file_envelope_groups, num_buckets, num_groups, object_store
     )
 
-    peak_memory_usage_bytes =
+    peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
     return HashBucketResult(
         hash_bucket_group_to_obj_id,
         np.int64(total_record_count),
deltacat/compute/compactor/steps/materialize.py
CHANGED
@@ -44,7 +44,7 @@ from deltacat.utils.ray_utils.runtime import (
     get_current_ray_worker_id,
 )
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
-from deltacat.utils.resources import
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -314,7 +314,7 @@ def materialize(
     emit_metrics_time = latency
     logger.info(f"Materialize task ended in {end - start}s")
 
-    peak_memory_usage_bytes =
+    peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
 
     # Merge all new deltas into one for this materialize bucket index
     merged_materialize_result = MaterializeResult.of(
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -41,13 +41,12 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
 from deltacat.utils.resources import (
-
+    get_current_process_peak_memory_usage_in_bytes,
 )
 from deltacat.compute.compactor_v2.utils.task_options import (
     hash_bucket_resource_options_provider,
     merge_resource_options_provider,
 )
-from deltacat.utils.resources import ClusterUtilizationOverTimeRange
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
 
 if importlib.util.find_spec("memray"):
@@ -65,10 +64,9 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
 
     with memray.Tracker(
         f"compaction_partition.bin"
-    ) if params.enable_profiler else nullcontext()
+    ) if params.enable_profiler else nullcontext():
         (new_partition, new_rci, new_rcf_partition_locator,) = _execute_compaction(
             params,
-            cluster_util=cluster_util,
             **kwargs,
         )
 
@@ -469,7 +467,7 @@ def _execute_compaction(
         [m.pyarrow_write_result for m in mat_results]
     )
 
-    session_peak_memory =
+    session_peak_memory = get_current_process_peak_memory_usage_in_bytes()
     compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
         session_peak_memory
     )
@@ -478,17 +476,6 @@ def _execute_compaction(
         mat_results, telemetry_time_hb + telemetry_time_merge
     )
 
-    cluster_util: ClusterUtilizationOverTimeRange = kwargs.get("cluster_util")
-
-    if cluster_util:
-        compaction_audit.set_total_cpu_seconds(cluster_util.total_vcpu_seconds)
-        compaction_audit.set_used_cpu_seconds(cluster_util.used_vcpu_seconds)
-        compaction_audit.set_used_memory_gb_seconds(cluster_util.used_memory_gb_seconds)
-        compaction_audit.set_total_memory_gb_seconds(
-            cluster_util.total_memory_gb_seconds
-        )
-        compaction_audit.set_cluster_cpu_max(cluster_util.max_cpu)
-
     input_inflation = None
     input_average_record_size_bytes = None
     # Note: we only consider inflation for incremental delta
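Note: the `) if params.enable_profiler else nullcontext():` fix above keeps profiling optional via a conditional context manager. A self-contained sketch of that idiom (a generic file handle stands in for memray's Tracker):

    from contextlib import nullcontext

    enable_profiler = False  # hypothetical flag

    # nullcontext() is a no-op context manager, so the with-block body runs
    # unchanged whether or not profiling is enabled
    with open("profile.bin", "wb") if enable_profiler else nullcontext():
        pass  # compaction work would run here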
deltacat/compute/compactor_v2/steps/hash_bucket.py
CHANGED
@@ -27,7 +27,11 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics
-from deltacat.utils.resources import
+from deltacat.utils.resources import (
+    get_current_process_peak_memory_usage_in_bytes,
+    ProcessUtilizationOverTimeRange,
+)
+from deltacat.constants import BYTES_PER_GIBIBYTE
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -166,7 +170,10 @@ def _timed_hash_bucket(input: HashBucketInput):
         object_store=input.object_store,
     )
 
-    peak_memory_usage_bytes =
+    peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
+    logger.info(
+        f"Peak memory usage in bytes after hash bucketing: {peak_memory_usage_bytes}"
+    )
     return HashBucketResult(
         hash_bucket_group_to_obj_id_tuple,
         np.int64(total_size_bytes),
@@ -179,28 +186,38 @@ def _timed_hash_bucket(input: HashBucketInput):
 
 @ray.remote
 def hash_bucket(input: HashBucketInput) -> HashBucketResult:
+    with ProcessUtilizationOverTimeRange() as process_util:
+        logger.info(f"Starting hash bucket task...")
 
-
-
-
-
+        # Log node peak memory utilization every 10 seconds
+        def log_peak_memory():
+            logger.debug(
+                f"Process peak memory utilization so far: {process_util.max_memory} bytes "
+                f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
+            )
+
+        process_util.schedule_callback(log_peak_memory, 10)
 
-
-
-
-
-
-
-
+        hash_bucket_result, duration = timed_invocation(
+            func=_timed_hash_bucket, input=input
+        )
+
+        emit_metrics_time = 0.0
+        if input.metrics_config:
+            emit_result, latency = timed_invocation(
+                func=emit_timer_metrics,
+                metrics_name="hash_bucket",
+                value=duration,
+                metrics_config=input.metrics_config,
+            )
+            emit_metrics_time = latency
+
+        logger.info(f"Finished hash bucket task...")
+        return HashBucketResult(
+            hash_bucket_result[0],
+            hash_bucket_result[1],
+            hash_bucket_result[2],
+            hash_bucket_result[3],
+            np.double(emit_metrics_time),
+            hash_bucket_result[5],
     )
-    emit_metrics_time = latency
-
-    logger.info(f"Finished hash bucket task...")
-    return HashBucketResult(
-        hash_bucket_result[0],
-        hash_bucket_result[1],
-        hash_bucket_result[2],
-        hash_bucket_result[3],
-        np.double(emit_metrics_time),
-        hash_bucket_result[5],
-    )
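Note: hash_bucket (and merge, below) now wrap the task body in ProcessUtilizationOverTimeRange and register a periodic logging callback. A hedged usage sketch of that pattern, assuming only the API visible in this diff (max_memory, schedule_callback):

    from deltacat.utils.resources import ProcessUtilizationOverTimeRange

    def run_task(task_fn):
        with ProcessUtilizationOverTimeRange() as process_util:
            # periodically report the peak memory sampled so far
            def log_peak_memory():
                print(f"peak memory so far: {process_util.max_memory} bytes")

            process_util.schedule_callback(log_peak_memory, 10)  # every 10s
            return task_fn()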
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -30,7 +30,10 @@ from deltacat.compute.compactor.utils import system_columns as sc
 
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics
-from deltacat.utils.resources import
+from deltacat.utils.resources import (
+    get_current_process_peak_memory_usage_in_bytes,
+    ProcessUtilizationOverTimeRange,
+)
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
     hash_group_index_to_hash_bucket_indices,
@@ -44,6 +47,7 @@ from deltacat.storage import (
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
+from deltacat.constants import BYTES_PER_GIBIBYTE
 
 
 if importlib.util.find_spec("memray"):
@@ -436,7 +440,10 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         f"{total_dfes_found} != {len(hb_index_to_delta_file_envelopes_list)}"
     )
 
-    peak_memory_usage_bytes =
+    peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
+    logger.info(
+        f"Peak memory usage in bytes after merge: {peak_memory_usage_bytes}"
+    )
 
     return MergeResult(
         materialized_results,
@@ -449,25 +456,35 @@ def _timed_merge(input: MergeInput) -> MergeResult:
 
 @ray.remote
 def merge(input: MergeInput) -> MergeResult:
+    with ProcessUtilizationOverTimeRange() as process_util:
+        logger.info(f"Starting merge task...")
+
+        # Log node peak memory utilization every 10 seconds
+        def log_peak_memory():
+            logger.debug(
+                f"Process peak memory utilization so far: {process_util.max_memory} bytes "
+                f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
+            )
+
+        process_util.schedule_callback(log_peak_memory, 10)
 
-
-    merge_result, duration = timed_invocation(func=_timed_merge, input=input)
+        merge_result, duration = timed_invocation(func=_timed_merge, input=input)
 
-
-
-
-
-
-
-
+        emit_metrics_time = 0.0
+        if input.metrics_config:
+            emit_result, latency = timed_invocation(
+                func=emit_timer_metrics,
+                metrics_name="merge",
+                value=duration,
+                metrics_config=input.metrics_config,
+            )
+            emit_metrics_time = latency
+
+        logger.info(f"Finished merge task...")
+        return MergeResult(
+            merge_result[0],
+            merge_result[1],
+            merge_result[2],
+            np.double(emit_metrics_time),
+            merge_result[4],
     )
-    emit_metrics_time = latency
-
-    logger.info(f"Finished merge task...")
-    return MergeResult(
-        merge_result[0],
-        merge_result[1],
-        merge_result[2],
-        np.double(emit_metrics_time),
-        merge_result[4],
-    )
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED
@@ -17,6 +17,7 @@ from deltacat import logs
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.io.object_store import IObjectStore
 from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import sliced_string_cast
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -182,7 +183,7 @@ def generate_pk_hash_column(
     def _generate_pk_hash(table: pa.Table) -> pa.Array:
         pk_columns = []
         for pk_name in primary_keys:
-            pk_columns.append(
+            pk_columns.append(sliced_string_cast(table[pk_name]))
 
         pk_columns.append(PK_DELIMITER)
         hash_column = pc.binary_join_element_wise(*pk_columns)
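Note: with this change each primary-key column is cast to string via sliced_string_cast before the columns are joined element-wise into one hash input per row. A small illustrative sketch of the join step, using plain pc.cast rather than deltacat's sliced variant:

    import pyarrow as pa
    import pyarrow.compute as pc

    table = pa.table({"id": [1, 2], "region": ["us", "eu"]})
    pk_columns = [
        pc.cast(table["id"], pa.string()),
        pc.cast(table["region"], pa.string()),
    ]
    # the last argument to binary_join_element_wise is the delimiter
    joined = pc.binary_join_element_wise(*pk_columns, "|")
    print(joined)  # ["1|us", "2|eu"]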
deltacat/tests/compute/test_compact_partition_incremental.py
CHANGED
@@ -6,6 +6,8 @@ import boto3
 from typing import Any, Callable, Dict, List, Optional, Set
 from boto3.resources.base import ServiceResource
 import pyarrow as pa
+from pytest_benchmark.fixture import BenchmarkFixture
+
 from deltacat.tests.compute.test_util_common import (
     get_rcf,
 )
@@ -161,6 +163,7 @@ def test_compact_partition_incremental(
     read_kwargs_provider_param: Any,
     skip_enabled_compact_partition_drivers,
     compact_partition_func: Callable,
+    benchmark: BenchmarkFixture,
 ):
     import deltacat.tests.local_deltacat_storage as ds
     from deltacat.types.media import ContentType
@@ -235,8 +238,22 @@ def test_compact_partition_incremental(
             "sort_keys": sort_keys if sort_keys else None,
         }
     )
+
     # execute
-
+    def _incremental_compaction_setup():
+        """
+        This callable runs right before invoking the benchmark target function (compaction).
+        This is needed as the benchmark module will invoke the target function multiple times
+        in a single test run, which can lead to non-idempotent behavior if RCFs are generated.
+
+        Returns: args, kwargs
+        """
+        setup_s3_resource.Bucket(TEST_S3_RCF_BUCKET_NAME).objects.all().delete()
+        return (compact_partition_params,), {}
+
+    rcf_file_s3_uri = benchmark.pedantic(
+        compact_partition_func, setup=_incremental_compaction_setup
+    )
    # validate
     round_completion_info = get_rcf(setup_s3_resource, rcf_file_s3_uri)
     compacted_delta_locator: DeltaLocator = (
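Note: benchmark.pedantic with a setup callable is what makes this test safe to repeat: pytest-benchmark may invoke the target several times, and the (args, kwargs) tuple the setup returns is forwarded to the target on each round. A hedged, self-contained sketch of that mechanism (toy state instead of S3):

    def test_example(benchmark):
        state = {"n": 0}

        def setup():
            state["n"] = 0        # undo side effects before each round
            return (state,), {}   # (args, kwargs) passed to the target

        def target(s):
            s["n"] += 1
            return s["n"]

        result = benchmark.pedantic(target, setup=setup)
        assert result == 1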
deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py
CHANGED
@@ -5,6 +5,8 @@ import pytest
 import boto3
 from boto3.resources.base import ServiceResource
 import pyarrow as pa
+from pytest_benchmark.fixture import BenchmarkFixture
+
 from deltacat.tests.compute.test_util_constant import (
     BASE_TEST_SOURCE_NAMESPACE,
     BASE_TEST_SOURCE_TABLE_NAME,
@@ -182,6 +184,7 @@ def test_compact_partition_rebase_then_incremental(
     rebase_expected_compact_partition_result: pa.Table,
     skip_enabled_compact_partition_drivers,
     compact_partition_func: Callable,
+    benchmark: BenchmarkFixture,
 ):
     import deltacat.tests.local_deltacat_storage as ds
     from deltacat.types.media import ContentType
@@ -265,7 +268,7 @@ def test_compact_partition_rebase_then_incremental(
         }
     )
     # execute
-    rcf_file_s3_uri = compact_partition_func
+    rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
     compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
         setup_s3_resource, rcf_file_s3_uri
     )
deltacat/tests/utils/test_resources.py
CHANGED
@@ -49,3 +49,24 @@ class TestClusterUtilizationOverTimeRange(unittest.TestCase):
         self.assertIsNotNone(cu.total_memory_gb_seconds)
         self.assertIsNotNone(cu.used_memory_gb_seconds)
         self.assertIsNotNone(cu.max_cpu)
+
+
+class TestProcessUtilizationOverTimeRange(unittest.TestCase):
+    def test_sanity(self):
+        from deltacat.utils.resources import ProcessUtilizationOverTimeRange
+
+        with ProcessUtilizationOverTimeRange() as nu:
+            time.sleep(3)
+            self.assertIsNotNone(nu.max_memory)
+
+    def test_callback(self):
+        from deltacat.utils.resources import ProcessUtilizationOverTimeRange
+
+        with ProcessUtilizationOverTimeRange() as nu:
+
+            def test_callback():
+                nu.test_field_set = True
+
+            nu.schedule_callback(test_callback, 1)
+            time.sleep(3)
+            self.assertTrue(nu.test_field_set)
deltacat/utils/daft.py
CHANGED
deltacat/utils/pyarrow.py
CHANGED
@@ -11,6 +11,8 @@ from pyarrow.parquet import ParquetFile
 from deltacat.exceptions import ValidationError
 
 import pyarrow as pa
+import numpy as np
+import pyarrow.compute as pc
 from fsspec import AbstractFileSystem
 from pyarrow import csv as pacsv
 from pyarrow import feather as paf
@@ -38,6 +40,7 @@ from deltacat.utils.arguments import (
     sanitize_kwargs_to_callable,
     sanitize_kwargs_by_supported_kwargs,
 )
+from functools import lru_cache
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -738,3 +741,69 @@ class RecordBatchTables:
         """
         self._remaining_tables.clear()
         self._remaining_record_count = 0
+
+
+@lru_cache(maxsize=1)
+def _int_max_string_len() -> int:
+    PA_UINT64_MAX_STR_BYTES = pc.binary_length(
+        pc.cast(pa.scalar(2**64 - 1, type=pa.uint64()), pa.string())
+    ).as_py()
+    PA_INT64_MAX_STR_BYTES = pc.binary_length(
+        pc.cast(pa.scalar(-(2**63), type=pa.int64()), pa.string())
+    ).as_py()
+    return max(PA_UINT64_MAX_STR_BYTES, PA_INT64_MAX_STR_BYTES)
+
+
+@lru_cache(maxsize=1)
+def _float_max_string_len() -> int:
+    PA_POS_FLOAT64_MAX_STR_BYTES = pc.binary_length(
+        pc.cast(pa.scalar(np.finfo(np.float64).max, type=pa.float64()), pa.string())
+    ).as_py()
+    PA_NEG_FLOAT64_MAX_STR_BYTES = pc.binary_length(
+        pc.cast(pa.scalar(np.finfo(np.float64).min, type=pa.float64()), pa.string())
+    ).as_py()
+    return max(PA_POS_FLOAT64_MAX_STR_BYTES, PA_NEG_FLOAT64_MAX_STR_BYTES)
+
+
+def _max_decimal128_string_len():
+    return 40  # "-" + 38 digits + decimal
+
+
+def _max_decimal256_string_len():
+    return 78  # "-" + 76 digits + decimal
+
+
+def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
+    """performs slicing of a pyarrow array prior casting to a string.
+    This prevents a pyarrow from allocating too large of an array causing a failure.
+    Issue: https://github.com/apache/arrow/issues/38835
+    TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
+    """
+    dtype = array.type
+    MAX_BYTES = 2147483646
+    max_str_len = None
+    if pa.types.is_integer(dtype):
+        max_str_len = _int_max_string_len()
+    elif pa.types.is_floating(dtype):
+        max_str_len = _float_max_string_len()
+    elif pa.types.is_decimal128(dtype):
+        max_str_len = _max_decimal128_string_len()
+    elif pa.types.is_decimal256(dtype):
+        max_str_len = _max_decimal256_string_len()
+
+    if max_str_len is not None:
+        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
+        all_chunks = []
+        for chunk in array.chunks:
+            if len(chunk) < max_elems_per_chunk:
+                all_chunks.append(chunk)
+            else:
+                curr_pos = 0
+                total_len = len(chunk)
+                while curr_pos < total_len:
+                    sliced = chunk.slice(curr_pos, max_elems_per_chunk)
+                    curr_pos += len(sliced)
+                    all_chunks.append(sliced)
+        array = pa.chunked_array(all_chunks, type=dtype)
+
+    return pc.cast(array, pa.string())
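Note: sliced_string_cast is a drop-in for pc.cast(array, pa.string()) that first re-chunks very large numeric chunks so no single chunk's string output can overflow pyarrow's 2 GiB offset limit. A hedged usage sketch:

    import pyarrow as pa
    from deltacat.utils.pyarrow import sliced_string_cast

    col = pa.chunked_array([pa.array(range(1_000_000), type=pa.int64())])
    as_str = sliced_string_cast(col)  # same values, string type, safe chunking
    assert as_str.type == pa.string()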
deltacat/utils/resources.py
CHANGED
@@ -77,6 +77,7 @@ class ClusterUtilizationOverTimeRange(AbstractContextManager):
         self.total_memory_gb_seconds = 0.0
         self.used_memory_gb_seconds = 0.0
         self.max_cpu = 0.0
+        self.max_memory = 0.0
 
     def __enter__(self) -> Any:
         schedule.every().second.do(self._update_resources)
@@ -131,6 +132,11 @@ class ClusterUtilizationOverTimeRange(AbstractContextManager):
             + float(str(cluster_resources["memory"])) / BYTES_PER_GIBIBYTE
         )
 
+        self.max_memory = max(
+            self.max_memory,
+            float(str(cluster_resources["memory"] - available_resources["memory"])),
+        )
+
     def _run_schedule(self, interval: Optional[float] = 1.0):
         cease_continuous_run = threading.Event()
 
@@ -146,9 +152,9 @@ class ClusterUtilizationOverTimeRange(AbstractContextManager):
         return cease_continuous_run
 
 
-def
+def get_current_process_peak_memory_usage_in_bytes():
     """
-    Returns the peak memory usage of the
+    Returns the peak memory usage of the process in bytes. This method works across
     Windows, Darwin and Linux platforms.
     """
     current_platform = platform.system()
@@ -172,3 +178,53 @@ def get_size_of_object_in_bytes(obj: object) -> float:
     if isinstance(obj, (list, tuple, set, frozenset)):
         return size + sum(map(get_size_of_object_in_bytes, obj))
     return size
+
+
+class ProcessUtilizationOverTimeRange(AbstractContextManager):
+    """
+    This class can be used to compute the process utilization metrics
+    which requires us to compute it over time as memory utilization changes.
+    """
+
+    def __init__(self) -> None:
+        self.max_memory = 0.0
+
+    def __enter__(self) -> Any:
+        schedule.every().second.do(self._update_resources)
+        self.stop_run_schedules = self._run_schedule()
+        return super().__enter__()
+
+    def __exit__(
+        self,
+        __exc_type: type[BaseException] | None,
+        __exc_value: BaseException | None,
+        __traceback: TracebackType | None,
+    ) -> bool | None:
+        if __exc_value:
+            logger.error(
+                f"Error ocurred while calculating process resources: {__exc_value}"
+            )
+        self.stop_run_schedules.set()
+        return super().__exit__(__exc_type, __exc_value, __traceback)
+
+    def schedule_callback(self, callback, callback_frequency_in_seconds) -> None:
+        schedule.every(callback_frequency_in_seconds).seconds.do(callback)
+
+    # It is not truely parallel(due to GIL Ref: https://wiki.python.org/moin/GlobalInterpreterLock)
+    # even if we are using threading library. However, it averages out and gives a very good approximation.
+    def _update_resources(self):
+        self.max_memory = get_current_process_peak_memory_usage_in_bytes()
+
+    def _run_schedule(self, interval: Optional[float] = 1.0):
+        cease_continuous_run = threading.Event()
+
+        class ScheduleThread(threading.Thread):
+            @classmethod
+            def run(cls):
+                while not cease_continuous_run.is_set():
+                    schedule.run_pending()
+                    time.sleep(float(str(interval)))
+
+        continuous_thread = ScheduleThread()
+        continuous_thread.start()
+        return cease_continuous_run
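Note: the diff truncates the original body of get_current_process_peak_memory_usage_in_bytes. For reference, a hedged sketch of one common cross-platform approach to reading peak RSS (an assumption about the general technique, not necessarily deltacat's exact implementation):

    import platform
    import resource  # POSIX-only; a Windows build would need another source, e.g. psutil

    def peak_rss_bytes() -> int:
        ru_maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # Linux reports ru_maxrss in KiB, Darwin reports it in bytes
        return ru_maxrss * 1024 if platform.system() == "Linux" else ru_maxrss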
{deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.2.
+Version: 0.2.9
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -27,7 +27,7 @@ Requires-Dist: tenacity ==8.1.0
 Requires-Dist: typing-extensions ==4.4.0
 Requires-Dist: pymemcache ==4.0.0
 Requires-Dist: redis ==4.6.0
-Requires-Dist: getdaft ==0.
+Requires-Dist: getdaft ==0.2.4
 Requires-Dist: schedule ==1.2.0
 
 # DeltaCAT
{deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=9x12tKzGJVcmgVKVWjPCgZHxla7VH_PQf3HUvflyJZc,1777
 deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=xqZf8CwysNYP2d39pf27OnXGStPREgBgIM-e2Tts-TI,199
 deltacat/logs.py,sha256=9XWuTBoWhhAF9rAL6t9veXmnAlJHsaqk0lTxteVPqyQ,5674
 deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/aws/clients.py,sha256=wWiqXyZPWXezdEbhQ7DLwEVnYV6KiitqzBc5B4UAwc0,6184
 deltacat/aws/constants.py,sha256=luXWMO_8eatq8f9NlFjNM7q362j77JwzTM2BEVS_8-8,353
-deltacat/aws/s3u.py,sha256=
+deltacat/aws/s3u.py,sha256=s2On5X3IQiCsCMKw4lpfV1GfKQVWOXNsdAmIJK5PEM0,18610
 deltacat/aws/redshift/__init__.py,sha256=fjuv3jWdPE8IgF4uSrL0YEqV3XUfqDULX3xV27ICceo,266
 deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/aws/redshift/model/manifest.py,sha256=N1RRGi1Rbou_9HQieoRCI_wE7eAf5eU_FTZ7dNPvUyY,9682
@@ -20,11 +20,11 @@ deltacat/catalog/model/catalog.py,sha256=-Ho7a3rV1hiOS9cSRCAor9AtXV9nJn9t_MDVql9
 deltacat/catalog/model/table_definition.py,sha256=tKrM1mmaQlvxqXrLt3QJVZK5BZfaJnhjTZ6KjybYlhE,727
 deltacat/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/__init__.py,sha256=ivpOPve1yKi3Vz3tVgp-eeFMNEeUSf-dlRJNSCM85sE,1022
-deltacat/compute/compactor/compaction_session.py,sha256=
+deltacat/compute/compactor/compaction_session.py,sha256=bJpNBSTW7Raoa1gpojDpmVVqQGpvX0AwrusHQhUANcI,27612
 deltacat/compute/compactor/repartition_session.py,sha256=f5BTTGNv365qSuTioL7QUuVm-px_l8-zz-OC_p7gXt4,7240
 deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/model/compact_partition_params.py,sha256=DWge5I72zKBg_dodn4ekEOAnoHWs1jo21QuVmQi8I0M,14343
-deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=
+deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=o8O0v3nOc7m1ZR4W0wQkTdsMyFL24LoMc9kzUo8i5uc,30174
 deltacat/compute/compactor/model/compactor_version.py,sha256=RwRvManiCxZmzjAWzm1OPDxjB1BEHu1d0fBJyGhXKxA,87
 deltacat/compute/compactor/model/dedupe_result.py,sha256=1OCV944qJdLQ_-8scisVKl45ej1eRv9OV539QYZtQ-U,292
 deltacat/compute/compactor/model/delta_annotated.py,sha256=NERB9rOtYg-xzBwvqGJ7_hBOzBC7g6X5M9-Cq5pbdH8,12258
@@ -37,9 +37,9 @@ deltacat/compute/compactor/model/pyarrow_write_result.py,sha256=WYIa0DRcyaemR6yU
 deltacat/compute/compactor/model/repartition_result.py,sha256=HZy7Ls6toI4rXgVW2yIKMIkVS8o9kxvlIJPvo5_pCxA,140
 deltacat/compute/compactor/model/round_completion_info.py,sha256=CDlafUX6MSbdBK_zQyzEwD0mYwu-Xs2rtU0-DsEwroM,4940
 deltacat/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor/steps/dedupe.py,sha256=
-deltacat/compute/compactor/steps/hash_bucket.py,sha256=
-deltacat/compute/compactor/steps/materialize.py,sha256=
+deltacat/compute/compactor/steps/dedupe.py,sha256=iAPRIeMdGxNxaCy2QC_XzRWiNDVkKbkplJY0DVoWwsE,10190
+deltacat/compute/compactor/steps/hash_bucket.py,sha256=CbNbE0rizrsG-7rvB90J-iHtr7OajDat-4tyi2Ftz10,10655
+deltacat/compute/compactor/steps/materialize.py,sha256=j2r01KL5GGhGss9FSN9vpYmgsCQdm2uUpKMDVPtk6_k,14246
 deltacat/compute/compactor/steps/repartition.py,sha256=_ITw4yvvnNv3wwOYxprzlIz5J6t3b72re6lllpzJD9U,10960
 deltacat/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/utils/io.py,sha256=oZmjU0hp5GbCbLF7PZXEc4lgLeeicyjUPE08GffByT4,17300
@@ -48,7 +48,7 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=DmZfHeAXlQn0DDd
 deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/compaction_session.py,sha256=
+deltacat/compute/compactor_v2/compaction_session.py,sha256=YnKG2LlrgYYsVKt_6txcXzCgolaQWF4SuQz0eZmChZM,20422
 deltacat/compute/compactor_v2/constants.py,sha256=yZgzFD59wiXbXiTVgYPWRodZGpngiSBNFB2jmoZ4fps,1471
 deltacat/compute/compactor_v2/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/model/hash_bucket_input.py,sha256=pgE2o8Z9-Dvs75C15LAkmfuJFFi5pRIuuxA9GGyDlLM,2631
@@ -56,13 +56,13 @@ deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcL
 deltacat/compute/compactor_v2/model/merge_input.py,sha256=A-_Oq54sx1vrT-Ewv2_yKARdIh928yJvEuheCkw5tvQ,5049
 deltacat/compute/compactor_v2/model/merge_result.py,sha256=L53i9iL_XpzqBr7HETixD5v5qfLvitkGcjoML_hHfcs,368
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=wFu4vAS8PR0_SxxLIfGPmtLjUV9hCfPeHG56CFpoLIM,8100
+deltacat/compute/compactor_v2/steps/merge.py,sha256=QI8ovaO6yPw_VgDYqTzQOxw2oov4ipuW2gR-w01FWGY,18087
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=rNKZisxGrLQOkwX8eHUQiFoTR1V-E66pMqWigtrs618,2156
 deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQl8C5lBEr8gbNfbsw,1932
 deltacat/compute/compactor_v2/utils/io.py,sha256=jgIfwrfH2mTFUx1M0TgwZGGfrS4IXjP1PmqwaQmNAJM,5092
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=MAscmL35WfwN7Is72aFlD_cGhxtZgjRwwR5kS9Yn2uU,11393
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=1-wIIXP0gDUJGdl8omMF5Q9kZs2oeu5WddgCnwBh3RE,8681
 deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/metastats/meta_stats.py,sha256=78hN3aN5wLHUFJsZXuv2JLeqA35HZ8mLUWJDMslMj5Q,18731
@@ -119,9 +119,9 @@ deltacat/tests/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 deltacat/tests/aws/test_clients.py,sha256=23GMWfz27WWBDXSqphG9mfputsyS7j3I5P_HRk4YoKE,3790
 deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/compact_partition_test_cases.py,sha256=EyZwh-7qKiMmzJT8E_V74cvle5uONYZyt89jmdAu1TI,47952
-deltacat/tests/compute/test_compact_partition_incremental.py,sha256
+deltacat/tests/compute/test_compact_partition_incremental.py,sha256=-nIQev0FYWbp76LwM0H4KpMEgP2GMqVRFFJHMsLUN2E,10011
 deltacat/tests/compute/test_compact_partition_params.py,sha256=MIzIcBscwFA1W-cfTTxVx0zcgbrs8D4bI9Hy4TF5eRo,8322
-deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256
+deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=-yFmEGqWMTIq9iShFU9rn4cX7ky1Zmm3pv4F9NwsQUo,13218
 deltacat/tests/compute/test_util_common.py,sha256=Skz0ZfHzidArZhIzRDHOYt-5uGBwx6MRfKZpeBnzh9w,6055
 deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
 deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=5yP285lY539CP1UuyYe8Kz14CnBUpE1kZJZjxBAaXew,6530
@@ -155,7 +155,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=iN6rAwGXw5F4xT2UZ72bN276hkKVD7XD4WNp5DKgm2Q,5098
 deltacat/tests/utils/test_pyarrow.py,sha256=eZAuYp9MUf8lmpIilH57JkURuNsTGZ3IAGC4Gm5hdrM,17307
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
-deltacat/tests/utils/test_resources.py,sha256=
+deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/types/media.py,sha256=RALwafQ0SwMyPUIcENhURk7Sor_2CIfEMztvFUnvZFQ,2227
@@ -165,14 +165,14 @@ deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/utils/arguments.py,sha256=5y1Xz4HSAD8M8Jt83i6gOEKoYjy_fMQe1V43IhIE4hY,1191
 deltacat/utils/cloudpickle.py,sha256=XE7YDmQe56ksfl3NdYZkzOAhbHSuhNcBZGOehQpgZr0,1187
 deltacat/utils/common.py,sha256=RG_-enXNpLKaYrqyx1ne2lL10lxN9vK7F631oJP6SE8,1375
-deltacat/utils/daft.py,sha256=
+deltacat/utils/daft.py,sha256=eZG1AjK21lM7bzEc3_BniDqpqMGDrlp_qj9Du4dxaV0,3334
 deltacat/utils/metrics.py,sha256=Ob-RXGoNnfTMRXaNbSHoqW8y-n8KfRA9nLuo9AvsReI,6201
 deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
 deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=S80CwD1eEK47lQNr0xTmF9kq092-z6lTTmOOBv8cW_o,11723
-deltacat/utils/pyarrow.py,sha256=
-deltacat/utils/resources.py,sha256=
+deltacat/utils/pyarrow.py,sha256=gYcoRhQoBoAFo69WNijMobrLGta4VASg8VarWPiB34Y,28979
+deltacat/utils/resources.py,sha256=sS4Rzuoy_kZJ0QuiKnq0M3wTEio1h52IRehi9JRjQDg,8216
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
 deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -181,8 +181,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
-deltacat-0.2.
-deltacat-0.2.
-deltacat-0.2.
-deltacat-0.2.
-deltacat-0.2.
+deltacat-0.2.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-0.2.9.dist-info/METADATA,sha256=XnXwpmM03bCIv-C-znj2rwE_6FDmI68H6zFL4icWMII,1779
+deltacat-0.2.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+deltacat-0.2.9.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-0.2.9.dist-info/RECORD,,
{deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/LICENSE
File without changes
{deltacat-0.2.7.dist-info → deltacat-0.2.9.dist-info}/top_level.txt
File without changes