deltacat 0.1.18b3__tar.gz → 0.1.18b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. {deltacat-0.1.18b3/deltacat.egg-info → deltacat-0.1.18b6}/PKG-INFO +1 -1
  2. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/__init__.py +1 -1
  3. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py +165 -29
  4. deltacat-0.1.18b6/deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  5. deltacat-0.1.18b6/deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  6. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/dedupe_result.py +3 -0
  7. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  8. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  9. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  10. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/materialize_result.py +27 -6
  11. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/round_completion_info.py +9 -0
  12. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/dedupe.py +35 -19
  13. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  14. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/materialize.py +73 -70
  15. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/io.py +15 -0
  16. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  17. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/round_completion_file.py +2 -0
  18. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/system_columns.py +32 -0
  19. deltacat-0.1.18b6/deltacat/io/file_object_store.py +48 -0
  20. deltacat-0.1.18b6/deltacat/io/memcached_object_store.py +121 -0
  21. deltacat-0.1.18b6/deltacat/io/object_store.py +51 -0
  22. deltacat-0.1.18b6/deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat-0.1.18b6/deltacat/io/redis_object_store.py +114 -0
  24. deltacat-0.1.18b6/deltacat/io/s3_object_store.py +44 -0
  25. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/delta.py +2 -1
  26. deltacat-0.1.18b6/deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat-0.1.18b6/deltacat/tests/io/test_file_object_store.py +86 -0
  29. deltacat-0.1.18b6/deltacat/tests/io/test_memcached_object_store.py +158 -0
  30. deltacat-0.1.18b6/deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  31. deltacat-0.1.18b6/deltacat/tests/io/test_redis_object_store.py +103 -0
  32. deltacat-0.1.18b6/deltacat/tests/io/test_s3_object_store.py +59 -0
  33. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/utils/test_record_batch_tables.py +1 -1
  34. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/utils/test_resources.py +9 -0
  35. deltacat-0.1.18b6/deltacat/utils/__init__.py +0 -0
  36. deltacat-0.1.18b6/deltacat/utils/ray_utils/__init__.py +0 -0
  37. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/concurrency.py +0 -2
  38. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/resources.py +30 -18
  39. {deltacat-0.1.18b3 → deltacat-0.1.18b6/deltacat.egg-info}/PKG-INFO +1 -1
  40. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/SOURCES.txt +15 -0
  41. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/requires.txt +2 -0
  42. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/setup.py +2 -0
  43. deltacat-0.1.18b3/deltacat/io/__init__.py +0 -7
  44. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/LICENSE +0 -0
  45. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/MANIFEST.in +0 -0
  46. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/README.md +0 -0
  47. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/__init__.py +0 -0
  48. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/clients.py +0 -0
  49. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/constants.py +0 -0
  50. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/redshift/__init__.py +0 -0
  51. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/__init__.py +0 -0
  52. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/manifest.py +0 -0
  53. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/aws/s3u.py +0 -0
  54. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/__init__.py +0 -0
  55. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/delegate.py +0 -0
  56. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/interface.py +0 -0
  57. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/model/__init__.py +0 -0
  58. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/model/catalog.py +0 -0
  59. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/catalog/model/table_definition.py +0 -0
  60. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/__init__.py +0 -0
  61. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/__init__.py +0 -0
  62. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/__init__.py +0 -0
  63. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  64. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  65. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  66. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  67. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/sort_key.py +0 -0
  68. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/repartition_session.py +0 -0
  69. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/__init__.py +0 -0
  70. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/__init__.py +0 -0
  71. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -0
  72. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -0
  73. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/repartition.py +0 -0
  74. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/__init__.py +0 -0
  75. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/__init__.py +0 -0
  76. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/config/__init__.py +0 -0
  77. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/meta_stats.py +0 -0
  78. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/__init__.py +0 -0
  79. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
  80. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
  81. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/stats.py +0 -0
  82. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/__init__.py +0 -0
  83. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/constants.py +0 -0
  84. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/io.py +0 -0
  85. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
  86. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
  87. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/__init__.py +0 -0
  88. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/basic.py +0 -0
  89. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/__init__.py +0 -0
  90. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  91. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats.py +0 -0
  92. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  93. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  94. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/models/stats_result.py +0 -0
  95. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/types.py +0 -0
  96. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/__init__.py +0 -0
  97. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/intervals.py +0 -0
  98. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/io.py +0 -0
  99. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
  100. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/constants.py +0 -0
  101. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/exceptions.py +0 -0
  102. {deltacat-0.1.18b3/deltacat/io/aws → deltacat-0.1.18b6/deltacat/io}/__init__.py +0 -0
  103. {deltacat-0.1.18b3/deltacat/io/aws/redshift → deltacat-0.1.18b6/deltacat/io/aws}/__init__.py +0 -0
  104. {deltacat-0.1.18b3/deltacat/storage/model → deltacat-0.1.18b6/deltacat/io/aws/redshift}/__init__.py +0 -0
  105. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
  106. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/io/dataset.py +0 -0
  107. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/io/read_api.py +0 -0
  108. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/logs.py +0 -0
  109. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/__init__.py +0 -0
  110. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/interface.py +0 -0
  111. {deltacat-0.1.18b3/deltacat/tests → deltacat-0.1.18b6/deltacat/storage/model}/__init__.py +0 -0
  112. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/list_result.py +0 -0
  113. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/locator.py +0 -0
  114. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/namespace.py +0 -0
  115. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/partition.py +0 -0
  116. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/stream.py +0 -0
  117. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/table.py +0 -0
  118. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/table_version.py +0 -0
  119. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/storage/model/types.py +0 -0
  120. {deltacat-0.1.18b3/deltacat/tests/compactor → deltacat-0.1.18b6/deltacat/tests}/__init__.py +0 -0
  121. {deltacat-0.1.18b3/deltacat/tests/compactor/utils → deltacat-0.1.18b6/deltacat/tests/compactor}/__init__.py +0 -0
  122. {deltacat-0.1.18b3/deltacat/tests/stats → deltacat-0.1.18b6/deltacat/tests/compactor/utils}/__init__.py +0 -0
  123. {deltacat-0.1.18b3/deltacat/tests/test_utils → deltacat-0.1.18b6/deltacat/tests/io}/__init__.py +0 -0
  124. {deltacat-0.1.18b3/deltacat/tests/utils → deltacat-0.1.18b6/deltacat/tests/stats}/__init__.py +0 -0
  125. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/stats/test_intervals.py +0 -0
  126. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/test_repartition.py +0 -0
  127. {deltacat-0.1.18b3/deltacat/types → deltacat-0.1.18b6/deltacat/tests/test_utils}/__init__.py +0 -0
  128. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/tests/test_utils/constants.py +0 -0
  129. {deltacat-0.1.18b3/deltacat → deltacat-0.1.18b6/deltacat/tests}/utils/__init__.py +0 -0
  130. {deltacat-0.1.18b3/deltacat/utils/ray_utils → deltacat-0.1.18b6/deltacat/types}/__init__.py +0 -0
  131. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/types/media.py +0 -0
  132. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/types/tables.py +0 -0
  133. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/common.py +0 -0
  134. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/metrics.py +0 -0
  135. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/numpy.py +0 -0
  136. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/pandas.py +0 -0
  137. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/performance.py +0 -0
  138. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/placement.py +0 -0
  139. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/pyarrow.py +0 -0
  140. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/collections.py +0 -0
  141. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/dataset.py +0 -0
  142. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/performance.py +0 -0
  143. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/runtime.py +0 -0
  144. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/dependency_links.txt +0 -0
  145. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/deltacat.egg-info/top_level.txt +0 -0
  146. {deltacat-0.1.18b3 → deltacat-0.1.18b6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 0.1.18b3
3
+ Version: 0.1.18b6
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
43
43
 
44
44
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
45
45
 
46
- __version__ = "0.1.18b3"
46
+ __version__ = "0.1.18b6"
47
47
 
48
48
 
49
49
  __all__ = [
@@ -3,6 +3,10 @@ from contextlib import nullcontext
3
3
  import functools
4
4
  import logging
5
5
  import ray
6
+ import time
7
+ import json
8
+ from deltacat.aws import s3u as s3_utils
9
+ import deltacat
6
10
  from deltacat import logs
7
11
  import pyarrow as pa
8
12
  from deltacat.compute.compactor import (
@@ -12,6 +16,9 @@ from deltacat.compute.compactor import (
12
16
  )
13
17
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
14
18
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
19
+ from deltacat.io.object_store import IObjectStore
20
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
21
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
15
22
  from deltacat.compute.stats.models.delta_stats import DeltaStats
16
23
  from deltacat.storage import (
17
24
  Delta,
@@ -20,6 +27,9 @@ from deltacat.storage import (
20
27
  PartitionLocator,
21
28
  interface as unimplemented_deltacat_storage,
22
29
  )
30
+ from deltacat.compute.compactor.model.compact_partition_params import (
31
+ CompactPartitionParams,
32
+ )
23
33
  from deltacat.utils.ray_utils.concurrency import (
24
34
  invoke_parallel,
25
35
  round_robin_options_provider,
@@ -37,7 +47,11 @@ from deltacat.utils.placement import PlacementGroupConfig
37
47
  from typing import List, Set, Optional, Tuple, Dict, Any
38
48
  from collections import defaultdict
39
49
  from deltacat.utils.metrics import MetricsConfig
40
- from deltacat.utils.resources import log_current_cluster_utilization
50
+ from deltacat.compute.compactor.model.compaction_session_audit_info import (
51
+ CompactionSessionAuditInfo,
52
+ )
53
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
54
+
41
55
 
42
56
  if importlib.util.find_spec("memray"):
43
57
  import memray
@@ -100,6 +114,7 @@ def compact_partition(
100
114
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
101
115
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
102
116
  s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
117
+ object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
103
118
  deltacat_storage=unimplemented_deltacat_storage,
104
119
  **kwargs,
105
120
  ) -> Optional[str]:
@@ -139,6 +154,7 @@ def compact_partition(
139
154
  list_deltas_kwargs,
140
155
  read_kwargs_provider,
141
156
  s3_table_writer_kwargs,
157
+ object_store,
142
158
  deltacat_storage,
143
159
  **kwargs,
144
160
  )
@@ -184,10 +200,28 @@ def _execute_compaction_round(
184
200
  list_deltas_kwargs: Optional[Dict[str, Any]],
185
201
  read_kwargs_provider: Optional[ReadKwargsProvider],
186
202
  s3_table_writer_kwargs: Optional[Dict[str, Any]],
203
+ object_store: Optional[IObjectStore],
187
204
  deltacat_storage=unimplemented_deltacat_storage,
188
205
  **kwargs,
189
206
  ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
190
207
 
208
+ rcf_source_partition_locator = (
209
+ rebase_source_partition_locator
210
+ if rebase_source_partition_locator
211
+ else source_partition_locator
212
+ )
213
+
214
+ base_audit_url = rcf_source_partition_locator.path(
215
+ f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
216
+ )
217
+ audit_url = f"{base_audit_url}.json"
218
+
219
+ logger.info(f"Compaction audit will be written to {audit_url}")
220
+
221
+ compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
222
+
223
+ compaction_start = time.monotonic()
224
+
191
225
  if not primary_keys:
192
226
  # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
193
227
  # with normalized manifest entry sizes
@@ -230,6 +264,7 @@ def _execute_compaction_round(
230
264
  f"{node_resource_keys}"
231
265
  )
232
266
 
267
+ compaction_audit.set_cluster_cpu_max(cluster_cpus)
233
268
  # create a remote options provider to round-robin tasks across all nodes or allocated bundles
234
269
  logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
235
270
  round_robin_opt_provider = functools.partial(
@@ -257,6 +292,13 @@ def _execute_compaction_round(
257
292
  )
258
293
  logger.info(f"Round completion file: {round_completion_info}")
259
294
 
295
+ enable_manifest_entry_copy_by_reference = (
296
+ False if rebase_source_partition_locator else True
297
+ )
298
+ logger.info(
299
+ f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
300
+ )
301
+
260
302
  # discover input delta files
261
303
  # For rebase:
262
304
  # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -268,6 +310,7 @@ def _execute_compaction_round(
268
310
  round_completion_info.high_watermark if round_completion_info else None
269
311
  )
270
312
 
313
+ delta_discovery_start = time.monotonic()
271
314
  (
272
315
  input_deltas,
273
316
  previous_last_stream_position_compacted_on_destination_table,
@@ -282,6 +325,13 @@ def _execute_compaction_round(
282
325
  **list_deltas_kwargs,
283
326
  )
284
327
 
328
+ delta_discovery_end = time.monotonic()
329
+ compaction_audit.set_delta_discovery_time_in_seconds(
330
+ delta_discovery_end - delta_discovery_start
331
+ )
332
+
333
+ s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
334
+
285
335
  if not input_deltas:
286
336
  logger.info("No input deltas found to compact.")
287
337
  return None, None, None
@@ -298,6 +348,7 @@ def _execute_compaction_round(
298
348
  io.fit_input_deltas(
299
349
  input_deltas,
300
350
  cluster_resources,
351
+ compaction_audit,
301
352
  hash_bucket_count,
302
353
  deltacat_storage=deltacat_storage,
303
354
  )
@@ -307,11 +358,14 @@ def _execute_compaction_round(
307
358
  cluster_resources,
308
359
  hash_bucket_count,
309
360
  min_hash_bucket_chunk_size,
361
+ compaction_audit=compaction_audit,
310
362
  input_deltas_stats=input_deltas_stats,
311
363
  deltacat_storage=deltacat_storage,
312
364
  )
313
365
  )
314
366
 
367
+ compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
368
+
315
369
  assert hash_bucket_count is not None and hash_bucket_count > 0, (
316
370
  f"Expected hash bucket count to be a positive integer, but found "
317
371
  f"`{hash_bucket_count}`"
@@ -335,6 +389,8 @@ def _execute_compaction_round(
335
389
  "Multiple rounds are not supported. Please increase the cluster size and run again."
336
390
  )
337
391
 
392
+ hb_start = time.monotonic()
393
+
338
394
  hb_tasks_pending = invoke_parallel(
339
395
  items=uniform_deltas,
340
396
  ray_task=hb.hash_bucket,
@@ -348,11 +404,28 @@ def _execute_compaction_round(
348
404
  enable_profiler=enable_profiler,
349
405
  metrics_config=metrics_config,
350
406
  read_kwargs_provider=read_kwargs_provider,
407
+ object_store=object_store,
351
408
  deltacat_storage=deltacat_storage,
352
409
  )
410
+
411
+ hb_invoke_end = time.monotonic()
412
+
353
413
  logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
354
414
  hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
355
415
  logger.info(f"Got {len(hb_results)} hash bucket results.")
416
+ hb_end = time.monotonic()
417
+ hb_results_retrieved_at = time.time()
418
+
419
+ telemetry_time_hb = compaction_audit.save_step_stats(
420
+ CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
421
+ hb_results,
422
+ hb_results_retrieved_at,
423
+ hb_invoke_end - hb_start,
424
+ hb_end - hb_start,
425
+ )
426
+
427
+ s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
428
+
356
429
  all_hash_group_idx_to_obj_id = defaultdict(list)
357
430
  for hb_result in hb_results:
358
431
  for hash_group_index, object_id in enumerate(
@@ -367,6 +440,8 @@ def _execute_compaction_round(
367
440
  f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
368
441
  )
369
442
 
443
+ compaction_audit.set_input_records(total_hb_record_count.item())
444
+
370
445
  # TODO (pdames): when resources are freed during the last round of hash
371
446
  # bucketing, start running dedupe tasks that read existing dedupe
372
447
  # output from S3 then wait for hash bucketing to finish before continuing
@@ -389,10 +464,18 @@ def _execute_compaction_round(
389
464
  # identify the index of records to keep or drop based on sort keys
390
465
  num_materialize_buckets = max_parallelism
391
466
  logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
467
+
468
+ dedupe_start = time.monotonic()
469
+ dd_max_parallelism = int(
470
+ max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
471
+ )
472
+ logger.info(
473
+ f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
474
+ )
392
475
  dd_tasks_pending = invoke_parallel(
393
476
  items=all_hash_group_idx_to_obj_id.values(),
394
477
  ray_task=dd.dedupe,
395
- max_parallelism=max_parallelism,
478
+ max_parallelism=dd_max_parallelism,
396
479
  options_provider=round_robin_opt_provider,
397
480
  kwargs_provider=lambda index, item: {
398
481
  "dedupe_task_index": index,
@@ -402,12 +485,33 @@ def _execute_compaction_round(
402
485
  num_materialize_buckets=num_materialize_buckets,
403
486
  enable_profiler=enable_profiler,
404
487
  metrics_config=metrics_config,
488
+ object_store=object_store,
405
489
  )
490
+
491
+ dedupe_invoke_end = time.monotonic()
406
492
  logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
407
493
  dd_results: List[DedupeResult] = ray.get(dd_tasks_pending)
408
494
  logger.info(f"Got {len(dd_results)} dedupe results.")
495
+
496
+ # we use time.time() here because time.monotonic() has no reference point
497
+ # whereas time.time() measures epoch seconds. Hence, it will be reasonable
498
+ # to compare time.time()s captured in different nodes.
499
+ dedupe_results_retrieved_at = time.time()
500
+ dedupe_end = time.monotonic()
501
+
409
502
  total_dd_record_count = sum([ddr.deduped_record_count for ddr in dd_results])
410
503
  logger.info(f"Deduped {total_dd_record_count} records...")
504
+
505
+ telemetry_time_dd = compaction_audit.save_step_stats(
506
+ CompactionSessionAuditInfo.DEDUPE_STEP_NAME,
507
+ dd_results,
508
+ dedupe_results_retrieved_at,
509
+ dedupe_invoke_end - dedupe_start,
510
+ dedupe_end - dedupe_start,
511
+ )
512
+
513
+ compaction_audit.set_records_deduped(total_dd_record_count.item())
514
+
411
515
  all_mat_buckets_to_obj_id = defaultdict(list)
412
516
  for dd_result in dd_results:
413
517
  for (
@@ -420,6 +524,8 @@ def _execute_compaction_round(
420
524
  logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
421
525
  logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
422
526
 
527
+ compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
528
+
423
529
  # TODO(pdames): when resources are freed during the last round of deduping
424
530
  # start running materialize tasks that read materialization source file
425
531
  # tables from S3 then wait for deduping to finish before continuing
@@ -432,6 +538,11 @@ def _execute_compaction_round(
432
538
 
433
539
  # parallel step 3:
434
540
  # materialize records to keep by index
541
+
542
+ s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
543
+
544
+ materialize_start = time.monotonic()
545
+
435
546
  mat_tasks_pending = invoke_parallel(
436
547
  items=all_mat_buckets_to_obj_id.items(),
437
548
  ray_task=mat.materialize,
@@ -445,38 +556,34 @@ def _execute_compaction_round(
445
556
  round_completion_info=round_completion_info,
446
557
  source_partition_locator=source_partition_locator,
447
558
  partition=partition,
559
+ enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
448
560
  max_records_per_output_file=records_per_compacted_file,
449
561
  compacted_file_content_type=compacted_file_content_type,
450
562
  enable_profiler=enable_profiler,
451
563
  metrics_config=metrics_config,
452
564
  read_kwargs_provider=read_kwargs_provider,
453
565
  s3_table_writer_kwargs=s3_table_writer_kwargs,
566
+ object_store=object_store,
454
567
  deltacat_storage=deltacat_storage,
455
568
  )
569
+
570
+ materialize_invoke_end = time.monotonic()
571
+
456
572
  logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
457
- mat_results = ray.get(mat_tasks_pending)
458
- total_count_of_src_dfl_not_touched = sum(
459
- m.count_of_src_dfl_not_touched for m in mat_results
460
- )
461
- total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
462
- logger.info(
463
- f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
464
- )
465
- logger.info(
466
- f"Got total of {total_length_src_dfl} manifest files during compaction."
467
- )
468
- manifest_entry_copied_by_reference_ratio = (
469
- (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
470
- if total_length_src_dfl != 0
471
- else None
472
- )
473
- logger.info(
474
- f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
475
- )
573
+ mat_results: List[MaterializeResult] = ray.get(mat_tasks_pending)
476
574
 
477
575
  logger.info(f"Got {len(mat_results)} materialize result(s).")
478
576
 
479
- log_current_cluster_utilization(log_identifier="post_materialize")
577
+ materialize_end = time.monotonic()
578
+ materialize_results_retrieved_at = time.time()
579
+
580
+ telemetry_time_materialize = compaction_audit.save_step_stats(
581
+ CompactionSessionAuditInfo.MATERIALIZE_STEP_NAME,
582
+ mat_results,
583
+ materialize_results_retrieved_at,
584
+ materialize_invoke_end - materialize_start,
585
+ materialize_end - materialize_start,
586
+ )
480
587
 
481
588
  mat_results = sorted(mat_results, key=lambda m: m.task_index)
482
589
  deltas = [m.delta for m in mat_results]
@@ -494,6 +601,7 @@ def _execute_compaction_round(
494
601
  f" Materialized records: {merged_delta.meta.record_count}"
495
602
  )
496
603
  logger.info(record_info_msg)
604
+
497
605
  assert (
498
606
  total_hb_record_count - total_dd_record_count == merged_delta.meta.record_count
499
607
  ), (
@@ -506,6 +614,9 @@ def _execute_compaction_round(
506
614
  )
507
615
  logger.info(f"Committed compacted delta: {compacted_delta}")
508
616
 
617
+ compaction_end = time.monotonic()
618
+ compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
619
+
509
620
  new_compacted_delta_locator = DeltaLocator.of(
510
621
  new_compacted_partition_locator,
511
622
  compacted_delta.stream_position,
@@ -516,26 +627,51 @@ def _execute_compaction_round(
516
627
  if round_completion_info
517
628
  else None
518
629
  )
630
+
631
+ pyarrow_write_result = PyArrowWriteResult.union(
632
+ [m.pyarrow_write_result for m in mat_results]
633
+ )
634
+
635
+ session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
636
+ compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
637
+ session_peak_memory
638
+ )
639
+
640
+ compaction_audit.save_round_completion_stats(
641
+ mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
642
+ )
643
+
644
+ s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
645
+
519
646
  new_round_completion_info = RoundCompletionInfo.of(
520
647
  last_stream_position_compacted,
521
648
  new_compacted_delta_locator,
522
- PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
649
+ pyarrow_write_result,
523
650
  bit_width_of_sort_keys,
524
651
  last_rebase_source_partition_locator,
525
- manifest_entry_copied_by_reference_ratio,
526
- )
527
- rcf_source_partition_locator = (
528
- rebase_source_partition_locator
529
- if rebase_source_partition_locator
530
- else source_partition_locator
652
+ compaction_audit.untouched_file_ratio,
653
+ audit_url,
531
654
  )
655
+
532
656
  logger.info(
533
657
  f"partition-{source_partition_locator.partition_values},"
534
658
  f"compacted at: {last_stream_position_compacted},"
535
659
  f"last position: {last_stream_position_to_compact}"
536
660
  )
661
+
537
662
  return (
538
663
  partition,
539
664
  new_round_completion_info,
540
665
  rcf_source_partition_locator,
541
666
  )
667
+
668
+
669
+ def compact_partition_from_request(
670
+ compact_partition_params: CompactPartitionParams,
671
+ ) -> Optional[str]:
672
+ """
673
+ Wrapper for compact_partition that allows for the compact_partition parameters to be
674
+ passed in as a custom dictionary-like CompactPartitionParams object.
675
+ :param compact_partition_params:
676
+ """
677
+ return compact_partition(**compact_partition_params)
@@ -0,0 +1,153 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ import json
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from deltacat.types.media import ContentType
8
+
9
+
10
class CompactPartitionParams(dict):
    """
    This class represents the parameters passed to compact_partition
    (deltacat/compute/compactor/compaction_session.py).

    It is a dict subclass: every parameter is stored as a dictionary entry
    under its own name and is additionally exposed as a read-only property.
    """

    # Entries populated by of(), in insertion order.
    _PARAM_KEYS = (
        "destination_partition_locator",
        "last_stream_position_to_compact",
        "source_partition_locator",
        "primary_keys",
        "rebase_source_partition_locator",
        "rebase_source_partition_high_watermark",
        "hash_bucket_count",
        "deltacat_storage",
        "compaction_artifact_s3_bucket",
        "properties",
        "compacted_file_content_type",
        "list_deltas_kwargs",
        "pg_config",
        "read_kwargs_provider",
        "s3_table_writer_kwargs",
    )

    @staticmethod
    def of(params: Optional[Dict]) -> "CompactPartitionParams":
        """
        Build a CompactPartitionParams from a plain dictionary.

        Only the known parameter names are copied; a name missing from
        `params` is stored as None and any other key in `params` is ignored.

        :param params: source dictionary; None is treated as an empty dict.
        :return: a new CompactPartitionParams holding every known key.
        """
        source = {} if params is None else params
        compact_partition_params = CompactPartitionParams()
        for key in CompactPartitionParams._PARAM_KEYS:
            compact_partition_params[key] = source.get(key)
        return compact_partition_params

    @property
    def destination_partition_locator(self) -> Optional[dict]:
        return self["destination_partition_locator"]

    @property
    def last_stream_position_to_compact(self) -> Optional[int]:
        return self["last_stream_position_to_compact"]

    @property
    def source_partition_locator(self) -> Optional[dict]:
        return self["source_partition_locator"]

    @property
    def primary_keys(self) -> Optional[List[str]]:
        keys = self["primary_keys"]
        # of() stores None for an omitted entry; list(None) would raise
        # TypeError, so pass None through as the annotation promises.
        if keys is None:
            return None
        return list(keys)

    @property
    def rebase_source_partition_locator(self) -> Optional[dict]:
        return self["rebase_source_partition_locator"]

    @property
    def rebase_source_partition_high_watermark(self) -> Optional[int]:
        return self["rebase_source_partition_high_watermark"]

    @property
    def hash_bucket_count(self) -> Optional[int]:
        return self["hash_bucket_count"]

    @property
    def deltacat_storage(self) -> Optional[str]:
        return self["deltacat_storage"]

    @property
    def compaction_artifact_s3_bucket(self) -> Optional[str]:
        return self["compaction_artifact_s3_bucket"]

    @property
    def properties(self) -> Optional[Dict[str, str]]:
        return self["properties"]

    @property
    def compacted_file_content_type(self) -> "Optional[ContentType]":
        return self["compacted_file_content_type"]

    @property
    def list_deltas_kwargs(self) -> Optional[dict]:
        return self["list_deltas_kwargs"]

    @property
    def pg_config(self) -> Optional[Any]:
        return self["pg_config"]

    @property
    def read_kwargs_provider(self) -> Optional[Any]:
        return self["read_kwargs_provider"]

    @property
    def s3_table_writer_kwargs(self) -> Optional[Any]:
        return self["s3_table_writer_kwargs"]

    @staticmethod
    def json_handler_for_compact_partition_params(obj):
        """
        Fallback handler meant to be passed as the `default` argument of
        `json.dumps()`, which calls it for objects it cannot serialize itself.

        The replacement value is chosen as follows:
          * a set is returned as a list,
          * an object with a `toJSON` attribute yields `obj.toJSON()`,
          * any other object yields its `__dict__`,
          * if any of the above raises, the object's class name is returned.
        """
        try:
            if isinstance(obj, set):
                return list(obj)
            elif hasattr(obj, "toJSON"):
                return obj.toJSON()
            else:
                return obj.__dict__
        except Exception:
            # Best effort: e.g. objects without __dict__ end up here.
            return obj.__class__.__name__

    def serialize(self) -> str:
        """
        Serializes itself to a json-formatted string

        Returns:
            The serialized object.

        """
        to_serialize: Dict[str, Any] = {}
        # Deep copy each value individually; values that cannot be deep
        # copied (module objects, for example) are represented by their
        # class name only.
        for attr, value in self.items():
            try:
                to_serialize[attr] = copy.deepcopy(value)
            except Exception:
                to_serialize[attr] = value.__class__.__name__
        return json.dumps(
            to_serialize,
            default=CompactPartitionParams.json_handler_for_compact_partition_params,
        )