deltacat 0.1.18b4__tar.gz → 0.1.18b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. {deltacat-0.1.18b4/deltacat.egg-info → deltacat-0.1.18b6}/PKG-INFO +1 -1
  2. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/__init__.py +1 -1
  3. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py +23 -2
  4. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/dedupe.py +9 -14
  5. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/hash_bucket.py +5 -3
  6. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/materialize.py +18 -37
  7. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  8. deltacat-0.1.18b6/deltacat/io/file_object_store.py +48 -0
  9. deltacat-0.1.18b6/deltacat/io/memcached_object_store.py +121 -0
  10. deltacat-0.1.18b6/deltacat/io/object_store.py +51 -0
  11. deltacat-0.1.18b6/deltacat/io/ray_plasma_object_store.py +23 -0
  12. deltacat-0.1.18b6/deltacat/io/redis_object_store.py +114 -0
  13. deltacat-0.1.18b6/deltacat/io/s3_object_store.py +44 -0
  14. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/compactor/utils/test_io.py +4 -0
  15. deltacat-0.1.18b6/deltacat/tests/io/test_file_object_store.py +86 -0
  16. deltacat-0.1.18b6/deltacat/tests/io/test_memcached_object_store.py +158 -0
  17. deltacat-0.1.18b6/deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  18. deltacat-0.1.18b6/deltacat/tests/io/test_redis_object_store.py +103 -0
  19. deltacat-0.1.18b6/deltacat/tests/io/test_s3_object_store.py +59 -0
  20. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/utils/test_resources.py +4 -0
  21. deltacat-0.1.18b6/deltacat/utils/__init__.py +0 -0
  22. deltacat-0.1.18b6/deltacat/utils/ray_utils/__init__.py +0 -0
  23. {deltacat-0.1.18b4 → deltacat-0.1.18b6/deltacat.egg-info}/PKG-INFO +1 -1
  24. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/SOURCES.txt +12 -0
  25. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/requires.txt +2 -0
  26. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/setup.py +2 -0
  27. deltacat-0.1.18b4/deltacat/io/__init__.py +0 -7
  28. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/LICENSE +0 -0
  29. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/MANIFEST.in +0 -0
  30. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/README.md +0 -0
  31. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/__init__.py +0 -0
  32. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/clients.py +0 -0
  33. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/constants.py +0 -0
  34. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/__init__.py +0 -0
  35. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/__init__.py +0 -0
  36. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/manifest.py +0 -0
  37. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/s3u.py +0 -0
  38. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/__init__.py +0 -0
  39. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/delegate.py +0 -0
  40. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/interface.py +0 -0
  41. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/__init__.py +0 -0
  42. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/catalog.py +0 -0
  43. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/table_definition.py +0 -0
  44. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/__init__.py +0 -0
  45. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/__init__.py +0 -0
  46. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/__init__.py +0 -0
  47. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  48. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  49. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  50. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  51. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  52. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  53. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  54. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  55. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  56. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  57. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  58. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  59. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/sort_key.py +0 -0
  60. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/repartition_session.py +0 -0
  61. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/__init__.py +0 -0
  62. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/__init__.py +0 -0
  63. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -0
  64. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -0
  65. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/repartition.py +0 -0
  66. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/__init__.py +0 -0
  67. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/io.py +0 -0
  68. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  69. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  70. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/__init__.py +0 -0
  71. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/config/__init__.py +0 -0
  72. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/meta_stats.py +0 -0
  73. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/__init__.py +0 -0
  74. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
  75. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
  76. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/stats.py +0 -0
  77. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/__init__.py +0 -0
  78. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/constants.py +0 -0
  79. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/io.py +0 -0
  80. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
  81. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
  82. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/__init__.py +0 -0
  83. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/basic.py +0 -0
  84. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/__init__.py +0 -0
  85. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  86. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats.py +0 -0
  87. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  88. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  89. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/stats_result.py +0 -0
  90. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/types.py +0 -0
  91. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/__init__.py +0 -0
  92. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/intervals.py +0 -0
  93. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/io.py +0 -0
  94. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
  95. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/constants.py +0 -0
  96. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/exceptions.py +0 -0
  97. {deltacat-0.1.18b4/deltacat/io/aws → deltacat-0.1.18b6/deltacat/io}/__init__.py +0 -0
  98. {deltacat-0.1.18b4/deltacat/io/aws/redshift → deltacat-0.1.18b6/deltacat/io/aws}/__init__.py +0 -0
  99. {deltacat-0.1.18b4/deltacat/storage/model → deltacat-0.1.18b6/deltacat/io/aws/redshift}/__init__.py +0 -0
  100. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
  101. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/dataset.py +0 -0
  102. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/read_api.py +0 -0
  103. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/logs.py +0 -0
  104. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/__init__.py +0 -0
  105. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/interface.py +0 -0
  106. {deltacat-0.1.18b4/deltacat/tests → deltacat-0.1.18b6/deltacat/storage/model}/__init__.py +0 -0
  107. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/delta.py +0 -0
  108. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/list_result.py +0 -0
  109. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/locator.py +0 -0
  110. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/namespace.py +0 -0
  111. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/partition.py +0 -0
  112. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/stream.py +0 -0
  113. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/table.py +0 -0
  114. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/table_version.py +0 -0
  115. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/types.py +0 -0
  116. {deltacat-0.1.18b4/deltacat/tests/compactor → deltacat-0.1.18b6/deltacat/tests}/__init__.py +0 -0
  117. {deltacat-0.1.18b4/deltacat/tests/compactor/utils → deltacat-0.1.18b6/deltacat/tests/compactor}/__init__.py +0 -0
  118. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/compactor/test_compact_partition_params.py +0 -0
  119. {deltacat-0.1.18b4/deltacat/tests/stats → deltacat-0.1.18b6/deltacat/tests/compactor/utils}/__init__.py +0 -0
  120. {deltacat-0.1.18b4/deltacat/tests/test_utils → deltacat-0.1.18b6/deltacat/tests/io}/__init__.py +0 -0
  121. {deltacat-0.1.18b4/deltacat/tests/utils → deltacat-0.1.18b6/deltacat/tests/stats}/__init__.py +0 -0
  122. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/stats/test_intervals.py +0 -0
  123. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/test_repartition.py +0 -0
  124. {deltacat-0.1.18b4/deltacat/types → deltacat-0.1.18b6/deltacat/tests/test_utils}/__init__.py +0 -0
  125. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/test_utils/constants.py +0 -0
  126. {deltacat-0.1.18b4/deltacat → deltacat-0.1.18b6/deltacat/tests}/utils/__init__.py +0 -0
  127. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  128. {deltacat-0.1.18b4/deltacat/utils/ray_utils → deltacat-0.1.18b6/deltacat/types}/__init__.py +0 -0
  129. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/types/media.py +0 -0
  130. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/types/tables.py +0 -0
  131. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/common.py +0 -0
  132. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/metrics.py +0 -0
  133. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/numpy.py +0 -0
  134. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/pandas.py +0 -0
  135. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/performance.py +0 -0
  136. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/placement.py +0 -0
  137. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/pyarrow.py +0 -0
  138. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/collections.py +0 -0
  139. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/concurrency.py +0 -0
  140. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/dataset.py +0 -0
  141. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/performance.py +0 -0
  142. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/runtime.py +0 -0
  143. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/resources.py +0 -0
  144. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/dependency_links.txt +0 -0
  145. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/top_level.txt +0 -0
  146. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 0.1.18b4
+ Version: 0.1.18b6
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
deltacat/__init__.py
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "0.1.18b4"
+ __version__ = "0.1.18b6"


  __all__ = [
deltacat/compute/compactor/compaction_session.py
@@ -16,6 +16,8 @@ from deltacat.compute.compactor import (
  )
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
  from deltacat.compute.stats.models.delta_stats import DeltaStats
  from deltacat.storage import (
@@ -112,6 +114,7 @@ def compact_partition(
      list_deltas_kwargs: Optional[Dict[str, Any]] = None,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
      s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
      deltacat_storage=unimplemented_deltacat_storage,
      **kwargs,
  ) -> Optional[str]:
@@ -151,6 +154,7 @@ def compact_partition(
          list_deltas_kwargs,
          read_kwargs_provider,
          s3_table_writer_kwargs,
+         object_store,
          deltacat_storage,
          **kwargs,
      )
@@ -196,6 +200,7 @@ def _execute_compaction_round(
      list_deltas_kwargs: Optional[Dict[str, Any]],
      read_kwargs_provider: Optional[ReadKwargsProvider],
      s3_table_writer_kwargs: Optional[Dict[str, Any]],
+     object_store: Optional[IObjectStore],
      deltacat_storage=unimplemented_deltacat_storage,
      **kwargs,
  ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
@@ -287,6 +292,13 @@ def _execute_compaction_round(
      )
      logger.info(f"Round completion file: {round_completion_info}")

+     enable_manifest_entry_copy_by_reference = (
+         False if rebase_source_partition_locator else True
+     )
+     logger.info(
+         f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+     )
+
      # discover input delta files
      # For rebase:
      # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -392,6 +404,7 @@ def _execute_compaction_round(
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
          read_kwargs_provider=read_kwargs_provider,
+         object_store=object_store,
          deltacat_storage=deltacat_storage,
      )

@@ -453,11 +466,16 @@ def _execute_compaction_round(
      logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")

      dedupe_start = time.monotonic()
-
+     dd_max_parallelism = int(
+         max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+     )
+     logger.info(
+         f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+     )
      dd_tasks_pending = invoke_parallel(
          items=all_hash_group_idx_to_obj_id.values(),
          ray_task=dd.dedupe,
-         max_parallelism=max_parallelism,
+         max_parallelism=dd_max_parallelism,
          options_provider=round_robin_opt_provider,
          kwargs_provider=lambda index, item: {
              "dedupe_task_index": index,
@@ -467,6 +485,7 @@ def _execute_compaction_round(
          num_materialize_buckets=num_materialize_buckets,
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
+         object_store=object_store,
      )

      dedupe_invoke_end = time.monotonic()
@@ -537,12 +556,14 @@ def _execute_compaction_round(
          round_completion_info=round_completion_info,
          source_partition_locator=source_partition_locator,
          partition=partition,
+         enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
          max_records_per_output_file=records_per_compacted_file,
          compacted_file_content_type=compacted_file_content_type,
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
          read_kwargs_provider=read_kwargs_provider,
          s3_table_writer_kwargs=s3_table_writer_kwargs,
+         object_store=object_store,
          deltacat_storage=deltacat_storage,
      )

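Taken together, the compaction_session.py changes add two tuning points: a pluggable object_store (defaulting to RayPlasmaObjectStore()) that is threaded through the hash bucket, dedupe, and materialize steps, and an optional dd_max_parallelism_ratio kwarg that caps dedupe parallelism. A minimal sketch of the ratio arithmetic added above, with illustrative values that are not from the diff:

    # Mirrors the dd_max_parallelism computation in _execute_compaction_round.
    max_parallelism = 100
    kwargs = {"dd_max_parallelism_ratio": 0.5}
    dd_max_parallelism = int(max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1))
    assert dd_max_parallelism == 50  # at most 50 parallel dedupe tasks

When the kwarg is omitted, the ratio defaults to 1 and dedupe runs at the full max_parallelism, preserving the previous behavior.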
deltacat/compute/compactor/steps/dedupe.py
@@ -1,5 +1,6 @@
  import importlib
  import logging
+ from typing import Optional
  import time
  from collections import defaultdict
  from contextlib import nullcontext
@@ -8,7 +9,6 @@ import numpy as np
  import pyarrow as pa
  import pyarrow.compute as pc
  import ray
- from ray import cloudpickle

  from deltacat import logs
  from deltacat.compute.compactor import (
@@ -25,6 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
  )
  from deltacat.utils.performance import timed_invocation
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

  if importlib.util.find_spec("memray"):
@@ -106,6 +107,7 @@ def _timed_dedupe(
      num_materialize_buckets: int,
      dedupe_task_index: int,
      enable_profiler: bool,
+     object_store: Optional[IObjectStore],
  ):
      task_id = get_current_ray_task_id()
      worker_id = get_current_ray_worker_id()
@@ -114,15 +116,12 @@ def _timed_dedupe(
      ) if enable_profiler else nullcontext():
          # TODO (pdames): mitigate risk of running out of memory here in cases of
          # severe skew of primary key updates in deltas
-         src_file_records_obj_refs = [
-             cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-         ]
          logger.info(
              f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-             f"groups for {len(src_file_records_obj_refs)} object refs..."
+             f"groups for {len(object_ids)} object refs..."
          )

-         delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+         delta_file_envelope_groups_list = object_store.get_many(object_ids)
          hb_index_to_delta_file_envelopes_list = defaultdict(list)
          for delta_file_envelope_groups in delta_file_envelope_groups_list:
              for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -201,7 +200,6 @@ def _timed_dedupe(
                  src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])

          logger.info(f"Finished all dedupe rounds...")
-         mat_bucket_to_src_file_record_count = defaultdict(dict)
          mat_bucket_to_src_file_records: Dict[
              MaterializeBucketIndex, DeltaFileLocatorToRecords
          ] = defaultdict(dict)
@@ -213,22 +211,17 @@ def _timed_dedupe(
              mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                  src_row_indices,
              )
-             mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                 src_row_indices
-             )

          mat_bucket_to_dd_idx_obj_id: Dict[
              MaterializeBucketIndex, DedupeTaskIndexWithObjectId
          ] = {}
          for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-             object_ref = ray.put(src_file_records)
-             pickled_object_ref = cloudpickle.dumps(object_ref)
+             object_ref = object_store.put(src_file_records)
              mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                  dedupe_task_index,
-                 pickled_object_ref,
+                 object_ref,
              )
              del object_ref
-             del pickled_object_ref
          logger.info(
              f"Count of materialize buckets with object refs: "
              f"{len(mat_bucket_to_dd_idx_obj_id)}"
@@ -253,6 +246,7 @@ def dedupe(
      dedupe_task_index: int,
      enable_profiler: bool,
      metrics_config: MetricsConfig,
+     object_store: Optional[IObjectStore],
  ) -> DedupeResult:
      logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
      dedupe_result, duration = timed_invocation(
@@ -262,6 +256,7 @@ def dedupe(
          num_materialize_buckets=num_materialize_buckets,
          dedupe_task_index=dedupe_task_index,
          enable_profiler=enable_profiler,
+         object_store=object_store,
      )

      emit_metrics_time = 0.0
deltacat/compute/compactor/steps/hash_bucket.py
@@ -31,6 +31,7 @@ from deltacat.utils.ray_utils.runtime import (
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.performance import timed_invocation
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

  if importlib.util.find_spec("memray"):
@@ -179,6 +180,7 @@ def _timed_hash_bucket(
      num_groups: int,
      enable_profiler: bool,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     object_store: Optional[IObjectStore] = None,
      deltacat_storage=unimplemented_deltacat_storage,
  ):
      task_id = get_current_ray_task_id()
@@ -207,9 +209,7 @@ def _timed_hash_bucket(
          deltacat_storage,
      )
      hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-         delta_file_envelope_groups,
-         num_buckets,
-         num_groups,
+         delta_file_envelope_groups, num_buckets, num_groups, object_store
      )

      peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
@@ -233,6 +233,7 @@ def hash_bucket(
      enable_profiler: bool,
      metrics_config: MetricsConfig,
      read_kwargs_provider: Optional[ReadKwargsProvider],
+     object_store: Optional[IObjectStore],
      deltacat_storage=unimplemented_deltacat_storage,
  ) -> HashBucketResult:

@@ -247,6 +248,7 @@ def hash_bucket(
          num_groups=num_groups,
          enable_profiler=enable_profiler,
          read_kwargs_provider=read_kwargs_provider,
+         object_store=object_store,
          deltacat_storage=deltacat_storage,
      )

deltacat/compute/compactor/steps/materialize.py
@@ -5,11 +5,10 @@ from uuid import uuid4
  from collections import defaultdict
  from contextlib import nullcontext
  from itertools import chain, repeat
- from typing import List, Optional, Tuple, Dict, Any, Union
+ from typing import List, Optional, Tuple, Dict, Any
  import pyarrow as pa
  import numpy as np
  import ray
- from ray import cloudpickle
  from deltacat import logs
  from deltacat.compute.compactor import (
      MaterializeResult,
@@ -28,15 +27,13 @@ from deltacat.storage import (
      PartitionLocator,
      Manifest,
      ManifestEntry,
-     LocalDataset,
-     LocalTable,
-     DistributedDataset,
  )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
  from deltacat.utils.performance import timed_invocation
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.pyarrow import (
      ReadKwargsProviderPyArrowCsvPureUtf8,
      ReadKwargsProviderPyArrowSchemaOverride,
@@ -64,29 +61,15 @@ def materialize(
      dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
      max_records_per_output_file: int,
      compacted_file_content_type: ContentType,
+     enable_manifest_entry_copy_by_reference: bool,
      enable_profiler: bool,
      metrics_config: MetricsConfig,
      schema: Optional[pa.Schema] = None,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
      s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     object_store: Optional[IObjectStore] = None,
      deltacat_storage=unimplemented_deltacat_storage,
  ):
-     def _stage_delta_implementation(
-         data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-         partition: Partition,
-         stage_delta_from_existing_manifest: Optional[bool],
-     ) -> Delta:
-         if stage_delta_from_existing_manifest:
-             delta = Delta.of(
-                 locator=DeltaLocator.of(partition.locator),
-                 delta_type=DeltaType.UPSERT,
-                 meta=manifest.meta,
-                 manifest=data,
-                 previous_stream_position=partition.stream_position,
-                 properties={},
-             )
-         return delta
-
      def _stage_delta_from_manifest_entry_reference_list(
          manifest_entry_list_reference: List[ManifestEntry],
          partition: Partition,
@@ -96,10 +79,13 @@ def materialize(
              delta_type == DeltaType.UPSERT
          ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
          manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-         delta = _stage_delta_implementation(
-             data=manifest,
-             partition=partition,
-             stage_delta_from_existing_manifest=True,
+         delta = Delta.of(
+             locator=DeltaLocator.of(partition.locator),
+             delta_type=delta_type,
+             meta=manifest.meta,
+             manifest=manifest,
+             previous_stream_position=partition.stream_position,
+             properties={},
          )
          return delta

@@ -161,18 +147,11 @@ def materialize(
          f"dedupe_{worker_id}_{task_id}.bin"
      ) if enable_profiler else nullcontext():
          start = time.time()
-         dedupe_task_idx_and_obj_ref_tuples = [
-             (
-                 t1,
-                 cloudpickle.loads(t2),
-             )
-             for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-         ]
          logger.info(f"Resolved materialize task obj refs...")
-         dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+         dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
          # this depends on `ray.get` result order matching input order, as per the
          # contract established in: https://github.com/ray-project/ray/pull/16763
-         src_file_records_list = ray.get(list(obj_refs))
+         src_file_records_list = object_store.get_many(list(obj_refs))
          all_src_file_records = defaultdict(list)
          for i, src_file_records in enumerate(src_file_records_list):
              dedupe_task_idx = dedupe_task_indices[i]
@@ -231,7 +210,9 @@ def materialize(
                  record_numbers_length += 1
                  mask_pylist[record_number] = True
              if (
-                 record_numbers_length == src_file_record_count
+                 round_completion_info
+                 and enable_manifest_entry_copy_by_reference
+                 and record_numbers_length == src_file_record_count
                  and src_file_partition_locator
                  == round_completion_info.compacted_delta_locator.partition_locator
              ):
@@ -244,8 +225,8 @@ def materialize(
                  manifest_entry_list_reference.append(untouched_src_manifest_entry)
                  referenced_pyarrow_write_result = PyArrowWriteResult.of(
                      1,
-                     manifest.meta.source_content_length,
-                     manifest.meta.content_length,
+                     untouched_src_manifest_entry.meta.source_content_length,
+                     untouched_src_manifest_entry.meta.content_length,
                      src_file_record_count,
                  )
                  referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
deltacat/compute/compactor/utils/primary_key_index.py
@@ -7,7 +7,6 @@ import numpy as np
  import pyarrow as pa
  import ray
  import s3fs
- from ray import cloudpickle
  from ray.types import ObjectRef

  from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.tables import get_table_slicer, get_table_writer
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
+ from deltacat.io.object_store import IObjectStore

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(


  def group_hash_bucket_indices(
-     hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+     hash_bucket_object_groups: np.ndarray,
+     num_buckets: int,
+     num_groups: int,
+     object_store: Optional[IObjectStore] = None,
  ) -> Tuple[np.ndarray, List[ObjectRef]]:
      """
      Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
      for hb_group, obj in enumerate(hb_group_to_object):
          if obj is None:
              continue
-         obj_ref = ray.put(obj)
-         pickled_obj_ref = cloudpickle.dumps(obj_ref)
-         object_refs.append(pickled_obj_ref)
-         hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-         # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-         # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-         # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-         # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-         # The object now has a permanent reference and the data can't be freed from Ray's object store.
-         # Manually deleting the untrackable object references offsets these permanent references and
-         # helps to allow these objects to be garbage collected normally.
-         del obj_ref
-         del pickled_obj_ref
+         object_ref = object_store.put(obj)
+         object_refs.append(object_ref)
+         hash_bucket_group_to_obj_id[hb_group] = object_ref
+         del object_ref
      return hash_bucket_group_to_obj_id, object_refs


deltacat/io/file_object_store.py (new)
@@ -0,0 +1,48 @@
+ import logging
+ from ray import cloudpickle
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import os
+ import uuid
+ from builtins import open
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class FileObjectStore(IObjectStore):
+     """
+     An implementation of object store that uses file system.
+     """
+
+     def __init__(self, dir_path: str) -> None:
+         self.dir_path = dir_path
+         super().__init__()
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         result = []
+
+         for obj in objects:
+             serialized = cloudpickle.dumps(obj)
+             ref = f"{self.dir_path}/{uuid.uuid4()}"
+             with open(ref, "xb") as f:
+                 f.write(serialized)
+
+             result.append(ref)
+
+         return result
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         result = []
+         start = time.monotonic()
+         for ref in refs:
+             with open(ref, "rb") as f:
+                 serialized = f.read()
+                 loaded = cloudpickle.loads(serialized)
+                 result.append(loaded)
+             os.remove(ref)
+         end = time.monotonic()
+
+         logger.info(f"The total time taken to read all objects is: {end - start}")
+         return result
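For orientation, a quick round-trip through the new FileObjectStore (this example is not part of the diff): it assumes the target directory already exists, and since get_many deletes each backing file after reading it, a reference is effectively single-use.

    import tempfile
    from deltacat.io.file_object_store import FileObjectStore

    store = FileObjectStore(dir_path=tempfile.mkdtemp())   # writable, pre-existing dir
    refs = store.put_many([{"pk": 1}, {"pk": 2}])          # one file per object
    assert store.get_many(refs) == [{"pk": 1}, {"pk": 2}]  # files are removed after the read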
deltacat/io/memcached_object_store.py (new)
@@ -0,0 +1,121 @@
+ import logging
+ from ray import cloudpickle
+ from collections import defaultdict
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import uuid
+ import socket
+ from pymemcache.client.base import Client
+ from pymemcache.client.retrying import RetryingClient
+ from pymemcache.exceptions import MemcacheUnexpectedCloseError
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class MemcachedObjectStore(IObjectStore):
+     """
+     An implementation of object store that uses Memcached.
+     """
+
+     def __init__(self, port=11212) -> None:
+         self.client_cache = {}
+         self.current_ip = None
+         self.SEPARATOR = "_"
+         self.port = port
+         super().__init__()
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         input = {}
+         result = []
+         current_ip = self._get_current_ip()
+         for obj in objects:
+             serialized = cloudpickle.dumps(obj)
+             uid = uuid.uuid4()
+             ref = self._create_ref(uid, current_ip)
+             input[uid.__str__()] = serialized
+             result.append(ref)
+
+         client = self._get_client_by_ip(current_ip)
+         if client.set_many(input, noreply=False):
+             raise RuntimeError("Unable to write few keys to cache")
+
+         return result
+
+     def put(self, obj: object, *args, **kwargs) -> Any:
+         serialized = cloudpickle.dumps(obj)
+         uid = uuid.uuid4()
+         current_ip = self._get_current_ip()
+         ref = self._create_ref(uid, current_ip)
+         client = self._get_client_by_ip(current_ip)
+
+         if client.set(uid.__str__(), serialized):
+             return ref
+         else:
+             raise RuntimeError("Unable to write to cache")
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         result = []
+         uid_per_ip = defaultdict(lambda: [])
+
+         start = time.monotonic()
+         for ref in refs:
+             uid, ip = ref.split(self.SEPARATOR)
+             uid_per_ip[ip].append(uid)
+
+         for (ip, uids) in uid_per_ip.items():
+             client = self._get_client_by_ip(ip)
+             cache_result = client.get_many(uids)
+             assert len(cache_result) == len(
+                 uids
+             ), f"Not all values were returned from cache as {len(cache_result)} != {len(uids)}"
+
+             values = cache_result.values()
+             total_bytes = 0
+
+             deserialize_start = time.monotonic()
+             for serialized in values:
+                 deserialized = cloudpickle.loads(serialized)
+                 total_bytes += len(serialized)
+                 result.append(deserialized)
+
+             deserialize_end = time.monotonic()
+             logger.debug(
+                 f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+             )
+
+         end = time.monotonic()
+
+         logger.info(f"The total time taken to read all objects is: {end - start}")
+         return result
+
+     def get(self, ref: Any, *args, **kwargs) -> object:
+         uid, ip = ref.split(self.SEPARATOR)
+         client = self._get_client_by_ip(ip)
+         serialized = client.get(uid)
+         return cloudpickle.loads(serialized)
+
+     def _create_ref(self, uid, ip) -> str:
+         return f"{uid}{self.SEPARATOR}{ip}"
+
+     def _get_client_by_ip(self, ip_address: str):
+         if ip_address in self.client_cache:
+             return self.client_cache[ip_address]
+
+         base_client = Client((ip_address, self.port))
+         client = RetryingClient(
+             base_client,
+             attempts=3,
+             retry_delay=0.01,
+             retry_for=[MemcacheUnexpectedCloseError],
+         )
+
+         self.client_cache[ip_address] = client
+         return client
+
+     def _get_current_ip(self):
+         if self.current_ip is None:
+             self.current_ip = socket.gethostbyname(socket.gethostname())
+
+         return self.current_ip
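A hedged usage sketch for the new MemcachedObjectStore: it assumes a memcached server is reachable on each participating node at the non-default port 11212 used above. Because every reference embeds the writer's IP, get_many can route reads back to the node that wrote each key.

    from deltacat.io.memcached_object_store import MemcachedObjectStore

    store = MemcachedObjectStore(port=11212)  # requires memcached listening on 11212
    ref = store.put({"pk": 42})               # ref has the form "<uuid>_<writer ip>"
    assert store.get(ref) == {"pk": 42}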
deltacat/io/object_store.py (new)
@@ -0,0 +1,51 @@
+ from typing import List, Any
+
+
+ class IObjectStore:
+     """
+     An object store interface.
+     """
+
+     def setup(self, *args, **kwargs) -> Any:
+         ...
+
+     """
+     Sets up everything needed to run the object store.
+     """
+
+     def put(self, obj: object, *args, **kwargs) -> Any:
+         """
+         Put a single object into the object store. Depending
+         on the implementation, this method can be sync or async.
+         """
+         return self.put_many([obj])[0]
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         ...
+
+     """
+     Put many objects into the object store. It would return an ordered list
+     of object references corresponding to each object in the input.
+     """
+
+     def get(self, ref: Any, *args, **kwargs) -> object:
+         """
+         Get a single object from an object store.
+         """
+         return self.get_many([ref])[0]
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         ...
+
+     """
+     Get a list of objects from the object store. Use this method to
+     avoid multiple get calls. Note that depending on implementation it may
+     or may not return ordered results.
+     """
+
+     def clear(self, *args, **kwargs) -> bool:
+         ...
+
+     """
+     Clears the object store and all the associated data in it.
+     """
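Because the default put and get above simply delegate to put_many and get_many, a new backend only needs to implement the batched pair (plus clear, if supported). A minimal in-memory implementation, shown purely to illustrate the contract and not shipped in the package:

    import uuid
    from typing import Any, List

    from deltacat.io.object_store import IObjectStore

    class InMemoryObjectStore(IObjectStore):
        # Illustrative single-process backend: refs are plain UUID strings.
        def __init__(self) -> None:
            self.data = {}

        def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
            refs = []
            for obj in objects:
                ref = str(uuid.uuid4())
                self.data[ref] = obj
                refs.append(ref)
            return refs

        def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
            return [self.data[ref] for ref in refs]

        def clear(self, *args, **kwargs) -> bool:
            self.data.clear()
            return True

    store = InMemoryObjectStore()
    ref = store.put("hello")          # inherited put delegates to put_many
    assert store.get(ref) == "hello"  # inherited get delegates to get_many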
deltacat/io/ray_plasma_object_store.py (new)
@@ -0,0 +1,23 @@
+ import ray
+ from ray import cloudpickle
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+
+
+ class RayPlasmaObjectStore(IObjectStore):
+     """
+     An implementation of object store that uses Ray plasma object store.
+     """
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         result = []
+         for obj in objects:
+             object_ref = ray.put(obj)
+             pickled = cloudpickle.dumps(object_ref)
+             result.append(pickled)
+
+         return result
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         loaded_refs = [cloudpickle.loads(obj_id) for obj_id in refs]
+         return ray.get(loaded_refs)
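RayPlasmaObjectStore wraps the pre-diff pattern (ray.put followed by cloudpickle.dumps of the ObjectRef) behind the new interface, so the references it returns are plain bytes that can be shipped between tasks. A round-trip, assuming an initialized Ray runtime; note that, as the comment removed from primary_key_index.py explains, pickled refs pin their objects in plasma until explicitly deleted.

    import ray
    from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

    ray.init(ignore_reinit_error=True)
    store = RayPlasmaObjectStore()
    refs = store.put_many(["a", "b"])          # each ref is a cloudpickled ObjectRef
    assert store.get_many(refs) == ["a", "b"]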