deltacat 0.1.18b12__tar.gz → 0.1.18b13__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/PKG-INFO +1 -1
  2. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/__init__.py +1 -1
  3. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/repartition_session.py +3 -1
  4. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/steps/repartition.py +4 -1
  5. deltacat-0.1.18b13/deltacat/compute/compactor/utils/primary_key_index.py +86 -0
  6. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/constants.py +3 -4
  7. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/placement.py +7 -2
  8. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat.egg-info/PKG-INFO +1 -1
  9. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat.egg-info/SOURCES.txt +0 -3
  10. deltacat-0.1.18b12/deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -57
  11. deltacat-0.1.18b12/deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -48
  12. deltacat-0.1.18b12/deltacat/compute/compactor/utils/primary_key_index.py +0 -307
  13. deltacat-0.1.18b12/deltacat/utils/ray_utils/__init__.py +0 -0
  14. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/MANIFEST.in +0 -0
  15. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/README.md +0 -0
  16. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/__init__.py +0 -0
  17. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/clients.py +0 -0
  18. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/constants.py +0 -0
  19. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/redshift/__init__.py +0 -0
  20. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/redshift/model/__init__.py +0 -0
  21. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/redshift/model/manifest.py +0 -0
  22. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/aws/s3u.py +0 -0
  23. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/catalog/__init__.py +0 -0
  24. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/catalog/delegate.py +0 -0
  25. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/catalog/interface.py +0 -0
  26. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/catalog/model/__init__.py +0 -0
  27. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/catalog/model/catalog.py +0 -0
  28. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/catalog/model/table_definition.py +0 -0
  29. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/__init__.py +0 -0
  30. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/__init__.py +0 -0
  31. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/compaction_session.py +0 -0
  32. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/__init__.py +0 -0
  33. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  34. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  35. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  36. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  37. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  38. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  39. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  40. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  41. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  42. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  43. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  44. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  45. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/model/sort_key.py +0 -0
  46. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/steps/__init__.py +0 -0
  47. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/steps/dedupe.py +0 -0
  48. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
  49. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/steps/materialize.py +0 -0
  50. {deltacat-0.1.18b12/deltacat/compute/compactor/steps/rehash → deltacat-0.1.18b13/deltacat/compute/compactor/utils}/__init__.py +0 -0
  51. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/utils/io.py +0 -0
  52. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  53. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  54. {deltacat-0.1.18b12/deltacat/compute/compactor/utils → deltacat-0.1.18b13/deltacat/compute/metastats}/__init__.py +0 -0
  55. {deltacat-0.1.18b12/deltacat/compute/metastats → deltacat-0.1.18b13/deltacat/compute/metastats/config}/__init__.py +0 -0
  56. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/meta_stats.py +0 -0
  57. {deltacat-0.1.18b12/deltacat/compute/metastats/config → deltacat-0.1.18b13/deltacat/compute/metastats/model}/__init__.py +0 -0
  58. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
  59. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
  60. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/stats.py +0 -0
  61. {deltacat-0.1.18b12/deltacat/compute/metastats/model → deltacat-0.1.18b13/deltacat/compute/metastats/utils}/__init__.py +0 -0
  62. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/utils/constants.py +0 -0
  63. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/utils/io.py +0 -0
  64. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
  65. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
  66. {deltacat-0.1.18b12/deltacat/compute/metastats/utils → deltacat-0.1.18b13/deltacat/compute/stats}/__init__.py +0 -0
  67. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/basic.py +0 -0
  68. {deltacat-0.1.18b12/deltacat/compute/stats → deltacat-0.1.18b13/deltacat/compute/stats/models}/__init__.py +0 -0
  69. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  70. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/models/delta_stats.py +0 -0
  71. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  72. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  73. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/models/stats_result.py +0 -0
  74. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/types.py +0 -0
  75. {deltacat-0.1.18b12/deltacat/compute/stats/models → deltacat-0.1.18b13/deltacat/compute/stats/utils}/__init__.py +0 -0
  76. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/utils/intervals.py +0 -0
  77. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/utils/io.py +0 -0
  78. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
  79. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/exceptions.py +0 -0
  80. {deltacat-0.1.18b12/deltacat/compute/stats/utils → deltacat-0.1.18b13/deltacat/io}/__init__.py +0 -0
  81. {deltacat-0.1.18b12/deltacat/io → deltacat-0.1.18b13/deltacat/io/aws}/__init__.py +0 -0
  82. {deltacat-0.1.18b12/deltacat/io/aws → deltacat-0.1.18b13/deltacat/io/aws/redshift}/__init__.py +0 -0
  83. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
  84. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/dataset.py +0 -0
  85. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/file_object_store.py +0 -0
  86. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/memcached_object_store.py +0 -0
  87. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/object_store.py +0 -0
  88. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/ray_plasma_object_store.py +0 -0
  89. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/read_api.py +0 -0
  90. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/redis_object_store.py +0 -0
  91. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/io/s3_object_store.py +0 -0
  92. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/logs.py +0 -0
  93. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/__init__.py +0 -0
  94. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/interface.py +0 -0
  95. {deltacat-0.1.18b12/deltacat/io/aws/redshift → deltacat-0.1.18b13/deltacat/storage/model}/__init__.py +0 -0
  96. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/delta.py +0 -0
  97. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/list_result.py +0 -0
  98. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/locator.py +0 -0
  99. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/namespace.py +0 -0
  100. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/partition.py +0 -0
  101. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/stream.py +0 -0
  102. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/table.py +0 -0
  103. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/table_version.py +0 -0
  104. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/storage/model/types.py +0 -0
  105. {deltacat-0.1.18b12/deltacat/storage/model → deltacat-0.1.18b13/deltacat/tests}/__init__.py +0 -0
  106. {deltacat-0.1.18b12/deltacat/tests → deltacat-0.1.18b13/deltacat/tests/compactor}/__init__.py +0 -0
  107. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/compactor/test_compact_partition_params.py +0 -0
  108. {deltacat-0.1.18b12/deltacat/tests/compactor → deltacat-0.1.18b13/deltacat/tests/compactor/utils}/__init__.py +0 -0
  109. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/compactor/utils/test_io.py +0 -0
  110. {deltacat-0.1.18b12/deltacat/tests/compactor/utils → deltacat-0.1.18b13/deltacat/tests/io}/__init__.py +0 -0
  111. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/io/test_file_object_store.py +0 -0
  112. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/io/test_memcached_object_store.py +0 -0
  113. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
  114. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/io/test_redis_object_store.py +0 -0
  115. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/io/test_s3_object_store.py +0 -0
  116. {deltacat-0.1.18b12/deltacat/tests/io → deltacat-0.1.18b13/deltacat/tests/stats}/__init__.py +0 -0
  117. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/stats/test_intervals.py +0 -0
  118. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/test_repartition.py +0 -0
  119. {deltacat-0.1.18b12/deltacat/tests/stats → deltacat-0.1.18b13/deltacat/tests/test_utils}/__init__.py +0 -0
  120. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/test_utils/constants.py +0 -0
  121. {deltacat-0.1.18b12/deltacat/tests/test_utils → deltacat-0.1.18b13/deltacat/tests/utils}/__init__.py +0 -0
  122. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  123. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/tests/utils/test_resources.py +0 -0
  124. {deltacat-0.1.18b12/deltacat/tests/utils → deltacat-0.1.18b13/deltacat/types}/__init__.py +0 -0
  125. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/types/media.py +0 -0
  126. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/types/tables.py +0 -0
  127. {deltacat-0.1.18b12/deltacat/types → deltacat-0.1.18b13/deltacat/utils}/__init__.py +0 -0
  128. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/common.py +0 -0
  129. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/metrics.py +0 -0
  130. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/numpy.py +0 -0
  131. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/pandas.py +0 -0
  132. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/performance.py +0 -0
  133. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/pyarrow.py +0 -0
  134. {deltacat-0.1.18b12/deltacat/utils → deltacat-0.1.18b13/deltacat/utils/ray_utils}/__init__.py +0 -0
  135. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/ray_utils/collections.py +0 -0
  136. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/ray_utils/concurrency.py +0 -0
  137. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/ray_utils/dataset.py +0 -0
  138. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/ray_utils/performance.py +0 -0
  139. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/ray_utils/runtime.py +0 -0
  140. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat/utils/resources.py +0 -0
  141. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat.egg-info/dependency_links.txt +0 -0
  142. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat.egg-info/requires.txt +0 -0
  143. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/deltacat.egg-info/top_level.txt +0 -0
  144. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/setup.cfg +0 -0
  145. {deltacat-0.1.18b12 → deltacat-0.1.18b13}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 0.1.18b12
3
+ Version: 0.1.18b13
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
43
43
 
44
44
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
45
45
 
46
- __version__ = "0.1.18b12"
46
+ __version__ = "0.1.18b13"
47
47
 
48
48
 
49
49
  __all__ = [
@@ -144,7 +144,9 @@ def repartition(
144
144
  logger.info(f"repartition {repar_end - repar_start} seconds")
145
145
  logger.info(f"Got {len(ordered_deltas)} task results.")
146
146
  # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
147
- merged_delta = Delta.merge_deltas(ordered_deltas)
147
+ merged_delta = Delta.merge_deltas(
148
+ ordered_deltas, stream_position=last_stream_position_to_compact
149
+ )
148
150
  compacted_delta = deltacat_storage.commit_delta(
149
151
  merged_delta, properties=kwargs.get("properties", {})
150
152
  )
@@ -2,6 +2,7 @@ import importlib
2
2
  import logging
3
3
  from contextlib import nullcontext
4
4
  import pyarrow.compute as pc
5
+ from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
5
6
  import pyarrow as pa
6
7
  from typing import List, Optional
7
8
  from deltacat.types.media import StorageType, ContentType
@@ -93,7 +94,9 @@ def repartition_range(
93
94
  if not all(column in table.column_names for table in tables):
94
95
  raise ValueError(f"Column {column} does not exist in the table")
95
96
  partition_ranges.sort()
96
- partition_ranges = [-float("Inf")] + partition_ranges + [float("Inf")]
97
+ partition_ranges = (
98
+ [SIGNED_INT64_MIN_VALUE] + partition_ranges + [SIGNED_INT64_MAX_VALUE]
99
+ )
97
100
  partitioned_tables_list = [[] for _ in range(len(partition_ranges) - 1)]
98
101
 
99
102
  total_record_count = 0
@@ -0,0 +1,86 @@
1
+ import logging
2
+ from typing import List, Optional, Tuple
3
+
4
+ import numpy as np
5
+ import pyarrow as pa
6
+ from ray.types import ObjectRef
7
+
8
+ from deltacat import logs
9
+ from deltacat.aws import s3u
10
+ from deltacat.compute.compactor import (
11
+ PrimaryKeyIndexVersionLocator,
12
+ )
13
+ from deltacat.compute.compactor.utils import system_columns as sc
14
+ from deltacat.io.object_store import IObjectStore
15
+
16
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
17
+
18
+
19
+ def delete_primary_key_index_version(
20
+ s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
21
+ ) -> None:
22
+
23
+ logger.info(f"Deleting primary key index: {pki_version_locator}")
24
+ s3u.delete_files_by_prefix(
25
+ s3_bucket,
26
+ pki_version_locator.primary_key_index_version_root_path,
27
+ )
28
+ logger.info(f"Primary key index deleted: {pki_version_locator}")
29
+
30
+
31
+ def group_record_indices_by_hash_bucket(
32
+ pki_table: pa.Table, num_buckets: int
33
+ ) -> np.ndarray:
34
+
35
+ hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
36
+ record_index = 0
37
+ for digest in sc.pk_hash_column_np(pki_table):
38
+ hash_bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
39
+ if hash_bucket_to_indices[hash_bucket] is None:
40
+ hash_bucket_to_indices[hash_bucket] = []
41
+ hash_bucket_to_indices[hash_bucket].append(record_index)
42
+ record_index += 1
43
+ return hash_bucket_to_indices
44
+
45
+
46
+ def group_hash_bucket_indices(
47
+ hash_bucket_object_groups: np.ndarray,
48
+ num_buckets: int,
49
+ num_groups: int,
50
+ object_store: Optional[IObjectStore] = None,
51
+ ) -> Tuple[np.ndarray, List[ObjectRef]]:
52
+ """
53
+ Groups all the ObjectRefs that belong to a particular hash bucket group and hash bucket index.
54
+ """
55
+
56
+ object_refs = []
57
+ hash_bucket_group_to_obj_id = np.empty([num_groups], dtype="object")
58
+
59
+ if hash_bucket_object_groups is None:
60
+ return hash_bucket_group_to_obj_id, object_refs
61
+
62
+ hb_group_to_object = np.empty([num_groups], dtype="object")
63
+ for hb_index, obj in enumerate(hash_bucket_object_groups):
64
+ if obj:
65
+ hb_group = hb_index % num_groups
66
+ if hb_group_to_object[hb_group] is None:
67
+ hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
68
+ hb_group_to_object[hb_group][hb_index] = obj
69
+
70
+ for hb_group, obj in enumerate(hb_group_to_object):
71
+ if obj is None:
72
+ continue
73
+ object_ref = object_store.put(obj)
74
+ object_refs.append(object_ref)
75
+ hash_bucket_group_to_obj_id[hb_group] = object_ref
76
+ del object_ref
77
+ return hash_bucket_group_to_obj_id, object_refs
78
+
79
+
80
+ def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
81
+ """
82
+ Deterministically get the hash bucket a particular digest belongs to
83
+ based on number of total hash buckets.
84
+ """
85
+
86
+ return int.from_bytes(digest, "big") % num_buckets
@@ -36,6 +36,9 @@ BYTES_PER_GIBIBYTE = 2**30
36
36
  BYTES_PER_TEBIBYTE = 2**40
37
37
  BYTES_PER_PEBIBYTE = 2**50
38
38
 
39
+ SIGNED_INT64_MIN_VALUE = -(2**63)
40
+ SIGNED_INT64_MAX_VALUE = 2**63 - 1
41
+
39
42
  # Inflation multiplier from snappy-compressed parquet to pyarrow.
40
43
  # This should be kept larger than actual average inflation multipliers.
41
44
  # Note that this is a very rough guess since actual observed pyarrow
@@ -49,8 +52,4 @@ PYARROW_INFLATION_MULTIPLIER = 2.5
49
52
  # Inflation multiplier from snappy-compressed parquet to pyarrow for all columns.
50
53
  PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6
51
54
 
52
- PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG = {
53
- "retries": {"max_attempts": 25, "mode": "standard"}
54
- }
55
-
56
55
  MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
@@ -229,8 +229,13 @@ class PlacementGroupManager:
229
229
 
230
230
  def get_current_node_resource_key(self) -> str:
231
231
  # on ec2: address="172.31.34.51:6379"
232
- # on manta: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
233
- current_node_name = ray.experimental.internal_kv.global_gcs_client.address[:-5]
232
+ # on AWS Glue for Ray: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
233
+ (
234
+ current_node_name,
235
+ _,
236
+ ) = ray.experimental.internal_kv.global_gcs_client.address.rsplit(
237
+ ":", 1
238
+ ) # rsplit splits on the last occurrence of the delimiter ":"
234
239
  for node in ray.nodes():
235
240
  if node["NodeName"] == current_node_name:
236
241
  # Found the node.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 0.1.18b12
3
+ Version: 0.1.18b13
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -46,9 +46,6 @@ deltacat/compute/compactor/steps/dedupe.py
46
46
  deltacat/compute/compactor/steps/hash_bucket.py
47
47
  deltacat/compute/compactor/steps/materialize.py
48
48
  deltacat/compute/compactor/steps/repartition.py
49
- deltacat/compute/compactor/steps/rehash/__init__.py
50
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py
51
- deltacat/compute/compactor/steps/rehash/rewrite_index.py
52
49
  deltacat/compute/compactor/utils/__init__.py
53
50
  deltacat/compute/compactor/utils/io.py
54
51
  deltacat/compute/compactor/utils/primary_key_index.py
@@ -1,57 +0,0 @@
1
- import logging
2
- from typing import List, Tuple
3
-
4
- import numpy as np
5
- import pyarrow as pa
6
- import ray
7
- from ray.types import ObjectRef
8
-
9
- from deltacat import logs
10
- from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
11
- from deltacat.compute.compactor.utils import primary_key_index as pki
12
-
13
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
14
-
15
-
16
- def group_file_records_by_pk_hash_bucket(
17
- pki_table: pa.Table, num_buckets: int
18
- ) -> np.ndarray:
19
- # generate the new table for each new hash bucket
20
- hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
21
- pki_table,
22
- num_buckets,
23
- )
24
- hash_bucket_to_table = np.empty([num_buckets], dtype="object")
25
- for hash_bucket, indices in enumerate(hash_bucket_to_indices):
26
- if indices:
27
- hash_bucket_to_table[hash_bucket] = pki_table.take(indices)
28
- return hash_bucket_to_table
29
-
30
-
31
- @ray.remote(num_cpus=1, num_returns=2)
32
- def rehash_bucket(
33
- hash_bucket_index: int,
34
- s3_bucket: str,
35
- old_pki_version_locator: PrimaryKeyIndexVersionLocator,
36
- num_buckets: int,
37
- num_groups: int,
38
- ) -> Tuple[np.ndarray, List[ObjectRef]]:
39
-
40
- logger.info(f"Starting rehash bucket task...")
41
- tables = pki.download_hash_bucket_entries(
42
- s3_bucket,
43
- hash_bucket_index,
44
- old_pki_version_locator,
45
- )
46
- prior_pk_index_table = pa.concat_tables(tables)
47
- hash_bucket_to_table = group_file_records_by_pk_hash_bucket(
48
- prior_pk_index_table,
49
- num_buckets,
50
- )
51
- hash_bucket_group_to_obj_id, object_refs = pki.group_hash_bucket_indices(
52
- hash_bucket_to_table,
53
- num_buckets,
54
- num_groups,
55
- )
56
- logger.info(f"Finished rehash bucket task...")
57
- return hash_bucket_group_to_obj_id, object_refs
@@ -1,48 +0,0 @@
1
- import logging
2
- from collections import defaultdict
3
- from typing import Any, List, Tuple
4
-
5
- import pyarrow as pa
6
- import ray
7
- from ray import cloudpickle
8
- from ray.types import ObjectRef
9
-
10
- from deltacat import logs
11
- from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
12
- from deltacat.compute.compactor.utils import primary_key_index as pki
13
-
14
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
15
-
16
-
17
- @ray.remote(num_cpus=1, num_returns=2)
18
- def rewrite_index(
19
- object_ids: List[Any],
20
- s3_bucket: str,
21
- new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
22
- max_records_per_index_file: int,
23
- ) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:
24
-
25
- logger.info(f"Starting rewrite primary key index task...")
26
- object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
27
- logger.info(f"Getting table groups object refs...")
28
- table_groups_list = ray.get(object_refs)
29
- logger.info(f"Got {len(table_groups_list)} table groups object refs...")
30
- hb_index_to_tables = defaultdict(list)
31
- for table_groups in table_groups_list:
32
- for hb_index, table in enumerate(table_groups):
33
- if table is not None:
34
- hb_index_to_tables[hb_index].append(table)
35
- logger.info(f"Running {len(hb_index_to_tables)} rewrite index rounds...")
36
- pki_stats = []
37
- for hb_index, tables in hb_index_to_tables.items():
38
- table = pa.concat_tables(tables)
39
- hb_pki_stats = pki.write_primary_key_index_files(
40
- table,
41
- new_primary_key_index_version_locator,
42
- s3_bucket,
43
- hb_index,
44
- max_records_per_index_file,
45
- )
46
- pki_stats.append(hb_pki_stats)
47
- logger.info(f"Finished rewrite primary key index task...")
48
- return PyArrowWriteResult.union(pki_stats), object_refs
@@ -1,307 +0,0 @@
1
- import json
2
- import logging
3
- from collections import defaultdict
4
- from typing import Any, Callable, Dict, List, Optional, Tuple
5
-
6
- import numpy as np
7
- import pyarrow as pa
8
- import ray
9
- import s3fs
10
- from ray.types import ObjectRef
11
-
12
- from deltacat import logs
13
- from deltacat.aws import s3u
14
- from deltacat.compute.compactor import (
15
- PrimaryKeyIndexLocator,
16
- PrimaryKeyIndexMeta,
17
- PrimaryKeyIndexVersionLocator,
18
- PrimaryKeyIndexVersionMeta,
19
- PyArrowWriteResult,
20
- RoundCompletionInfo,
21
- )
22
- from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
23
- from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
24
- from deltacat.compute.compactor.utils import round_completion_file as rcf
25
- from deltacat.compute.compactor.utils import system_columns as sc
26
- from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
27
- from deltacat.storage import Manifest, PartitionLocator
28
- from deltacat.types.media import ContentEncoding, ContentType
29
- from deltacat.types.tables import get_table_slicer, get_table_writer
30
- from deltacat.utils.common import ReadKwargsProvider
31
- from deltacat.utils.ray_utils.concurrency import invoke_parallel
32
- from deltacat.io.object_store import IObjectStore
33
-
34
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
35
-
36
-
37
- def rehash(
38
- options_provider: Callable[[int, Any], Dict[str, Any]],
39
- s3_bucket: str,
40
- source_partition_locator: PartitionLocator,
41
- old_rci: RoundCompletionInfo,
42
- new_hash_bucket_count: int,
43
- hash_bucket_index_group_count: int,
44
- records_per_primary_key_index_file: int,
45
- delete_old_primary_key_index: bool,
46
- ) -> RoundCompletionInfo:
47
-
48
- logger.info(
49
- f"Rehashing primary key index. Old round completion info: "
50
- f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
51
- )
52
-
53
- # collect old primary key index information
54
- old_pki_version_locator = old_rci.primary_key_index_version_locator
55
- old_pkiv_meta = old_pki_version_locator.primary_key_index_version_meta
56
- old_pki_meta = old_pkiv_meta.primary_key_index_meta
57
- old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
58
- if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
59
- raise ValueError(
60
- f"Primary key index rehash failed. Old hash bucket "
61
- f"count ({new_hash_bucket_count}) is "
62
- f"equal to new hash bucket count. Partition: "
63
- f"{old_compacted_partition_locator}."
64
- )
65
-
66
- # generate a new unique primary key index version locator to rehash into
67
- new_pki_meta = PrimaryKeyIndexMeta.of(
68
- old_compacted_partition_locator,
69
- old_pki_meta.primary_keys,
70
- old_pki_meta.sort_keys,
71
- old_pki_meta.primary_key_index_algorithm_version,
72
- )
73
- new_pki_locator = PrimaryKeyIndexLocator.of(new_pki_meta)
74
- new_pki_version_meta = PrimaryKeyIndexVersionMeta.of(
75
- new_pki_meta,
76
- new_hash_bucket_count,
77
- )
78
- rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
79
- new_pki_version_meta
80
- )
81
-
82
- # launch a rehash task for each bucket of the old primary key index version
83
- old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
84
- hb_tasks_pending = invoke_parallel(
85
- items=range(old_hash_bucket_count),
86
- ray_task=rb.rehash_bucket,
87
- max_parallelism=None,
88
- options_provider=options_provider,
89
- s3_bucket=s3_bucket,
90
- old_pki_version_locator=old_pki_version_locator,
91
- num_buckets=new_hash_bucket_count,
92
- num_groups=hash_bucket_index_group_count,
93
- )
94
- logger.info(f"Getting {len(hb_tasks_pending)} rehash bucket results...")
95
- hb_results = ray.get([t[0] for t in hb_tasks_pending])
96
- logger.info(f"Got {len(hb_results)} rehash bucket results.")
97
- all_hash_group_idx_to_obj_id = defaultdict(list)
98
- for hash_group_idx_to_obj_id in hb_results:
99
- for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
100
- if object_id:
101
- all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
102
- hash_group_count = len(all_hash_group_idx_to_obj_id)
103
- logger.info(f"Rehash bucket groups created: {hash_group_count}")
104
-
105
- # write primary key index files for each rehashed output bucket
106
- pki_stats_promises = invoke_parallel(
107
- items=all_hash_group_idx_to_obj_id.values(),
108
- ray_task=ri.rewrite_index,
109
- max_parallelism=None,
110
- options_provider=options_provider,
111
- s3_bucket=s3_bucket,
112
- new_primary_key_index_version_locator=rehashed_pki_version_locator,
113
- max_records_per_index_file=records_per_primary_key_index_file,
114
- )
115
- logger.info(f"Getting {len(pki_stats_promises)} rewrite index results...")
116
- pki_stats = ray.get([t[0] for t in pki_stats_promises])
117
- logger.info(f"Got {len(pki_stats)} rewrite index results.")
118
-
119
- round_completion_info = RoundCompletionInfo.of(
120
- old_rci.high_watermark,
121
- old_rci.compacted_delta_locator,
122
- old_rci.compacted_pyarrow_write_result,
123
- PyArrowWriteResult.union(pki_stats),
124
- old_rci.sort_keys_bit_width,
125
- rehashed_pki_version_locator,
126
- old_rci.rebase_source_partition_locator,
127
- )
128
- rcf.write_round_completion_file(
129
- s3_bucket,
130
- source_partition_locator,
131
- new_pki_locator.primary_key_index_root_path,
132
- round_completion_info,
133
- )
134
- if delete_old_primary_key_index:
135
- delete_primary_key_index_version(
136
- s3_bucket,
137
- old_pki_version_locator,
138
- )
139
- logger.info(
140
- f"Rehashed primary key index. New round completion info: "
141
- f"{round_completion_info}."
142
- )
143
- return round_completion_info
144
-
145
-
146
- def download_hash_bucket_entries(
147
- s3_bucket: str,
148
- hash_bucket_index: int,
149
- primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
150
- file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
151
- ) -> List[pa.Table]:
152
-
153
- pk_index_manifest_s3_url = (
154
- primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
155
- s3_bucket,
156
- hash_bucket_index,
157
- )
158
- )
159
- result = s3u.download(pk_index_manifest_s3_url, False)
160
- logger.info(
161
- f"Downloading primary key index hash bucket manifest entries: "
162
- f"{pk_index_manifest_s3_url}. Primary key index version "
163
- f"locator: {primary_key_index_version_locator}"
164
- )
165
- pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
166
- tables = s3u.download_manifest_entries(
167
- pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
168
- )
169
- if not tables:
170
- logger.warning(
171
- f"Primary key index manifest is empty at: "
172
- f"{pk_index_manifest_s3_url}. Primary key index version "
173
- f"locator: {primary_key_index_version_locator}"
174
- )
175
- return tables
176
-
177
-
178
- def delete_primary_key_index_version(
179
- s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
180
- ) -> None:
181
-
182
- logger.info(f"Deleting primary key index: {pki_version_locator}")
183
- s3u.delete_files_by_prefix(
184
- s3_bucket,
185
- pki_version_locator.primary_key_index_version_root_path,
186
- )
187
- logger.info(f"Primary key index deleted: {pki_version_locator}")
188
-
189
-
190
- def group_record_indices_by_hash_bucket(
191
- pki_table: pa.Table, num_buckets: int
192
- ) -> np.ndarray:
193
-
194
- hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
195
- record_index = 0
196
- for digest in sc.pk_hash_column_np(pki_table):
197
- hash_bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
198
- if hash_bucket_to_indices[hash_bucket] is None:
199
- hash_bucket_to_indices[hash_bucket] = []
200
- hash_bucket_to_indices[hash_bucket].append(record_index)
201
- record_index += 1
202
- return hash_bucket_to_indices
203
-
204
-
205
- def group_hash_bucket_indices(
206
- hash_bucket_object_groups: np.ndarray,
207
- num_buckets: int,
208
- num_groups: int,
209
- object_store: Optional[IObjectStore] = None,
210
- ) -> Tuple[np.ndarray, List[ObjectRef]]:
211
- """
212
- Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
213
- """
214
-
215
- object_refs = []
216
- hash_bucket_group_to_obj_id = np.empty([num_groups], dtype="object")
217
-
218
- if hash_bucket_object_groups is None:
219
- return hash_bucket_group_to_obj_id, object_refs
220
-
221
- hb_group_to_object = np.empty([num_groups], dtype="object")
222
- for hb_index, obj in enumerate(hash_bucket_object_groups):
223
- if obj:
224
- hb_group = hb_index % num_groups
225
- if hb_group_to_object[hb_group] is None:
226
- hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
227
- hb_group_to_object[hb_group][hb_index] = obj
228
-
229
- for hb_group, obj in enumerate(hb_group_to_object):
230
- if obj is None:
231
- continue
232
- object_ref = object_store.put(obj)
233
- object_refs.append(object_ref)
234
- hash_bucket_group_to_obj_id[hb_group] = object_ref
235
- del object_ref
236
- return hash_bucket_group_to_obj_id, object_refs
237
-
238
-
239
- def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
240
- """
241
- Deterministically get the hash bucket a particular digest belongs to
242
- based on number of total hash buckets.
243
- """
244
-
245
- return int.from_bytes(digest, "big") % num_buckets
246
-
247
-
248
- def write_primary_key_index_files(
249
- table: pa.Table,
250
- primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
251
- s3_bucket: str,
252
- hb_index: int,
253
- records_per_index_file: int,
254
- ) -> PyArrowWriteResult:
255
- """
256
- Writes primary key index files for the given hash bucket index out to the
257
- specified S3 bucket at the path identified by the given primary key index
258
- version locator. Output is written as 1 or more Parquet files with the
259
- given maximum number of records per file.
260
-
261
- TODO(raghumdani): Support writing primary key index to any data catalog
262
- """
263
- logger.info(
264
- f"Writing primary key index files for hash bucket {hb_index}. "
265
- f"Primary key index version locator: "
266
- f"{primary_key_index_version_locator}."
267
- )
268
- s3_file_system = s3fs.S3FileSystem(
269
- anon=False,
270
- s3_additional_kwargs={
271
- "ContentType": ContentType.PARQUET.value,
272
- "ContentEncoding": ContentEncoding.IDENTITY.value,
273
- },
274
- config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
275
- )
276
- pkiv_hb_index_s3_url_base = (
277
- primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
278
- s3_bucket, hb_index
279
- )
280
- )
281
- manifest_entries = s3u.upload_sliced_table(
282
- table,
283
- pkiv_hb_index_s3_url_base,
284
- s3_file_system,
285
- records_per_index_file,
286
- get_table_writer(table),
287
- get_table_slicer(table),
288
- )
289
- manifest = Manifest.of(manifest_entries)
290
- pkiv_hb_index_s3_manifest_s3_url = (
291
- primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
292
- s3_bucket, hb_index
293
- )
294
- )
295
- s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
296
- result = PyArrowWriteResult.of(
297
- len(manifest_entries),
298
- table.nbytes,
299
- manifest.meta.content_length,
300
- len(table),
301
- )
302
- logger.info(
303
- f"Wrote primary key index files for hash bucket {hb_index}. "
304
- f"Primary key index version locator: "
305
- f"{primary_key_index_version_locator}. Result: {result}"
306
- )
307
- return result
File without changes
File without changes
File without changes
File without changes
File without changes