deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -201,7 +201,7 @@ def _timed_hash_bucket(
201
201
  with memray.Tracker(
202
202
  f"hash_bucket_{worker_id}_{task_id}.bin"
203
203
  ) if enable_profiler else nullcontext():
204
- sort_key_names = [key.key_name for key in sort_keys]
204
+ sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
205
205
  if not round_completion_info:
206
206
  is_src_delta = True
207
207
  else:
@@ -25,9 +25,10 @@ from deltacat.storage import (
25
25
  DeltaType,
26
26
  Partition,
27
27
  PartitionLocator,
28
- Manifest,
29
28
  ManifestEntry,
29
+ ManifestEntryList,
30
30
  )
31
+ from deltacat.storage.model.manifest import Manifest
31
32
  from deltacat.storage import interface as unimplemented_deltacat_storage
32
33
  from deltacat.utils.common import ReadKwargsProvider
33
34
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -82,7 +83,10 @@ def materialize(
82
83
  assert (
83
84
  delta_type == DeltaType.UPSERT
84
85
  ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
85
- manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
86
+ manifest = Manifest.of(
87
+ entries=ManifestEntryList.of(manifest_entry_list_reference),
88
+ uuid=str(uuid4()),
89
+ )
86
90
  delta = Delta.of(
87
91
  locator=DeltaLocator.of(partition.locator),
88
92
  delta_type=delta_type,
@@ -358,7 +358,7 @@ def fit_input_deltas(
358
358
  def _discover_deltas(
359
359
  source_partition_locator: PartitionLocator,
360
360
  start_position_exclusive: Optional[int],
361
- end_position_inclusive: int,
361
+ end_position_inclusive: Optional[int],
362
362
  deltacat_storage=unimplemented_deltacat_storage,
363
363
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
364
364
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
@@ -1,6 +1,7 @@
1
1
  import pyarrow as pa
2
2
  from typing import List
3
- from deltacat.storage import PartitionLocator, SortKey
3
+ from itertools import chain
4
+ from deltacat.storage import PartitionLocator, SortKey, TransformName
4
5
 
5
6
  MAX_SORT_KEYS_BIT_WIDTH = 256
6
7
 
@@ -22,7 +23,13 @@ def validate_sort_keys(
22
23
  deltacat_storage_kwargs = {}
23
24
  total_sort_keys_bit_width = 0
24
25
  if sort_keys:
25
- sort_key_names = [key.key_name for key in sort_keys]
26
+ sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
27
+ assert all(
28
+ [
29
+ key.transform is None or key.transform.name == TransformName.IDENTITY
30
+ for key in sort_keys
31
+ ]
32
+ ), f"Sort key transforms are not supported: {sort_keys}"
26
33
  assert len(sort_key_names) == len(
27
34
  set(sort_key_names)
28
35
  ), f"Sort key names must be unique: {sort_key_names}"
@@ -27,9 +27,8 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
27
27
  from deltacat.storage import (
28
28
  Delta,
29
29
  DeltaLocator,
30
- Manifest,
31
- Partition,
32
30
  )
31
+ from deltacat.storage.model.manifest import Manifest
33
32
  from deltacat.compute.compactor.model.compact_partition_params import (
34
33
  CompactPartitionParams,
35
34
  )
@@ -139,7 +138,7 @@ def _execute_compaction(
139
138
  logger.info("No input deltas found to compact.")
140
139
  return ExecutionCompactionResult(None, None, None, False)
141
140
  build_uniform_deltas_result: tuple[
142
- List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition
141
+ List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
143
142
  ] = _build_uniform_deltas(
144
143
  params, compaction_audit, input_deltas, delta_discovery_start
145
144
  )
@@ -1,5 +1,3 @@
1
- from deltacat.utils.common import env_bool, env_integer, env_string
2
-
3
1
  TOTAL_BYTES_IN_SHA1_HASH = 20
4
2
 
5
3
  PK_DELIMITER = "L6kl7u5f"
@@ -33,9 +31,7 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
33
31
  # The total size of records that will be hash bucketed at once
34
32
  # Since, sorting is nlogn, we ensure that is not performed
35
33
  # on a very large dataset for best performance.
36
- MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
37
- "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
38
- )
34
+ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
39
35
 
40
36
  # Whether to drop duplicates during merge.
41
37
  DROP_DUPLICATES = True
@@ -82,28 +78,3 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
82
78
  # Number of rounds to run hash/merge for a single
83
79
  # partition. (For large table support)
84
80
  DEFAULT_NUM_ROUNDS = 1
85
-
86
- # Whether to perform sha1 hashing when required to
87
- # optimize memory. For example, hashing is always
88
- # required for bucketing where it's not mandatory
89
- # when dropping duplicates. Setting this to True
90
- # will disable sha1 hashing in cases where it isn't
91
- # mandatory. This flag is False by default.
92
- SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
93
- "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
94
- )
95
-
96
- # This env variable specifies whether to check bucketing spec
97
- # compliance of the existing compacted table.
98
- # PRINT_LOG: Enable logging if any partition is found
99
- # to be non-compliant with the bucketing spec.
100
- # ASSERT: Fail the job with ValidationError if the
101
- # current compacted partition is found to be non-compliant
102
- # with bucketing spec. Note, logging is implicitly enabled
103
- # in this case.
104
- BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
105
- "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
106
- )
107
-
108
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
109
- BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
@@ -49,7 +49,7 @@ def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]
49
49
  ] = [
50
50
  (is_delete, list(delete_delta_group))
51
51
  for (is_delete, _), delete_delta_group in itertools.groupby(
52
- input_deltas, lambda d: (d.type is DeltaType.DELETE, d.delete_parameters)
52
+ input_deltas, lambda d: (d.type is DeltaType.DELETE, d.meta.entry_params)
53
53
  )
54
54
  ]
55
55
  for (
@@ -89,11 +89,11 @@ def _get_delete_file_envelopes(
89
89
  consecutive_delete_tables: List[pa.Table] = []
90
90
  for delete_delta in delete_delta_sequence:
91
91
  assert (
92
- delete_delta.delete_parameters is not None
92
+ delete_delta.meta.entry_params is not None
93
93
  ), "Delete type deltas are required to have delete parameters defined"
94
94
  delete_columns: Optional[
95
95
  List[str]
96
- ] = delete_delta.delete_parameters.equality_column_names
96
+ ] = delete_delta.meta.entry_params.equality_field_locators
97
97
  assert len(delete_columns) > 0, "At least 1 delete column is required"
98
98
  # delete columns should exist in underlying table
99
99
  delete_dataset = params.deltacat_storage.download_delta(
@@ -43,7 +43,7 @@ class MergeInput(Dict):
43
43
  round_completion_info: Optional[RoundCompletionInfo] = None,
44
44
  object_store: Optional[IObjectStore] = None,
45
45
  delete_strategy: Optional[DeleteStrategy] = None,
46
- delete_file_envelopes: Optional[List] = None,
46
+ delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
47
47
  deltacat_storage=unimplemented_deltacat_storage,
48
48
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
49
49
  memory_logs_enabled: Optional[bool] = None,
@@ -63,7 +63,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
63
63
  from deltacat.compute.compactor_v2.steps import hash_bucket as hb
64
64
  from deltacat.compute.compactor_v2.utils import io
65
65
 
66
- from typing import List, Optional
66
+ from typing import List, Optional, Union
67
67
  from collections import defaultdict
68
68
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
69
69
  CompactionSessionAuditInfo,
@@ -83,7 +83,7 @@ def _fetch_compaction_metadata(
83
83
 
84
84
  # read the results from any previously completed compaction round
85
85
  round_completion_info: Optional[RoundCompletionInfo] = None
86
- high_watermark: Optional[HighWatermark] = None
86
+ high_watermark: Optional[Union[HighWatermark, int]] = None
87
87
  previous_compacted_delta_manifest: Optional[Manifest] = None
88
88
 
89
89
  if not params.rebase_source_partition_locator:
@@ -129,7 +129,7 @@ def _build_uniform_deltas(
129
129
  mutable_compaction_audit: CompactionSessionAuditInfo,
130
130
  input_deltas: List[Delta],
131
131
  delta_discovery_start: float,
132
- ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:
132
+ ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]]:
133
133
 
134
134
  delete_strategy: Optional[DeleteStrategy] = None
135
135
  delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
@@ -222,7 +222,7 @@ def _run_hash_and_merge(
222
222
  uniform_deltas: List[DeltaAnnotated],
223
223
  round_completion_info: RoundCompletionInfo,
224
224
  delete_strategy: Optional[DeleteStrategy],
225
- delete_file_envelopes: Optional[DeleteFileEnvelope],
225
+ delete_file_envelopes: Optional[List[DeleteFileEnvelope]],
226
226
  mutable_compaction_audit: CompactionSessionAuditInfo,
227
227
  previous_compacted_delta_manifest: Optional[Manifest],
228
228
  compacted_partition: Partition,
@@ -389,7 +389,7 @@ def _merge(
389
389
  all_hash_group_idx_to_obj_id: dict,
390
390
  compacted_partition: Partition,
391
391
  delete_strategy: DeleteStrategy,
392
- delete_file_envelopes: DeleteFileEnvelope,
392
+ delete_file_envelopes: List[DeleteFileEnvelope],
393
393
  ) -> tuple[List[MergeResult], float]:
394
394
  merge_options_provider = functools.partial(
395
395
  task_resource_options_provider,
@@ -7,7 +7,6 @@ import ray
7
7
  import itertools
8
8
  import time
9
9
  import pyarrow.compute as pc
10
- from deltacat.utils.pyarrow import MAX_INT_BYTES
11
10
  import deltacat.compute.compactor_v2.utils.merge as merge_utils
12
11
  from uuid import uuid4
13
12
  from deltacat import logs
@@ -32,25 +31,21 @@ from deltacat.utils.resources import (
32
31
  )
33
32
  from deltacat.compute.compactor_v2.utils.primary_key_index import (
34
33
  generate_pk_hash_column,
35
- pk_digest_to_hash_bucket_index,
36
34
  )
37
35
  from deltacat.storage import (
38
36
  Delta,
39
37
  DeltaLocator,
40
38
  DeltaType,
41
- Manifest,
42
39
  Partition,
43
40
  interface as unimplemented_deltacat_storage,
44
41
  )
42
+ from deltacat.storage.model.manifest import Manifest
45
43
  from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
46
44
  from deltacat.constants import BYTES_PER_GIBIBYTE
47
45
  from deltacat.compute.compactor_v2.constants import (
48
46
  MERGE_TIME_IN_SECONDS,
49
47
  MERGE_SUCCESS_COUNT,
50
48
  MERGE_FAILURE_COUNT,
51
- BUCKETING_SPEC_COMPLIANCE_PROFILE,
52
- BUCKETING_SPEC_COMPLIANCE_ASSERT,
53
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
54
49
  )
55
50
  from deltacat.exceptions import (
56
51
  categorize_errors,
@@ -152,32 +147,10 @@ def _merge_tables(
152
147
  if compacted_table:
153
148
  compacted_table = all_tables[0]
154
149
 
155
- compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
156
- incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
157
-
158
- logger.info(
159
- f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
160
- f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
161
- )
162
-
163
- if (
164
- compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
165
- or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
166
- ):
167
- logger.info("Casting compacted and incremental pk hash to large_string...")
168
- # is_in combines the chunks of the chunked array passed which can cause
169
- # ArrowCapacityError if the total size of string array is over 2GB.
170
- # Using a large_string would resolve this issue.
171
- # The cast here should be zero-copy in most cases.
172
- compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
173
- incremental_pk_hash_str = pc.cast(
174
- incremental_pk_hash_str, pa.large_string()
175
- )
176
-
177
150
  records_to_keep = pc.invert(
178
151
  pc.is_in(
179
- compacted_pk_hash_str,
180
- incremental_pk_hash_str,
152
+ compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
153
+ incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
181
154
  )
182
155
  )
183
156
 
@@ -192,34 +165,9 @@ def _merge_tables(
192
165
  return final_table
193
166
 
194
167
 
195
- def _validate_bucketing_spec_compliance(
196
- table: pa.Table, rcf: RoundCompletionInfo, hb_index: int, primary_keys: List[str]
197
- ) -> None:
198
- pki_table = generate_pk_hash_column(
199
- [table], primary_keys=primary_keys, requires_hash=True
200
- )[0]
201
- for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
202
- hash_bucket = pk_digest_to_hash_bucket_index(hash_value, rcf.hash_bucket_count)
203
- if hash_bucket != hb_index:
204
- logger.info(
205
- f"{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}"
206
- f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
207
- f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
208
- f"Expected hash bucket is {hb_index} but found {hash_bucket}."
209
- )
210
- if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
211
- raise AssertionError(
212
- "Hash bucket drift detected. Expected hash bucket index"
213
- f" to be {hb_index} but found {hash_bucket}"
214
- )
215
- # No further checks necessary
216
- break
217
-
218
-
219
168
  def _download_compacted_table(
220
169
  hb_index: int,
221
170
  rcf: RoundCompletionInfo,
222
- primary_keys: List[str],
223
171
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
224
172
  deltacat_storage=unimplemented_deltacat_storage,
225
173
  deltacat_storage_kwargs: Optional[dict] = None,
@@ -243,23 +191,7 @@ def _download_compacted_table(
243
191
 
244
192
  tables.append(table)
245
193
 
246
- compacted_table = pa.concat_tables(tables)
247
- check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
248
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
249
- BUCKETING_SPEC_COMPLIANCE_ASSERT,
250
- ]
251
-
252
- logger.debug(
253
- f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
254
- f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
255
- )
256
-
257
- # Bucketing spec compliance isn't required without primary keys
258
- if primary_keys and check_bucketing_spec:
259
- _validate_bucketing_spec_compliance(
260
- compacted_table, rcf, hb_index, primary_keys
261
- )
262
- return compacted_table
194
+ return pa.concat_tables(tables)
263
195
 
264
196
 
265
197
  def _copy_all_manifest_files_from_old_hash_buckets(
@@ -518,7 +450,9 @@ def _apply_upserts(
518
450
  # on non event based sort key does not produce consistent
519
451
  # compaction results. E.g., compaction(delta1, delta2, delta3)
520
452
  # will not be equal to compaction(compaction(delta1, delta2), delta3).
521
- table = table.sort_by(input.sort_keys)
453
+ table = table.sort_by(
454
+ [pa_key for key in input.sort_keys for pa_key in key.arrow]
455
+ )
522
456
  hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
523
457
  table, merge_time = timed_invocation(
524
458
  func=_merge_tables,
@@ -560,11 +494,9 @@ def _copy_manifests_from_hash_bucketing(
560
494
  def _timed_merge(input: MergeInput) -> MergeResult:
561
495
  task_id = get_current_ray_task_id()
562
496
  worker_id = get_current_ray_worker_id()
563
- with (
564
- memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
565
- if input.enable_profiler
566
- else nullcontext()
567
- ):
497
+ with memray.Tracker(
498
+ f"merge_{worker_id}_{task_id}.bin"
499
+ ) if input.enable_profiler else nullcontext():
568
500
  total_input_records, total_deduped_records = 0, 0
569
501
  total_dropped_records = 0
570
502
  materialized_results: List[MaterializeResult] = []
@@ -588,7 +520,6 @@ def _timed_merge(input: MergeInput) -> MergeResult:
588
520
  compacted_table = _download_compacted_table(
589
521
  hb_index=merge_file_group.hb_index,
590
522
  rcf=input.round_completion_info,
591
- primary_keys=input.primary_keys,
592
523
  read_kwargs_provider=input.read_kwargs_provider,
593
524
  deltacat_storage=input.deltacat_storage,
594
525
  deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -673,5 +604,5 @@ def merge(input: MergeInput) -> MergeResult:
673
604
  merge_result[3],
674
605
  merge_result[4],
675
606
  np.double(emit_metrics_time),
676
- merge_result[6],
607
+ merge_result[4],
677
608
  )
@@ -5,7 +5,6 @@ from deltacat.compute.compactor_v2.constants import (
5
5
  TASK_MAX_PARALLELISM,
6
6
  MAX_PARQUET_METADATA_SIZE,
7
7
  )
8
- from deltacat.utils.common import ReadKwargsProvider
9
8
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
10
9
  from deltacat import logs
11
10
  from deltacat.storage import (
@@ -76,21 +75,11 @@ def _download_parquet_metadata_for_manifest_entry(
76
75
  entry_index: int,
77
76
  deltacat_storage: unimplemented_deltacat_storage,
78
77
  deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
79
- file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
80
78
  ) -> Dict[str, Any]:
81
- logger.info(
82
- f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
83
- )
84
- if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
85
- logger.info(
86
- "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
87
- )
88
- deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
89
79
  pq_file = deltacat_storage.download_delta_manifest_entry(
90
80
  delta,
91
81
  entry_index=entry_index,
92
82
  table_type=TableType.PYARROW_PARQUET,
93
- file_reader_kwargs_provider=file_reader_kwargs_provider,
94
83
  **deltacat_storage_kwargs,
95
84
  )
96
85
 
@@ -108,15 +97,11 @@ def append_content_type_params(
108
97
  max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
109
98
  deltacat_storage=unimplemented_deltacat_storage,
110
99
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
111
- file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
112
100
  ) -> bool:
113
101
  """
114
102
  This operation appends content type params into the delta entry. Note
115
103
  that this operation can be time consuming, hence we cache it in a Ray actor.
116
104
  """
117
- logger.info(
118
- f"Appending the content type params for Delta with locator {delta.locator}..."
119
- )
120
105
 
121
106
  if not delta.meta:
122
107
  logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -174,7 +159,6 @@ def append_content_type_params(
174
159
 
175
160
  def input_provider(index, item) -> Dict:
176
161
  return {
177
- "file_reader_kwargs_provider": file_reader_kwargs_provider,
178
162
  "deltacat_storage_kwargs": deltacat_storage_kwargs,
179
163
  "deltacat_storage": deltacat_storage,
180
164
  "delta": delta,
@@ -184,7 +168,6 @@ def append_content_type_params(
184
168
  logger.info(
185
169
  f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
186
170
  )
187
-
188
171
  pq_files_promise = invoke_parallel(
189
172
  entry_indices_to_download,
190
173
  ray_task=_download_parquet_metadata_for_manifest_entry,
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
25
25
  result[index] = np.arange(cl, dtype="int32")
26
26
 
27
27
  chunk_lengths = ([0] + chunk_lengths)[:-1]
28
- result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
28
+ result = pa.chunked_array(result + np.cumsum(chunk_lengths))
29
29
  return result
30
30
 
31
31
 
@@ -101,6 +101,7 @@ def create_uniform_input_deltas(
101
101
  delta_manifest_entries_count = 0
102
102
  estimated_da_bytes = 0
103
103
  input_da_list = []
104
+
104
105
  for delta in input_deltas:
105
106
  if (
106
107
  compact_partition_params.enable_input_split
@@ -117,7 +118,6 @@ def create_uniform_input_deltas(
117
118
  deltacat_storage_kwargs=deltacat_storage_kwargs,
118
119
  task_max_parallelism=compact_partition_params.task_max_parallelism,
119
120
  max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
120
- file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
121
121
  )
122
122
 
123
123
  manifest_entries = delta.manifest.entries
@@ -10,7 +10,6 @@ from deltacat.compute.compactor_v2.constants import (
10
10
  TOTAL_BYTES_IN_SHA1_HASH,
11
11
  PK_DELIMITER,
12
12
  MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
13
- SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
14
13
  )
15
14
  import time
16
15
  from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -49,13 +48,6 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
49
48
  f"Found total length of hash column={total_len} and total_size={total_size}"
50
49
  )
51
50
 
52
- if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
53
- logger.info(
54
- f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
55
- f"Returning False for is_sha1_desired"
56
- )
57
- return False
58
-
59
51
  return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
60
52
 
61
53
 
@@ -116,10 +108,9 @@ def _optimized_group_record_batches_by_hash_bucket(
116
108
  record_batches = []
117
109
  result_len = 0
118
110
  for record_batch in table_batches:
119
- if (
120
- record_batches
121
- and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
122
- ):
111
+ current_bytes += record_batch.nbytes
112
+ record_batches.append(record_batch)
113
+ if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
123
114
  logger.info(
124
115
  f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
125
116
  f"is {len(record_batches)} and size {current_bytes}"
@@ -137,9 +128,6 @@ def _optimized_group_record_batches_by_hash_bucket(
137
128
  current_bytes = 0
138
129
  record_batches.clear()
139
130
 
140
- current_bytes += record_batch.nbytes
141
- record_batches.append(record_batch)
142
-
143
131
  if record_batches:
144
132
  appended_len, append_latency = timed_invocation(
145
133
  _append_table_by_hash_bucket,
@@ -1,16 +1,11 @@
1
1
  import logging
2
2
  from typing import Dict, Optional, List, Tuple, Any
3
3
  from deltacat import logs
4
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
5
- from deltacat.compute.compactor_v2.constants import (
6
- AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
7
- )
8
4
  from deltacat.compute.compactor_v2.model.merge_file_group import (
9
5
  LocalMergeFileGroupsProvider,
10
6
  )
11
7
  from deltacat.storage import (
12
8
  Manifest,
13
- ManifestEntry,
14
9
  interface as unimplemented_deltacat_storage,
15
10
  )
16
11
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -86,27 +81,16 @@ def _get_merge_task_options(
86
81
  and compacted_delta_manifest
87
82
  and round_completion_info.hb_index_to_entry_range
88
83
  ):
89
- logger.debug_conditional(
90
- f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
91
- memory_logs_enabled,
92
- )
93
- previous_inflation: float = (
94
- (
95
- round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
96
- / round_completion_info.compacted_pyarrow_write_result.file_bytes
97
- )
98
- if round_completion_info.compacted_pyarrow_write_result.file_bytes
99
- else PYARROW_INFLATION_MULTIPLIER
84
+
85
+ previous_inflation = (
86
+ round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
87
+ / round_completion_info.compacted_pyarrow_write_result.file_bytes
100
88
  )
101
89
  debug_memory_params["previous_inflation"] = previous_inflation
102
90
 
103
- average_record_size: float = (
104
- (
105
- round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
106
- / round_completion_info.compacted_pyarrow_write_result.records
107
- )
108
- if round_completion_info.compacted_pyarrow_write_result.records
109
- else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
91
+ average_record_size = (
92
+ round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
93
+ / round_completion_info.compacted_pyarrow_write_result.records
110
94
  )
111
95
  debug_memory_params["average_record_size"] = average_record_size
112
96
 
@@ -122,36 +106,31 @@ def _get_merge_task_options(
122
106
  str(hb_idx)
123
107
  ]
124
108
  for entry_index in range(entry_start, entry_end):
125
- entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
126
- current_entry_size: float = (
127
- estimate_manifest_entry_size_bytes(
128
- entry=entry,
129
- operation_type=OperationType.PYARROW_DOWNLOAD,
130
- estimate_resources_params=estimate_resources_params,
131
- )
132
- or 0.0
109
+ entry = compacted_delta_manifest.entries[entry_index]
110
+
111
+ current_entry_size = estimate_manifest_entry_size_bytes(
112
+ entry=entry,
113
+ operation_type=OperationType.PYARROW_DOWNLOAD,
114
+ estimate_resources_params=estimate_resources_params,
133
115
  )
134
- current_entry_rows: int = (
135
- estimate_manifest_entry_num_rows(
136
- entry=entry,
137
- operation_type=OperationType.PYARROW_DOWNLOAD,
138
- estimate_resources_params=estimate_resources_params,
139
- )
140
- or 0
116
+ current_entry_rows = estimate_manifest_entry_num_rows(
117
+ entry=entry,
118
+ operation_type=OperationType.PYARROW_DOWNLOAD,
119
+ estimate_resources_params=estimate_resources_params,
141
120
  )
142
- # NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
121
+
143
122
  data_size += current_entry_size
144
123
  num_rows += current_entry_rows
124
+
145
125
  if primary_keys:
146
- pk_size: Optional[
147
- float
148
- ] = estimate_manifest_entry_column_size_bytes(
126
+ pk_size = estimate_manifest_entry_column_size_bytes(
149
127
  entry=entry,
150
128
  columns=primary_keys,
151
129
  operation_type=OperationType.PYARROW_DOWNLOAD,
152
130
  estimate_resources_params=estimate_resources_params,
153
131
  )
154
- if not pk_size:
132
+
133
+ if pk_size is None:
155
134
  pk_size_bytes += current_entry_size
156
135
  else:
157
136
  pk_size_bytes += pk_size
@@ -180,6 +159,7 @@ def _get_merge_task_options(
180
159
  f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
181
160
  memory_logs_enabled,
182
161
  )
162
+
183
163
  return _get_task_options(0.01, total_memory, ray_custom_resources)
184
164
 
185
165
 
@@ -0,0 +1,4 @@
1
+ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
2
+
3
+ # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
4
+ DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30