deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their respective public registries, and is provided for informational purposes only.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -201,7 +201,7 @@ def _timed_hash_bucket(
  with memray.Tracker(
  f"hash_bucket_{worker_id}_{task_id}.bin"
  ) if enable_profiler else nullcontext():
- sort_key_names = [key.key_name for key in sort_keys]
+ sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
  if not round_completion_info:
  is_src_delta = True
  else:
@@ -25,9 +25,10 @@ from deltacat.storage import (
  DeltaType,
  Partition,
  PartitionLocator,
- Manifest,
  ManifestEntry,
+ ManifestEntryList,
  )
+ from deltacat.storage.model.manifest import Manifest
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -82,7 +83,10 @@ def materialize(
  assert (
  delta_type == DeltaType.UPSERT
  ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
- manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
+ manifest = Manifest.of(
+ entries=ManifestEntryList.of(manifest_entry_list_reference),
+ uuid=str(uuid4()),
+ )
  delta = Delta.of(
  locator=DeltaLocator.of(partition.locator),
  delta_type=delta_type,
@@ -358,7 +358,7 @@ def fit_input_deltas(
  def _discover_deltas(
  source_partition_locator: PartitionLocator,
  start_position_exclusive: Optional[int],
- end_position_inclusive: int,
+ end_position_inclusive: Optional[int],
  deltacat_storage=unimplemented_deltacat_storage,
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
@@ -1,6 +1,7 @@
  import pyarrow as pa
  from typing import List
- from deltacat.storage import PartitionLocator, SortKey
+ from itertools import chain
+ from deltacat.storage import PartitionLocator, SortKey, TransformName

  MAX_SORT_KEYS_BIT_WIDTH = 256

@@ -22,7 +23,13 @@ def validate_sort_keys(
  deltacat_storage_kwargs = {}
  total_sort_keys_bit_width = 0
  if sort_keys:
- sort_key_names = [key.key_name for key in sort_keys]
+ sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
+ assert all(
+ [
+ key.transform is None or key.transform.name == TransformName.IDENTITY
+ for key in sort_keys
+ ]
+ ), f"Sort key transforms are not supported: {sort_keys}"
  assert len(sort_key_names) == len(
  set(sort_key_names)
  ), f"Sort key names must be unique: {sort_key_names}"
@@ -27,9 +27,8 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
  from deltacat.storage import (
  Delta,
  DeltaLocator,
- Manifest,
- Partition,
  )
+ from deltacat.storage.model.manifest import Manifest
  from deltacat.compute.compactor.model.compact_partition_params import (
  CompactPartitionParams,
  )
@@ -69,17 +68,14 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
  assert (
  params.hash_bucket_count is not None and params.hash_bucket_count >= 1
  ), "hash_bucket_count is a required arg for compactor v2"
- assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
  if params.num_rounds > 1:
  assert (
  not params.drop_duplicates
  ), "num_rounds > 1, drop_duplicates must be False but is True"

- with (
- memray.Tracker("compaction_partition.bin")
- if params.enable_profiler
- else nullcontext()
- ):
+ with memray.Tracker(
+ "compaction_partition.bin"
+ ) if params.enable_profiler else nullcontext():
  execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
  params,
  **kwargs,
@@ -142,7 +138,7 @@ def _execute_compaction(
  logger.info("No input deltas found to compact.")
  return ExecutionCompactionResult(None, None, None, False)
  build_uniform_deltas_result: tuple[
- List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition
+ List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
  ] = _build_uniform_deltas(
  params, compaction_audit, input_deltas, delta_discovery_start
  )
@@ -1,5 +1,3 @@
- from deltacat.utils.common import env_bool, env_integer, env_string
-
  TOTAL_BYTES_IN_SHA1_HASH = 20

  PK_DELIMITER = "L6kl7u5f"
@@ -33,9 +31,7 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
  # The total size of records that will be hash bucketed at once
  # Since, sorting is nlogn, we ensure that is not performed
  # on a very large dataset for best performance.
- MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
- "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
- )
+ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024

  # Whether to drop duplicates during merge.
  DROP_DUPLICATES = True
@@ -82,28 +78,3 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
  # Number of rounds to run hash/merge for a single
  # partition. (For large table support)
  DEFAULT_NUM_ROUNDS = 1
-
- # Whether to perform sha1 hashing when required to
- # optimize memory. For example, hashing is always
- # required for bucketing where it's not mandatory
- # when dropping duplicates. Setting this to True
- # will disable sha1 hashing in cases where it isn't
- # mandatory. This flag is False by default.
- SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
- "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
- )
-
- # This env variable specifies whether to check bucketing spec
- # compliance of the existing compacted table.
- # PRINT_LOG: Enable logging if any partition is found
- # to be non-compliant with the bucketing spec.
- # ASSERT: Fail the job with ValidationError if the
- # current compacted partition is found to be non-compliant
- # with bucketing spec. Note, logging is implicitly enabled
- # in this case.
- BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
- "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
- )
-
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
- BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
@@ -49,7 +49,7 @@ def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]
  ] = [
  (is_delete, list(delete_delta_group))
  for (is_delete, _), delete_delta_group in itertools.groupby(
- input_deltas, lambda d: (d.type is DeltaType.DELETE, d.delete_parameters)
+ input_deltas, lambda d: (d.type is DeltaType.DELETE, d.meta.entry_params)
  )
  ]
  for (
@@ -89,11 +89,11 @@ def _get_delete_file_envelopes(
  consecutive_delete_tables: List[pa.Table] = []
  for delete_delta in delete_delta_sequence:
  assert (
- delete_delta.delete_parameters is not None
+ delete_delta.meta.entry_params is not None
  ), "Delete type deltas are required to have delete parameters defined"
  delete_columns: Optional[
  List[str]
- ] = delete_delta.delete_parameters.equality_column_names
+ ] = delete_delta.meta.entry_params.equality_field_locators
  assert len(delete_columns) > 0, "At least 1 delete column is required"
  # delete columns should exist in underlying table
  delete_dataset = params.deltacat_storage.download_delta(
@@ -43,12 +43,11 @@ class MergeInput(Dict):
  round_completion_info: Optional[RoundCompletionInfo] = None,
  object_store: Optional[IObjectStore] = None,
  delete_strategy: Optional[DeleteStrategy] = None,
- delete_file_envelopes: Optional[List] = None,
+ delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
  deltacat_storage=unimplemented_deltacat_storage,
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
  memory_logs_enabled: Optional[bool] = None,
  disable_copy_by_reference: Optional[bool] = None,
- hash_bucket_count: Optional[int] = None,
  ) -> MergeInput:

  result = MergeInput()
@@ -72,7 +71,6 @@ class MergeInput(Dict):
  result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
  result["memory_logs_enabled"] = memory_logs_enabled
  result["disable_copy_by_reference"] = disable_copy_by_reference
- result["hash_bucket_count"] = hash_bucket_count
  return result

  @property
@@ -156,7 +154,3 @@ class MergeInput(Dict):
  @property
  def disable_copy_by_reference(self) -> bool:
  return self["disable_copy_by_reference"]
-
- @property
- def hash_bucket_count(self) -> int:
- return self["hash_bucket_count"]
@@ -63,7 +63,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
  from deltacat.compute.compactor_v2.steps import hash_bucket as hb
  from deltacat.compute.compactor_v2.utils import io

- from typing import List, Optional
+ from typing import List, Optional, Union
  from collections import defaultdict
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
  CompactionSessionAuditInfo,
@@ -83,7 +83,7 @@ def _fetch_compaction_metadata(

  # read the results from any previously completed compaction round
  round_completion_info: Optional[RoundCompletionInfo] = None
- high_watermark: Optional[HighWatermark] = None
+ high_watermark: Optional[Union[HighWatermark, int]] = None
  previous_compacted_delta_manifest: Optional[Manifest] = None

  if not params.rebase_source_partition_locator:
@@ -129,7 +129,7 @@ def _build_uniform_deltas(
  mutable_compaction_audit: CompactionSessionAuditInfo,
  input_deltas: List[Delta],
  delta_discovery_start: float,
- ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:
+ ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]]:

  delete_strategy: Optional[DeleteStrategy] = None
  delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
@@ -222,7 +222,7 @@ def _run_hash_and_merge(
  uniform_deltas: List[DeltaAnnotated],
  round_completion_info: RoundCompletionInfo,
  delete_strategy: Optional[DeleteStrategy],
- delete_file_envelopes: Optional[DeleteFileEnvelope],
+ delete_file_envelopes: Optional[List[DeleteFileEnvelope]],
  mutable_compaction_audit: CompactionSessionAuditInfo,
  previous_compacted_delta_manifest: Optional[Manifest],
  compacted_partition: Partition,
@@ -389,7 +389,7 @@ def _merge(
  all_hash_group_idx_to_obj_id: dict,
  compacted_partition: Partition,
  delete_strategy: DeleteStrategy,
- delete_file_envelopes: DeleteFileEnvelope,
+ delete_file_envelopes: List[DeleteFileEnvelope],
  ) -> tuple[List[MergeResult], float]:
  merge_options_provider = functools.partial(
  task_resource_options_provider,
@@ -438,7 +438,6 @@ def _merge(
  delete_file_envelopes=delete_file_envelopes,
  memory_logs_enabled=params.memory_logs_enabled,
  disable_copy_by_reference=params.disable_copy_by_reference,
- hash_bucket_count=params.hash_bucket_count,
  )
  }

@@ -7,7 +7,6 @@ import ray
  import itertools
  import time
  import pyarrow.compute as pc
- from deltacat.utils.pyarrow import MAX_INT_BYTES
  import deltacat.compute.compactor_v2.utils.merge as merge_utils
  from uuid import uuid4
  from deltacat import logs
@@ -32,25 +31,21 @@ from deltacat.utils.resources import (
  )
  from deltacat.compute.compactor_v2.utils.primary_key_index import (
  generate_pk_hash_column,
- pk_digest_to_hash_bucket_index,
  )
  from deltacat.storage import (
  Delta,
  DeltaLocator,
  DeltaType,
- Manifest,
  Partition,
  interface as unimplemented_deltacat_storage,
  )
+ from deltacat.storage.model.manifest import Manifest
  from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
  from deltacat.constants import BYTES_PER_GIBIBYTE
  from deltacat.compute.compactor_v2.constants import (
  MERGE_TIME_IN_SECONDS,
  MERGE_SUCCESS_COUNT,
  MERGE_FAILURE_COUNT,
- BUCKETING_SPEC_COMPLIANCE_PROFILE,
- BUCKETING_SPEC_COMPLIANCE_ASSERT,
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
  )
  from deltacat.exceptions import (
  categorize_errors,
@@ -62,10 +57,6 @@ if importlib.util.find_spec("memray"):
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


- _EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
- _INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
-
-
  def _append_delta_type_column(table: pa.Table, value: np.bool_):
  return table.append_column(
  sc._DELTA_TYPE_COLUMN_FIELD,
@@ -116,8 +107,6 @@ def _merge_tables(
  table: pa.Table,
  primary_keys: List[str],
  can_drop_duplicates: bool,
- hb_index: int,
- num_buckets: int,
  compacted_table: Optional[pa.Table] = None,
  ) -> pa.Table:
  """
@@ -136,20 +125,6 @@ def _merge_tables(

  all_tables.append(table)

- check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
- BUCKETING_SPEC_COMPLIANCE_ASSERT,
- ]
-
- if primary_keys and check_bucketing_spec:
- _validate_bucketing_spec_compliance(
- table=all_tables[incremental_idx],
- num_buckets=num_buckets,
- primary_keys=primary_keys,
- hb_index=hb_index,
- log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
- )
-
  if not primary_keys or not can_drop_duplicates:
  logger.info(
  f"Not dropping duplicates for primary keys={primary_keys} "
@@ -172,32 +147,10 @@ def _merge_tables(
  if compacted_table:
  compacted_table = all_tables[0]

- compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
- incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
-
- logger.info(
- f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
- f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
- )
-
- if (
- compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
- or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
- ):
- logger.info("Casting compacted and incremental pk hash to large_string...")
- # is_in combines the chunks of the chunked array passed which can cause
- # ArrowCapacityError if the total size of string array is over 2GB.
- # Using a large_string would resolve this issue.
- # The cast here should be zero-copy in most cases.
- compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
- incremental_pk_hash_str = pc.cast(
- incremental_pk_hash_str, pa.large_string()
- )
-
  records_to_keep = pc.invert(
  pc.is_in(
- compacted_pk_hash_str,
- incremental_pk_hash_str,
+ compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
+ incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
  )
  )

@@ -212,47 +165,9 @@ def _merge_tables(
  return final_table


- def _validate_bucketing_spec_compliance(
- table: pa.Table,
- num_buckets: int,
- hb_index: int,
- primary_keys: List[str],
- rcf: RoundCompletionInfo = None,
- log_prefix=None,
- ) -> None:
- if rcf is not None:
- message_prefix = f"{log_prefix}{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}.{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}.{rcf.compacted_delta_locator.partition_values}"
- else:
- message_prefix = f"{log_prefix}"
- pki_table = generate_pk_hash_column(
- [table], primary_keys=primary_keys, requires_hash=True
- )[0]
- is_not_compliant: bool = False
- for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
- hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
- if hash_bucket != hb_index:
- is_not_compliant = True
- logger.info(
- f"{message_prefix} has non-compliant bucketing spec at index: {index} "
- f"Expected hash bucket is {hb_index} but found {hash_bucket}."
- )
- if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
- raise AssertionError(
- f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
- f" to be {hb_index} but found {hash_bucket}"
- )
- # No further checks necessary
- break
- if not is_not_compliant:
- logger.debug(
- f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
- )
-
-
  def _download_compacted_table(
  hb_index: int,
  rcf: RoundCompletionInfo,
- primary_keys: List[str],
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
  deltacat_storage=unimplemented_deltacat_storage,
  deltacat_storage_kwargs: Optional[dict] = None,
@@ -276,28 +191,7 @@ def _download_compacted_table(

  tables.append(table)

- compacted_table = pa.concat_tables(tables)
- check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
- BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
- BUCKETING_SPEC_COMPLIANCE_ASSERT,
- ]
-
- logger.debug(
- f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
- f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
- )
-
- # Bucketing spec compliance isn't required without primary keys
- if primary_keys and check_bucketing_spec:
- _validate_bucketing_spec_compliance(
- compacted_table,
- rcf.hash_bucket_count,
- hb_index,
- primary_keys,
- rcf=rcf,
- log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
- )
- return compacted_table
+ return pa.concat_tables(tables)


  def _copy_all_manifest_files_from_old_hash_buckets(
@@ -500,12 +394,12 @@ def _compact_tables(
  _group_sequence_by_delta_type(reordered_all_dfes)
  ):
  if delta_type is DeltaType.UPSERT:
- (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
- input=input,
- dfe_list=delta_type_sequence,
- hb_idx=hb_idx,
- prev_table=table,
- )
+ (
+ table,
+ incremental_len,
+ deduped_records,
+ merge_time,
+ ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
  logger.info(
  f" [Merge task index {input.merge_task_index}] Merged"
  f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -556,7 +450,9 @@ def _apply_upserts(
  # on non event based sort key does not produce consistent
  # compaction results. E.g., compaction(delta1, delta2, delta3)
  # will not be equal to compaction(compaction(delta1, delta2), delta3).
- table = table.sort_by(input.sort_keys)
+ table = table.sort_by(
+ [pa_key for key in input.sort_keys for pa_key in key.arrow]
+ )
  hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
  table, merge_time = timed_invocation(
  func=_merge_tables,
@@ -564,8 +460,6 @@ def _apply_upserts(
  primary_keys=input.primary_keys,
  can_drop_duplicates=input.drop_duplicates,
  compacted_table=prev_table,
- hb_index=hb_idx,
- num_buckets=input.hash_bucket_count,
  )
  deduped_records = hb_table_record_count - len(table)
  return table, incremental_len, deduped_records, merge_time
@@ -600,11 +494,9 @@ def _copy_manifests_from_hash_bucketing(
  def _timed_merge(input: MergeInput) -> MergeResult:
  task_id = get_current_ray_task_id()
  worker_id = get_current_ray_worker_id()
- with (
- memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
- if input.enable_profiler
- else nullcontext()
- ):
+ with memray.Tracker(
+ f"merge_{worker_id}_{task_id}.bin"
+ ) if input.enable_profiler else nullcontext():
  total_input_records, total_deduped_records = 0, 0
  total_dropped_records = 0
  materialized_results: List[MaterializeResult] = []
@@ -628,7 +520,6 @@ def _timed_merge(input: MergeInput) -> MergeResult:
  compacted_table = _download_compacted_table(
  hb_index=merge_file_group.hb_index,
  rcf=input.round_completion_info,
- primary_keys=input.primary_keys,
  read_kwargs_provider=input.read_kwargs_provider,
  deltacat_storage=input.deltacat_storage,
  deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -713,5 +604,5 @@ def merge(input: MergeInput) -> MergeResult:
  merge_result[3],
  merge_result[4],
  np.double(emit_metrics_time),
- merge_result[6],
+ merge_result[4],
  )
@@ -5,7 +5,6 @@ from deltacat.compute.compactor_v2.constants import (
  TASK_MAX_PARALLELISM,
  MAX_PARQUET_METADATA_SIZE,
  )
- from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
  from deltacat import logs
  from deltacat.storage import (
@@ -76,21 +75,11 @@ def _download_parquet_metadata_for_manifest_entry(
  entry_index: int,
  deltacat_storage: unimplemented_deltacat_storage,
  deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
- file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
  ) -> Dict[str, Any]:
- logger.info(
- f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
- )
- if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
- logger.info(
- "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
- )
- deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
  pq_file = deltacat_storage.download_delta_manifest_entry(
  delta,
  entry_index=entry_index,
  table_type=TableType.PYARROW_PARQUET,
- file_reader_kwargs_provider=file_reader_kwargs_provider,
  **deltacat_storage_kwargs,
  )

@@ -108,15 +97,11 @@ def append_content_type_params(
  max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
  deltacat_storage=unimplemented_deltacat_storage,
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
- file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
  ) -> bool:
  """
  This operation appends content type params into the delta entry. Note
  that this operation can be time consuming, hence we cache it in a Ray actor.
  """
- logger.info(
- f"Appending the content type params for Delta with locator {delta.locator}..."
- )

  if not delta.meta:
  logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -174,7 +159,6 @@ def append_content_type_params(

  def input_provider(index, item) -> Dict:
  return {
- "file_reader_kwargs_provider": file_reader_kwargs_provider,
  "deltacat_storage_kwargs": deltacat_storage_kwargs,
  "deltacat_storage": deltacat_storage,
  "delta": delta,
@@ -184,7 +168,6 @@ def append_content_type_params(
  logger.info(
  f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
  )
-
  pq_files_promise = invoke_parallel(
  entry_indices_to_download,
  ray_task=_download_parquet_metadata_for_manifest_entry,
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
  result[index] = np.arange(cl, dtype="int32")

  chunk_lengths = ([0] + chunk_lengths)[:-1]
- result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
+ result = pa.chunked_array(result + np.cumsum(chunk_lengths))
  return result


@@ -101,6 +101,7 @@ def create_uniform_input_deltas(
  delta_manifest_entries_count = 0
  estimated_da_bytes = 0
  input_da_list = []
+
  for delta in input_deltas:
  if (
  compact_partition_params.enable_input_split
@@ -117,7 +118,6 @@ def create_uniform_input_deltas(
  deltacat_storage_kwargs=deltacat_storage_kwargs,
  task_max_parallelism=compact_partition_params.task_max_parallelism,
  max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
- file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
  )

  manifest_entries = delta.manifest.entries
@@ -133,5 +133,4 @@ def generate_local_merge_input(
  delete_strategy=delete_strategy,
  delete_file_envelopes=delete_file_envelopes,
  disable_copy_by_reference=params.disable_copy_by_reference,
- hash_bucket_count=params.hash_bucket_count,
  )
@@ -10,7 +10,6 @@ from deltacat.compute.compactor_v2.constants import (
  TOTAL_BYTES_IN_SHA1_HASH,
  PK_DELIMITER,
  MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
- SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
  )
  import time
  from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -49,13 +48,6 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
  f"Found total length of hash column={total_len} and total_size={total_size}"
  )

- if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
- logger.info(
- f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
- f"Returning False for is_sha1_desired"
- )
- return False
-
  return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len


@@ -116,10 +108,9 @@ def _optimized_group_record_batches_by_hash_bucket(
  record_batches = []
  result_len = 0
  for record_batch in table_batches:
- if (
- record_batches
- and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
- ):
+ current_bytes += record_batch.nbytes
+ record_batches.append(record_batch)
+ if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
  logger.info(
  f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
  f"is {len(record_batches)} and size {current_bytes}"
@@ -137,9 +128,6 @@ def _optimized_group_record_batches_by_hash_bucket(
  current_bytes = 0
  record_batches.clear()

- current_bytes += record_batch.nbytes
- record_batches.append(record_batch)
-
  if record_batches:
  appended_len, append_latency = timed_invocation(
  _append_table_by_hash_bucket,