deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,11 @@
1
1
  import unittest
2
2
  from unittest import mock
3
- from deltacat.tests.test_utils.constants import TEST_UPSERT_DELTA
4
- from typing import Any, Dict
5
3
 
6
- DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
7
- "db_file_path",
8
- "deltacat/tests/local_deltacat_storage/db_test.sqlite",
4
+ from deltacat.tests.compute.conftest import (
5
+ create_local_deltacat_storage_file,
6
+ clean_up_local_deltacat_storage_file,
9
7
  )
8
+ from deltacat.tests.test_utils.constants import TEST_UPSERT_DELTA
10
9
 
11
10
 
12
11
  class TestFitInputDeltas(unittest.TestCase):
@@ -19,9 +18,7 @@ class TestFitInputDeltas(unittest.TestCase):
19
18
  CompactionSessionAuditInfo,
20
19
  )
21
20
 
22
- cls.kwargs_for_local_deltacat_storage: Dict[str, Any] = {
23
- DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
24
- }
21
+ cls.kwargs_for_local_deltacat_storage = create_local_deltacat_storage_file()
25
22
 
26
23
  cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "2.3", "test")
27
24
 
@@ -30,6 +27,7 @@ class TestFitInputDeltas(unittest.TestCase):
30
27
  @classmethod
31
28
  def tearDownClass(cls) -> None:
32
29
  cls.module_patcher.stop()
30
+ clean_up_local_deltacat_storage_file(cls.kwargs_for_local_deltacat_storage)
33
31
 
34
32
  def test_sanity(self):
35
33
  from deltacat.compute.compactor.utils import io
@@ -1,14 +1,10 @@
1
- from typing import Dict, Any
2
1
  import ray
3
2
  import os
4
- import pyarrow as pa
5
3
  import pytest
6
4
  import boto3
7
- import json
8
5
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
9
6
  CompactionSessionAuditInfo,
10
7
  )
11
- from deltacat.exceptions import ValidationError
12
8
  from boto3.resources.base import ServiceResource
13
9
  import deltacat.tests.local_deltacat_storage as ds
14
10
  from deltacat.types.media import ContentType
@@ -31,11 +27,6 @@ from deltacat.tests.test_utils.pyarrow import (
31
27
  )
32
28
  from moto import mock_s3
33
29
 
34
- DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
35
- "db_file_path",
36
- "deltacat/tests/local_deltacat_storage/db_test.sqlite",
37
- )
38
-
39
30
 
40
31
  @pytest.fixture(autouse=True, scope="module")
41
32
  def setup_ray_cluster():
@@ -69,38 +60,6 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
69
60
  yield
70
61
 
71
62
 
72
- @pytest.fixture(scope="function")
73
- def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
74
- kwargs_for_local_deltacat_storage: Dict[str, Any] = {
75
- DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
76
- }
77
- yield kwargs_for_local_deltacat_storage
78
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
79
- os.remove(DATABASE_FILE_PATH_VALUE)
80
-
81
-
82
- @pytest.fixture(scope="function")
83
- def disable_sha1(monkeypatch):
84
- import deltacat.compute.compactor_v2.utils.primary_key_index
85
-
86
- monkeypatch.setattr(
87
- deltacat.compute.compactor_v2.utils.primary_key_index,
88
- "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
89
- True,
90
- )
91
-
92
-
93
- @pytest.fixture(scope="function")
94
- def enable_bucketing_spec_validation(monkeypatch):
95
- import deltacat.compute.compactor_v2.steps.merge
96
-
97
- monkeypatch.setattr(
98
- deltacat.compute.compactor_v2.steps.merge,
99
- "BUCKETING_SPEC_COMPLIANCE_PROFILE",
100
- "ASSERT",
101
- )
102
-
103
-
104
63
  class TestCompactionSession:
105
64
  """
106
65
  This class adds specific tests that aren't part of the parametrized test suite.
@@ -581,428 +540,3 @@ class TestCompactionSession:
581
540
  }
582
541
  )
583
542
  )
584
-
585
- def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
586
- self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
587
- ):
588
- """
589
- A test case which ensures the compaction succeeds even if the incremental
590
- arrow table size is over 2GB. It is added to prevent ArrowCapacityError
591
- when running is_in operation during merge.
592
-
593
- Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
594
- which truncates the lengths of pk strings when deduping.
595
- """
596
- # setup
597
- staged_source = stage_partition_from_file_paths(
598
- self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
599
- )
600
- # we create chunked array to avoid ArrowCapacityError
601
- chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
602
- table = pa.table([chunked_pk_array], names=["pk"])
603
- source_delta = commit_delta_to_staged_partition(
604
- staged_source, pa_table=table, **local_deltacat_storage_kwargs
605
- )
606
-
607
- staged_dest = stage_partition_from_file_paths(
608
- self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
609
- )
610
- dest_partition = ds.commit_partition(
611
- staged_dest, **local_deltacat_storage_kwargs
612
- )
613
-
614
- # rebase first
615
- rebase_url = compact_partition(
616
- CompactPartitionParams.of(
617
- {
618
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
619
- "compacted_file_content_type": ContentType.PARQUET,
620
- "dd_max_parallelism_ratio": 1.0,
621
- "deltacat_storage": ds,
622
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
623
- "destination_partition_locator": dest_partition.locator,
624
- "drop_duplicates": True,
625
- "hash_bucket_count": 1,
626
- "last_stream_position_to_compact": source_delta.stream_position,
627
- "list_deltas_kwargs": {
628
- **local_deltacat_storage_kwargs,
629
- **{"equivalent_table_types": []},
630
- },
631
- "primary_keys": ["pk"],
632
- "rebase_source_partition_locator": source_delta.partition_locator,
633
- "rebase_source_partition_high_watermark": source_delta.stream_position,
634
- "records_per_compacted_file": 4000,
635
- "s3_client_kwargs": {},
636
- "source_partition_locator": source_delta.partition_locator,
637
- "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
638
- }
639
- )
640
- )
641
-
642
- rebased_rcf = get_rcf(s3_resource, rebase_url)
643
-
644
- assert rebased_rcf.compacted_pyarrow_write_result.files == 1
645
- assert rebased_rcf.compacted_pyarrow_write_result.records == 2
646
-
647
- # Run incremental with a small delta on source
648
- chunked_pk_array = pa.chunked_array(
649
- [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
650
- ) # 2.3GB
651
- table = pa.table([chunked_pk_array], names=["pk"])
652
-
653
- incremental_source_delta = commit_delta_to_partition(
654
- source_delta.partition_locator,
655
- pa_table=table,
656
- **local_deltacat_storage_kwargs,
657
- )
658
- assert (
659
- incremental_source_delta.partition_locator == source_delta.partition_locator
660
- ), "source partition locator should not change"
661
- dest_partition = ds.get_partition(
662
- dest_partition.stream_locator,
663
- dest_partition.partition_values,
664
- **local_deltacat_storage_kwargs,
665
- )
666
-
667
- assert (
668
- dest_partition.locator
669
- == rebased_rcf.compacted_delta_locator.partition_locator
670
- ), "The new destination partition should be same as compacted partition"
671
-
672
- # Run incremental
673
- incremental_url = compact_partition(
674
- CompactPartitionParams.of(
675
- {
676
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
677
- "compacted_file_content_type": ContentType.PARQUET,
678
- "dd_max_parallelism_ratio": 1.0,
679
- "deltacat_storage": ds,
680
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
681
- "destination_partition_locator": dest_partition.locator,
682
- "drop_duplicates": True,
683
- "hash_bucket_count": 1,
684
- "last_stream_position_to_compact": incremental_source_delta.stream_position,
685
- "list_deltas_kwargs": {
686
- **local_deltacat_storage_kwargs,
687
- **{"equivalent_table_types": []},
688
- },
689
- "primary_keys": ["pk"],
690
- "records_per_compacted_file": 4000,
691
- "s3_client_kwargs": {},
692
- "source_partition_locator": incremental_source_delta.partition_locator,
693
- "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
694
- }
695
- )
696
- )
697
-
698
- incremental_rcf = get_rcf(s3_resource, incremental_url)
699
-
700
- assert incremental_rcf.compacted_pyarrow_write_result.files == 1
701
- assert (
702
- incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
703
- )
704
- assert incremental_rcf.compacted_pyarrow_write_result.records == 4
705
-
706
- def test_compact_partition_when_bucket_spec_validation_fails(
707
- self,
708
- s3_resource,
709
- local_deltacat_storage_kwargs,
710
- enable_bucketing_spec_validation,
711
- ):
712
- """
713
- A test case which asserts the bucketing spec validation throws an assertion error
714
- when the validation has failed.
715
- """
716
-
717
- # setup
718
- staged_source = stage_partition_from_file_paths(
719
- self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
720
- )
721
-
722
- source_delta = commit_delta_to_staged_partition(
723
- staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
724
- )
725
-
726
- staged_dest = stage_partition_from_file_paths(
727
- self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
728
- )
729
- dest_partition = ds.commit_partition(
730
- staged_dest, **local_deltacat_storage_kwargs
731
- )
732
-
733
- # action
734
- rcf_url = compact_partition(
735
- CompactPartitionParams.of(
736
- {
737
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
738
- "compacted_file_content_type": ContentType.PARQUET,
739
- "dd_max_parallelism_ratio": 1.0,
740
- "deltacat_storage": ds,
741
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
742
- "destination_partition_locator": dest_partition.locator,
743
- "drop_duplicates": True,
744
- "hash_bucket_count": 4,
745
- "last_stream_position_to_compact": source_delta.stream_position,
746
- "list_deltas_kwargs": {
747
- **local_deltacat_storage_kwargs,
748
- **{"equivalent_table_types": []},
749
- },
750
- "primary_keys": ["pk"],
751
- "rebase_source_partition_locator": source_delta.partition_locator,
752
- "rebase_source_partition_high_watermark": source_delta.stream_position,
753
- "records_per_compacted_file": 1,
754
- "s3_client_kwargs": {},
755
- "source_partition_locator": source_delta.partition_locator,
756
- }
757
- )
758
- )
759
-
760
- backfill_rcf = get_rcf(s3_resource, rcf_url)
761
- bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
762
- # Move the records to different hash buckets to simulate a validation failure.
763
- backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
764
- s3_resource.Bucket(bucket).put_object(
765
- Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
766
- )
767
-
768
- # Now run an incremental compaction and verify if the previous RCF was read properly.
769
- new_source_delta = commit_delta_to_partition(
770
- source_delta.partition_locator,
771
- [self.INCREMENTAL_FILE_PATH],
772
- **local_deltacat_storage_kwargs,
773
- )
774
-
775
- new_destination_partition = ds.get_partition(
776
- dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
777
- )
778
-
779
- with pytest.raises(ValidationError) as excinfo:
780
- compact_partition(
781
- CompactPartitionParams.of(
782
- {
783
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
784
- "compacted_file_content_type": ContentType.PARQUET,
785
- "dd_max_parallelism_ratio": 1.0,
786
- "deltacat_storage": ds,
787
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
788
- "destination_partition_locator": new_destination_partition.locator,
789
- "drop_duplicates": True,
790
- "hash_bucket_count": 4,
791
- "last_stream_position_to_compact": new_source_delta.stream_position,
792
- "list_deltas_kwargs": {
793
- **local_deltacat_storage_kwargs,
794
- **{"equivalent_table_types": []},
795
- },
796
- "primary_keys": ["pk"],
797
- "rebase_source_partition_locator": None,
798
- "rebase_source_partition_high_watermark": None,
799
- "records_per_compacted_file": 4000,
800
- "s3_client_kwargs": {},
801
- "source_partition_locator": new_source_delta.partition_locator,
802
- }
803
- )
804
- )
805
-
806
- assert (
807
- "Hash bucket drift detected at index: 0. Expected hash bucket index to be 1 but found 0"
808
- in str(excinfo.value)
809
- )
810
-
811
- def test_compact_partition_when_bucket_spec_validation_fails_but_env_variable_disabled(
812
- self,
813
- s3_resource,
814
- local_deltacat_storage_kwargs,
815
- ):
816
- """
817
- A test case which asserts even if bucketing spec validation fails, compaction doesn't
818
- throw an error if the feature is not enabled.
819
- """
820
-
821
- # setup
822
- staged_source = stage_partition_from_file_paths(
823
- self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
824
- )
825
-
826
- source_delta = commit_delta_to_staged_partition(
827
- staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
828
- )
829
-
830
- staged_dest = stage_partition_from_file_paths(
831
- self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
832
- )
833
- dest_partition = ds.commit_partition(
834
- staged_dest, **local_deltacat_storage_kwargs
835
- )
836
-
837
- # action
838
- rcf_url = compact_partition(
839
- CompactPartitionParams.of(
840
- {
841
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
842
- "compacted_file_content_type": ContentType.PARQUET,
843
- "dd_max_parallelism_ratio": 1.0,
844
- "deltacat_storage": ds,
845
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
846
- "destination_partition_locator": dest_partition.locator,
847
- "drop_duplicates": True,
848
- "hash_bucket_count": 4,
849
- "last_stream_position_to_compact": source_delta.stream_position,
850
- "list_deltas_kwargs": {
851
- **local_deltacat_storage_kwargs,
852
- **{"equivalent_table_types": []},
853
- },
854
- "primary_keys": ["pk"],
855
- "rebase_source_partition_locator": source_delta.partition_locator,
856
- "rebase_source_partition_high_watermark": source_delta.stream_position,
857
- "records_per_compacted_file": 1,
858
- "s3_client_kwargs": {},
859
- "source_partition_locator": source_delta.partition_locator,
860
- }
861
- )
862
- )
863
-
864
- backfill_rcf = get_rcf(s3_resource, rcf_url)
865
- bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
866
- # Move the records to different hash buckets to simulate a validation failure.
867
- backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
868
- s3_resource.Bucket(bucket).put_object(
869
- Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
870
- )
871
-
872
- # Now run an incremental compaction and verify if the previous RCF was read properly.
873
- new_source_delta = commit_delta_to_partition(
874
- source_delta.partition_locator,
875
- [self.INCREMENTAL_FILE_PATH],
876
- **local_deltacat_storage_kwargs,
877
- )
878
-
879
- new_destination_partition = ds.get_partition(
880
- dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
881
- )
882
-
883
- new_rcf = compact_partition(
884
- CompactPartitionParams.of(
885
- {
886
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
887
- "compacted_file_content_type": ContentType.PARQUET,
888
- "dd_max_parallelism_ratio": 1.0,
889
- "deltacat_storage": ds,
890
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
891
- "destination_partition_locator": new_destination_partition.locator,
892
- "drop_duplicates": True,
893
- "hash_bucket_count": 4,
894
- "last_stream_position_to_compact": new_source_delta.stream_position,
895
- "list_deltas_kwargs": {
896
- **local_deltacat_storage_kwargs,
897
- **{"equivalent_table_types": []},
898
- },
899
- "primary_keys": ["pk"],
900
- "rebase_source_partition_locator": None,
901
- "rebase_source_partition_high_watermark": None,
902
- "records_per_compacted_file": 4000,
903
- "s3_client_kwargs": {},
904
- "source_partition_locator": new_source_delta.partition_locator,
905
- }
906
- )
907
- )
908
-
909
- incremental_rcf = get_rcf(s3_resource, new_rcf)
910
- assert incremental_rcf.hash_bucket_count == 4
911
- assert len(incremental_rcf.hb_index_to_entry_range) == 2
912
-
913
- def test_compact_partition_when_bucket_spec_validation_succeeds(
914
- self,
915
- s3_resource,
916
- local_deltacat_storage_kwargs,
917
- enable_bucketing_spec_validation,
918
- ):
919
- """
920
- A test case which asserts the bucketing spec validation does not throw
921
- and error when the validation succeeds.
922
- """
923
-
924
- # setup
925
- staged_source = stage_partition_from_file_paths(
926
- self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
927
- )
928
-
929
- source_delta = commit_delta_to_staged_partition(
930
- staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
931
- )
932
-
933
- staged_dest = stage_partition_from_file_paths(
934
- self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
935
- )
936
- dest_partition = ds.commit_partition(
937
- staged_dest, **local_deltacat_storage_kwargs
938
- )
939
-
940
- # action
941
- rcf_url = compact_partition(
942
- CompactPartitionParams.of(
943
- {
944
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
945
- "compacted_file_content_type": ContentType.PARQUET,
946
- "dd_max_parallelism_ratio": 1.0,
947
- "deltacat_storage": ds,
948
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
949
- "destination_partition_locator": dest_partition.locator,
950
- "drop_duplicates": True,
951
- "hash_bucket_count": 4,
952
- "last_stream_position_to_compact": source_delta.stream_position,
953
- "list_deltas_kwargs": {
954
- **local_deltacat_storage_kwargs,
955
- **{"equivalent_table_types": []},
956
- },
957
- "primary_keys": ["pk"],
958
- "rebase_source_partition_locator": source_delta.partition_locator,
959
- "rebase_source_partition_high_watermark": source_delta.stream_position,
960
- "records_per_compacted_file": 1,
961
- "s3_client_kwargs": {},
962
- "source_partition_locator": source_delta.partition_locator,
963
- }
964
- )
965
- )
966
-
967
- rcf = get_rcf(s3_resource, rcf_url)
968
- assert rcf.hash_bucket_count == 4
969
-
970
- # Now run an incremental compaction and verify if the previous RCF was read properly.
971
- new_source_delta = commit_delta_to_partition(
972
- source_delta.partition_locator,
973
- [self.INCREMENTAL_FILE_PATH],
974
- **local_deltacat_storage_kwargs,
975
- )
976
-
977
- new_destination_partition = ds.get_partition(
978
- dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
979
- )
980
-
981
- new_uri = compact_partition(
982
- CompactPartitionParams.of(
983
- {
984
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
985
- "compacted_file_content_type": ContentType.PARQUET,
986
- "dd_max_parallelism_ratio": 1.0,
987
- "deltacat_storage": ds,
988
- "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
989
- "destination_partition_locator": new_destination_partition.locator,
990
- "drop_duplicates": True,
991
- "hash_bucket_count": 4,
992
- "last_stream_position_to_compact": new_source_delta.stream_position,
993
- "list_deltas_kwargs": {
994
- **local_deltacat_storage_kwargs,
995
- **{"equivalent_table_types": []},
996
- },
997
- "primary_keys": ["pk"],
998
- "rebase_source_partition_locator": None,
999
- "rebase_source_partition_high_watermark": None,
1000
- "records_per_compacted_file": 4000,
1001
- "s3_client_kwargs": {},
1002
- "source_partition_locator": new_source_delta.partition_locator,
1003
- }
1004
- )
1005
- )
1006
-
1007
- rcf = get_rcf(s3_resource, new_uri)
1008
- assert rcf.hash_bucket_count == 4