deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -1,37 +1,6 @@
1
1
  import unittest
2
2
  import ray
3
- from deltacat.compute.compactor_v2.utils.task_options import (
4
- _get_task_options,
5
- _get_merge_task_options,
6
- logger,
7
- )
8
- from deltacat.compute.resource_estimation.model import (
9
- EstimateResourcesParams,
10
- ResourceEstimationMethod,
11
- )
12
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
13
- from deltacat.compute.compactor import (
14
- PyArrowWriteResult,
15
- RoundCompletionInfo,
16
- )
17
- from deltacat.types.media import (
18
- ContentType,
19
- ContentEncoding,
20
- )
21
- from deltacat.storage import (
22
- DeltaLocator,
23
- Manifest,
24
- ManifestMeta,
25
- ManifestEntry,
26
- ManifestEntryList,
27
- PartitionValues,
28
- )
29
- from unittest.mock import MagicMock
30
- from typing import Optional
31
-
32
- from deltacat.compute.compactor_v2.constants import (
33
- AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
34
- )
3
+ from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
35
4
 
36
5
 
37
6
  @ray.remote
@@ -45,95 +14,11 @@ def throwing_func():
45
14
 
46
15
 
47
16
  class TestTaskOptions(unittest.TestCase):
48
- TEST_INDEX = 0
49
- TEST_HB_GROUP_IDX = 0
50
- TEST_STREAM_POSITION = 1_000_000
51
- TEST_NUM_HASH_GROUPS = 1
52
-
53
17
  @classmethod
54
18
  def setUpClass(cls):
55
19
  ray.init(local_mode=True, ignore_reinit_error=True)
56
20
  super().setUpClass()
57
21
 
58
- @classmethod
59
- def tearDownClass(cls) -> None:
60
- ray.shutdown()
61
-
62
- def _make_estimate_resource_params(
63
- cls,
64
- resource_estimation_method: Optional[
65
- ResourceEstimationMethod
66
- ] = ResourceEstimationMethod.DEFAULT,
67
- previous_inflation: Optional[int] = 7,
68
- average_record_size_bytes: Optional[int] = 1000,
69
- ):
70
- return EstimateResourcesParams.of(
71
- resource_estimation_method=resource_estimation_method,
72
- previous_inflation=previous_inflation,
73
- average_record_size_bytes=average_record_size_bytes,
74
- )
75
-
76
- def _make_manifest(
77
- self,
78
- source_content_length: Optional[int] = 1000,
79
- content_type: Optional[ContentType] = ContentType.PARQUET,
80
- content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
81
- partition_values: Optional[PartitionValues] = None,
82
- uri: Optional[str] = "test",
83
- url: Optional[str] = "test",
84
- author: Optional[str] = "foo",
85
- entry_uuid: Optional[str] = "foo",
86
- manifest_uuid: Optional[str] = "bar",
87
- ) -> Manifest:
88
- meta = ManifestMeta.of(
89
- 10,
90
- 10,
91
- content_type=content_type,
92
- content_encoding=content_encoding,
93
- source_content_length=source_content_length,
94
- partition_values=partition_values,
95
- )
96
-
97
- return Manifest.of(
98
- entries=ManifestEntryList.of(
99
- [
100
- ManifestEntry.of(
101
- uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
102
- )
103
- ]
104
- ),
105
- author=author,
106
- uuid=manifest_uuid,
107
- )
108
-
109
- def make_round_completion_info(
110
- self,
111
- high_watermark: Optional[int] = 1_000_000,
112
- compacted_delta_locator: Optional[DeltaLocator] = None,
113
- records_written: Optional[int] = 10,
114
- bytes_written: Optional[int] = 10,
115
- files_written: Optional[int] = 10,
116
- rows_dropped: Optional[int] = 10,
117
- sort_keys_bit_width: Optional[int] = 0,
118
- hash_bucket_count: Optional[int] = 1,
119
- hb_index_to_entry_range: Optional[dict] = None,
120
- ) -> RoundCompletionInfo:
121
- if compacted_delta_locator is None:
122
- compacted_delta_locator = MagicMock(spec=DeltaLocator)
123
-
124
- hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
125
-
126
- return RoundCompletionInfo.of(
127
- compacted_delta_locator=compacted_delta_locator,
128
- high_watermark=high_watermark,
129
- compacted_pyarrow_write_result=PyArrowWriteResult.of(
130
- records_written, bytes_written, files_written, rows_dropped
131
- ),
132
- sort_keys_bit_width=sort_keys_bit_width,
133
- hb_index_to_entry_range=hb_index_to_entry_range,
134
- hash_bucket_count=hash_bucket_count,
135
- )
136
-
137
22
  def test_get_task_options_sanity(self):
138
23
  opts = _get_task_options(0.01, 0.01)
139
24
  result_ref = valid_func.options(**opts).remote()
@@ -146,160 +31,3 @@ class TestTaskOptions(unittest.TestCase):
146
31
  result_ref = throwing_func.options(**opts).remote()
147
32
 
148
33
  self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
149
-
150
- def test_get_merge_task_options_memory_logs_enabled_sanity(self):
151
- test_index = 0
152
- test_hb_group_idx = 0
153
- test_debug_memory_params = {"merge_task_index": test_index}
154
- test_estimate_memory_params = self._make_estimate_resource_params()
155
- test_ray_custom_resources = {}
156
- test_rcf = self.make_round_completion_info()
157
- test_manifest = self._make_manifest()
158
- expected_task_opts = {
159
- "max_retries": 3,
160
- "memory": 1680.64,
161
- "num_cpus": 0.01,
162
- "scheduling_strategy": "SPREAD",
163
- }
164
- expected_previous_inflation = 1.0
165
- expected_average_record_size = 1.0
166
- with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
167
- # At least one log of level DEBUG must be emitted
168
- actual_merge_tasks_opts = _get_merge_task_options(
169
- index=test_index,
170
- hb_group_idx=test_hb_group_idx,
171
- data_size=1,
172
- pk_size_bytes=1,
173
- num_rows=1,
174
- num_hash_groups=1,
175
- total_memory_buffer_percentage=1,
176
- incremental_index_array_size=1,
177
- debug_memory_params=test_debug_memory_params,
178
- ray_custom_resources=test_ray_custom_resources,
179
- estimate_resources_params=test_estimate_memory_params,
180
- round_completion_info=test_rcf,
181
- compacted_delta_manifest=test_manifest,
182
- memory_logs_enabled=True,
183
- )
184
- assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
185
- log_message_round_completion_info = cm.records[0].getMessage()
186
- log_message_debug_memory_params = cm.records[1].getMessage()
187
- self.assertIn(
188
- f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
189
- log_message_round_completion_info,
190
- )
191
- self.assertIn(
192
- f"[Merge task {test_index}]: Params used for calculating merge memory",
193
- log_message_debug_memory_params,
194
- )
195
- self.assertIn(
196
- f"'previous_inflation': {expected_previous_inflation}",
197
- log_message_debug_memory_params,
198
- )
199
- self.assertIn(
200
- f"'average_record_size': {expected_average_record_size}",
201
- log_message_debug_memory_params,
202
- )
203
-
204
- def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
205
- self,
206
- ):
207
- test_index = 0
208
- test_hb_group_idx = 0
209
- test_debug_memory_params = {"merge_task_index": test_index}
210
- test_estimate_memory_params = self._make_estimate_resource_params()
211
- test_ray_custom_resources = {}
212
- test_rcf = self.make_round_completion_info(
213
- bytes_written=0, records_written=0, files_written=0, rows_dropped=0
214
- )
215
- test_manifest = self._make_manifest()
216
- expected_task_opts = {
217
- "max_retries": 3,
218
- "memory": 1680.64,
219
- "num_cpus": 0.01,
220
- "scheduling_strategy": "SPREAD",
221
- }
222
- expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
223
- expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
224
- with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
225
- # At least one log of level DEBUG must be emitted
226
- actual_merge_tasks_opts = _get_merge_task_options(
227
- index=test_index,
228
- hb_group_idx=test_hb_group_idx,
229
- data_size=1,
230
- pk_size_bytes=1,
231
- num_rows=1,
232
- num_hash_groups=1,
233
- total_memory_buffer_percentage=1,
234
- incremental_index_array_size=1,
235
- debug_memory_params=test_debug_memory_params,
236
- ray_custom_resources=test_ray_custom_resources,
237
- estimate_resources_params=test_estimate_memory_params,
238
- round_completion_info=test_rcf,
239
- compacted_delta_manifest=test_manifest,
240
- memory_logs_enabled=True,
241
- )
242
- assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
243
- log_message_round_completion_info = cm.records[0].getMessage()
244
- log_message_debug_memory_params = cm.records[1].getMessage()
245
- self.assertIn(
246
- f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
247
- log_message_round_completion_info,
248
- )
249
- self.assertIn(
250
- f"[Merge task {test_index}]: Params used for calculating merge memory",
251
- log_message_debug_memory_params,
252
- )
253
- self.assertIn(
254
- f"'previous_inflation': {expected_previous_inflation}",
255
- log_message_debug_memory_params,
256
- )
257
- self.assertIn(
258
- f"'average_record_size': {expected_average_record_size}",
259
- log_message_debug_memory_params,
260
- )
261
-
262
- def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
263
- self,
264
- ):
265
- test_index = 0
266
- test_hb_group_idx = 0
267
- test_debug_memory_params = {"merge_task_index": test_index}
268
- test_estimate_memory_params = self._make_estimate_resource_params()
269
- test_ray_custom_resources = {}
270
- test_rcf = None
271
- test_manifest = self._make_manifest()
272
- expected_task_opts = {
273
- "max_retries": 3,
274
- "memory": 1680.64,
275
- "num_cpus": 0.01,
276
- "scheduling_strategy": "SPREAD",
277
- }
278
- with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
279
- # At least one log of level DEBUG must be emitted
280
- actual_merge_tasks_opts = _get_merge_task_options(
281
- index=test_index,
282
- hb_group_idx=test_hb_group_idx,
283
- data_size=1,
284
- pk_size_bytes=1,
285
- num_rows=1,
286
- num_hash_groups=1,
287
- total_memory_buffer_percentage=1,
288
- incremental_index_array_size=1,
289
- debug_memory_params=test_debug_memory_params,
290
- ray_custom_resources=test_ray_custom_resources,
291
- estimate_resources_params=test_estimate_memory_params,
292
- round_completion_info=test_rcf,
293
- compacted_delta_manifest=test_manifest,
294
- memory_logs_enabled=True,
295
- )
296
- assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
297
- log_message_debug_memory_params = cm.records[0].getMessage()
298
- self.assertIn(
299
- f"[Merge task {test_index}]: Params used for calculating merge memory",
300
- log_message_debug_memory_params,
301
- )
302
- self.assertNotIn(
303
- "'average_record_size'",
304
- log_message_debug_memory_params,
305
- )
@@ -0,0 +1,75 @@
1
+ import os
2
+ import tempfile
3
+ import shutil
4
+ from typing import Dict
5
+
6
+ import pytest
7
+
8
+
9
+ @pytest.fixture
10
+ def temp_dir():
11
+ """
12
+ Fixture that creates a temporary directory for tests and cleans it up afterwards.
13
+
14
+ Returns:
15
+ str: Path to the temporary directory
16
+ """
17
+ # Create a temporary directory
18
+ dir_path = tempfile.mkdtemp()
19
+
20
+ # Provide the directory path to the test
21
+ yield dir_path
22
+
23
+ # Cleanup: remove the directory after the test is done
24
+ shutil.rmtree(dir_path)
25
+
26
+
27
+ @pytest.fixture(scope="function")
28
+ def local_deltacat_storage_kwargs(temp_dir):
29
+ """
30
+ Fixture that creates a temporary database file for each test function
31
+ and returns storage kwargs dictionary.
32
+
33
+ Returns:
34
+ dict: A dictionary with db_file_path key pointing to a temporary database file
35
+ """
36
+ # Create a unique database file in the temporary directory
37
+ db_file_path = os.path.join(temp_dir, "db_test.sqlite")
38
+
39
+ # Return kwargs dictionary ready to use
40
+ kwargs = {"db_file_path": db_file_path}
41
+ yield kwargs
42
+
43
+ # Cleanup: remove the database file if it exists
44
+ if os.path.exists(db_file_path):
45
+ os.remove(db_file_path)
46
+
47
+
48
+ def create_local_deltacat_storage_file() -> Dict[str, str]:
49
+ """
50
+ Helper function to create a local deltacat storage file
51
+
52
+ Essentially uses the same approach as local_deltacat_storage_kwargs, but more flexible
53
+ if the consumer does not want to use a function scoped fixture
54
+
55
+ Returns: kwargs to use for local deltacat storage, i.e. {"db_file_path": $db_file}
56
+ """
57
+ temp_dir = tempfile.mkdtemp()
58
+ db_file_path = os.path.join(temp_dir, "db_test.sqlite")
59
+ return {"db_file_path": db_file_path}
60
+
61
+
62
+ def clean_up_local_deltacat_storage_file(local_storage_kwargs: Dict[str, str]):
63
+ """
64
+ Cleans up local file and directory created by create_local_deltacat_storage_file
65
+ """
66
+ db_file = local_storage_kwargs["db_file_path"]
67
+ dir_path = os.path.dirname(db_file)
68
+
69
+ # Remove the database file if it exists
70
+ if os.path.exists(db_file):
71
+ os.remove(db_file)
72
+
73
+ # Remove the temporary directory if it exists
74
+ if os.path.exists(dir_path):
75
+ shutil.rmtree(dir_path)
File without changes
@@ -0,0 +1,80 @@
1
+ import pytest
2
+ from pyspark.sql import SparkSession
3
+ import os
4
+ import ray
5
+ from pyiceberg.catalog import Catalog, load_catalog
6
+
7
+
8
+ @pytest.fixture
9
+ def spark():
10
+ import importlib.metadata
11
+
12
+ spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2])
13
+ scala_version = "2.12"
14
+ iceberg_version = "1.6.0"
15
+
16
+ os.environ["PYSPARK_SUBMIT_ARGS"] = (
17
+ f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version},"
18
+ f"org.apache.iceberg:iceberg-aws-bundle:{iceberg_version} pyspark-shell"
19
+ )
20
+ os.environ["AWS_REGION"] = "us-east-1"
21
+ os.environ["AWS_ACCESS_KEY_ID"] = "admin"
22
+ os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
23
+
24
+ spark = (
25
+ SparkSession.builder.appName("PyIceberg integration test")
26
+ .config("spark.sql.session.timeZone", "UTC")
27
+ .config(
28
+ "spark.sql.extensions",
29
+ "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
30
+ )
31
+ .config(
32
+ "spark.sql.catalog.integration", "org.apache.iceberg.spark.SparkCatalog"
33
+ )
34
+ .config(
35
+ "spark.sql.catalog.integration.catalog-impl",
36
+ "org.apache.iceberg.rest.RESTCatalog",
37
+ )
38
+ .config("spark.sql.catalog.integration.cache-enabled", "false")
39
+ .config("spark.sql.catalog.integration.uri", "http://localhost:8181")
40
+ .config(
41
+ "spark.sql.catalog.integration.io-impl",
42
+ "org.apache.iceberg.aws.s3.S3FileIO",
43
+ )
44
+ .config("spark.sql.catalog.integration.warehouse", "s3://warehouse/wh/")
45
+ .config("spark.sql.catalog.integration.s3.endpoint", "http://localhost:9000")
46
+ .config("spark.sql.catalog.integration.s3.path-style-access", "true")
47
+ .config("spark.sql.defaultCatalog", "integration")
48
+ .config("spark.sql.catalog.hive", "org.apache.iceberg.spark.SparkCatalog")
49
+ .config("spark.sql.catalog.hive.type", "hive")
50
+ .config("spark.sql.catalog.hive.uri", "http://localhost:9083")
51
+ .config("spark.sql.catalog.hive.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
52
+ .config("spark.sql.catalog.hive.warehouse", "s3://warehouse/hive/")
53
+ .config("spark.sql.catalog.hive.s3.endpoint", "http://localhost:9000")
54
+ .config("spark.sql.catalog.hive.s3.path-style-access", "true")
55
+ .config("spark.sql.execution.arrow.pyspark.enabled", "true")
56
+ .getOrCreate()
57
+ )
58
+
59
+ return spark
60
+
61
+
62
+ @pytest.fixture(scope="session")
63
+ def session_catalog() -> Catalog:
64
+ return load_catalog(
65
+ "local",
66
+ **{
67
+ "type": "rest",
68
+ "uri": "http://localhost:8181",
69
+ "s3.endpoint": "http://localhost:9000",
70
+ "s3.access-key-id": "admin",
71
+ "s3.secret-access-key": "password",
72
+ },
73
+ )
74
+
75
+
76
+ @pytest.fixture(autouse=True, scope="module")
77
+ def setup_ray_cluster():
78
+ ray.init(local_mode=True, ignore_reinit_error=True)
79
+ yield
80
+ ray.shutdown()