deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/converter/test_convert_session.py
@@ -0,0 +1,478 @@
+import pytest
+import ray
+from typing import List
+from pyiceberg.catalog.rest import RestCatalog
+from pyiceberg.expressions import EqualTo
+from pyiceberg.schema import Schema
+from pyiceberg.types import (
+    NestedField,
+    StringType,
+    LongType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import IdentityTransform
+import pyarrow as pa
+
+from deltacat.compute.converter.steps.convert import convert
+from deltacat.compute.converter.model.convert_input import ConvertInput
+from deltacat.compute.converter.pyiceberg.overrides import (
+    fetch_all_bucket_files,
+    parquet_files_dict_to_iceberg_data_files,
+)
+from collections import defaultdict
+from deltacat.compute.converter.utils.converter_session_utils import (
+    group_all_files_to_each_bucket,
+)
+from deltacat.tests.compute.converter.utils import (
+    get_s3_file_system,
+    drop_table_if_exists,
+)
+from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
+    commit_append_snapshot,
+)
+
+
+def run_spark_commands(spark, sqls: List[str]) -> None:
+    for sql in sqls:
+        spark.sql(sql)
+
+
+@pytest.mark.integration
+def test_pyiceberg_spark_setup_sanity(spark, session_catalog: RestCatalog) -> None:
+    """
+    This Test was copied over from Pyiceberg integ test: https://github.com/apache/iceberg-python/blob/main/tests/integration/test_deletes.py#L62
+    First sanity check to ensure all integration with Pyiceberg and Spark are working as expected.
+    """
+    identifier = "default.table_partitioned_delete"
+
+    run_spark_commands(
+        spark,
+        [
+            f"DROP TABLE IF EXISTS {identifier}",
+            f"""
+            CREATE TABLE {identifier} (
+                number_partitioned int,
+                number int
+            )
+            USING iceberg
+            PARTITIONED BY (number_partitioned)
+            TBLPROPERTIES('format-version' = 2)
+            """,
+            f"""
+            INSERT INTO {identifier} VALUES (10, 20), (10, 30)
+            """,
+            f"""
+            INSERT INTO {identifier} VALUES (11, 20), (11, 30)
+            """,
+        ],
+    )
+
+    tbl = session_catalog.load_table(identifier)
+    tbl.delete(EqualTo("number_partitioned", 10))
+
+    # No overwrite operation
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
+        "append",
+        "append",
+        "delete",
+    ]
+    assert tbl.scan().to_arrow().to_pydict() == {
+        "number_partitioned": [11, 11],
+        "number": [20, 30],
+    }
+
+
+@pytest.mark.integration
+def test_spark_position_delete_production_sanity(
+    spark, session_catalog: RestCatalog
+) -> None:
+    """
+    Sanity test to ensure Spark position delete production is successful with `merge-on-read` spec V2.
+    Table has two partition levels. 1. BucketTransform on primary key
+    """
+    identifier = "default.table_spark_position_delete_production_sanity"
+
+    run_spark_commands(
+        spark,
+        [
+            f"DROP TABLE IF EXISTS {identifier}",
+            f"""
+            CREATE TABLE {identifier} (
+                number_partitioned INT,
+                primary_key STRING
+            )
+            USING iceberg
+            PARTITIONED BY (bucket(3, primary_key), number_partitioned)
+            TBLPROPERTIES(
+                'format-version' = 2,
+                'write.delete.mode'='merge-on-read',
+                'write.update.mode'='merge-on-read',
+                'write.merge.mode'='merge-on-read'
+            )
+            """,
+            f"""
+            INSERT INTO {identifier} VALUES (0, 'pk1'), (0, 'pk2'), (0, 'pk3')
+            """,
+            f"""
+            INSERT INTO {identifier} VALUES (1, 'pk1'), (1, 'pk2'), (1, 'pk3')
+            """,
+        ],
+    )
+
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            DELETE FROM {identifier} WHERE primary_key in ("pk1")
+            """,
+        ],
+    )
+
+    tbl = session_catalog.load_table(identifier)
+    tbl.refresh()
+
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
+        "append",
+        "append",
+        "delete",
+    ]
+
+    assert tbl.scan().to_arrow().to_pydict() == {
+        "number_partitioned": [1, 1, 0, 0],
+        "primary_key": ["pk2", "pk3", "pk2", "pk3"],
+    }
+
+
+@pytest.mark.integration
+def test_converter_drop_duplicates_success(
+    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+) -> None:
+    """
+    Test for convert compute remote function happy case. Download file results are mocked.
+    """
+
+    # 1. Create Iceberg table
+    namespace = "default"
+    table_name = "table_converter_ray_pos_delete_drop_duplicates_compute"
+    identifier = f"{namespace}.{table_name}"
+
+    schema = Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        # Explicitly define "file_path" and "pos" for assertion of deterministic record after dedupe
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    partition_spec = PartitionSpec(partition_field_identity)
+
+    properties = dict()
+    properties["write.format.default"] = "parquet"
+    properties["write.delete.mode"] = "merge-on-read"
+    properties["write.update.mode"] = "merge-on-read"
+    properties["write.merge.mode"] = "merge-on-read"
+    properties["format-version"] = "2"
+
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+
+    # 2. Use Spark to generate initial data files
+    tbl = session_catalog.load_table(identifier)
+    tbl.refresh()
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 1), (0, "pk2", "path2", 2), (0, "pk3", "path3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 4), (0, "pk2", "path2", 5), (0, "pk3", "path3", 6)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk4", "path4", 7), (0, "pk2", "path2", 8), (0, "pk3", "path3", 9)
+            """
+        ],
+    )
+
+    tbl = session_catalog.load_table(identifier)
+    # 3. Use convert.remote() function to compute position deletes
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=["primary_key"],
+            compact_small_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            s3_file_system=s3_file_system,
+        )
+
+    number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
+    primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
+    names = ["number_partitioned", "primary_key"]
+    data_table_1 = pa.Table.from_arrays(
+        [number_partitioned_array_1, primary_key_array_1], names=names
+    )
+
+    number_partitioned_array_2 = pa.array([0, 0, 0], type=pa.int32())
+    primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
+    names = ["number_partitioned", "primary_key"]
+    data_table_2 = pa.Table.from_arrays(
+        [number_partitioned_array_2, primary_key_array_2], names=names
+    )
+
+    number_partitioned_array_3 = pa.array([0, 0, 0], type=pa.int32())
+    primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
+    names = ["number_partitioned", "primary_key"]
+    data_table_3 = pa.Table.from_arrays(
+        [number_partitioned_array_3, primary_key_array_3], names=names
+    )
+
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
+    )
+    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+
+    convert_ref = convert.remote(convert_input)
+
+    to_be_deleted_files_list = []
+    to_be_added_files_dict_list = []
+    convert_result = ray.get(convert_ref)
+
+    partition_value = convert_input.convert_input_files.partition_value
+
+    if convert_result[0]:
+        to_be_deleted_files_list.extend(convert_result[0].values())
+
+    file_location = convert_result[1][partition_value][0]
+    to_be_added_files = f"s3://{file_location}"
+
+    to_be_added_files_dict = defaultdict()
+    to_be_added_files_dict[partition_value] = [to_be_added_files]
+    to_be_added_files_dict_list.append(to_be_added_files_dict)
+
+    # 4. Commit position delete, delete equality deletes from table
+    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
+        io=tbl.io,
+        table_metadata=tbl.metadata,
+        files_dict_list=to_be_added_files_dict_list,
+    )
+    commit_append_snapshot(
+        iceberg_table=tbl,
+        new_position_delete_files=new_position_delete_files,
+    )
+    tbl.refresh()
+
+    # 5. Only primary key 2 and 3 should exist in table, as primary key 1 is deleted.
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+
+    # Only one unique record for each pk exists
+    all_pk = sorted(pyiceberg_scan_table_rows["primary_key"])
+    assert all_pk == ["pk1", "pk2", "pk3", "pk4"]
+
+    # Expected unique record to keep for each pk
+    expected_pk_to_pos_mapping = {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7}
+    for pk, pos in zip(
+        pyiceberg_scan_table_rows["primary_key"], pyiceberg_scan_table_rows["pos"]
+    ):
+        assert pos == expected_pk_to_pos_mapping[pk]
+
+
+@pytest.mark.integration
+def test_converter_pos_delete_read_by_spark_success(
+    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+) -> None:
+    """
+    Test for convert compute remote function happy case. Download file results are mocked.
+    """
+
+    # 1. Create Iceberg table
+    namespace = "default"
+    table_name = "table_converter_ray_pos_delete_read_by_spark_success"
+    identifier = f"{namespace}.{table_name}"
+
+    schema = Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        schema_id=0,
+    )
+
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    partition_spec = PartitionSpec(partition_field_identity)
+
+    properties = dict()
+    properties["write.format.default"] = "parquet"
+    properties["write.delete.mode"] = "merge-on-read"
+    properties["write.update.mode"] = "merge-on-read"
+    properties["write.merge.mode"] = "merge-on-read"
+    properties["format-version"] = "2"
+
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+
+    # 2. Use Spark to generate initial data files
+    tbl = session_catalog.load_table(identifier)
+
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk4"), (0, "pk2"), (0, "pk3")
+            """
+        ],
+    )
+    tbl.refresh()
+
+    # 3. Use convert.remote() function to compute position deletes
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=["primary_key"],
+            compact_small_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            s3_file_system=s3_file_system,
+        )

+    primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
+    names = ["primary_key"]
+    data_table_1 = pa.Table.from_arrays([primary_key_array_1], names=names)
+
+    primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
+    names = ["primary_key"]
+    data_table_2 = pa.Table.from_arrays([primary_key_array_2], names=names)
+
+    primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
+    names = ["primary_key"]
+    data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)
+
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
+    )
+    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+
+    convert_ref = convert.remote(convert_input)
+
+    to_be_deleted_files_list = []
+    to_be_added_files_dict_list = []
+    convert_result = ray.get(convert_ref)
+
+    partition_value = convert_input.convert_input_files.partition_value
+
+    if convert_result[0]:
+        to_be_deleted_files_list.extend(convert_result[0].values())
+
+    file_location = convert_result[1][partition_value][0]
+    to_be_added_files = f"s3://{file_location}"
+
+    to_be_added_files_dict = defaultdict()
+    to_be_added_files_dict[partition_value] = [to_be_added_files]
+    to_be_added_files_dict_list.append(to_be_added_files_dict)
+
+    # 4. Commit position delete, delete equality deletes from table
+    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
+        io=tbl.io,
+        table_metadata=tbl.metadata,
+        files_dict_list=to_be_added_files_dict_list,
+    )
+
+    commit_append_snapshot(
+        iceberg_table=tbl,
+        new_position_delete_files=new_position_delete_files,
+    )
+    tbl.refresh()
+
+    # 5. Result assertion: Spark read table contains unique primary key
+    spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+    all_pk = [
+        spark_read_pos_delete[row_idx][1]
+        for row_idx in range(len(spark_read_pos_delete))
+    ]
+    all_pk_sorted = sorted(all_pk)
+    assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
deltacat/tests/compute/converter/utils.py
@@ -0,0 +1,123 @@
+import uuid
+import logging
+from pyiceberg.exceptions import NoSuchTableError
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def get_s3_file_system():
+    import pyarrow
+
+    return pyarrow.fs.S3FileSystem(
+        access_key="admin",
+        secret_key="password",
+        endpoint_override="http://localhost:9000",
+    )
+    # 'region="us-east-1", proxy_options={'scheme': 'http', 'host': 'localhost',
+    # 'port': 9000, 'username': 'admin',
+    # 'password': 'password'})
+
+
+def write_equality_data_table(
+    file_link_prefix, table, partition_value, equality_delete_table
+):
+    import pyarrow.parquet as pq
+
+    uuid_path = uuid.uuid4()
+    deletes_file_path = f"{file_link_prefix}/{uuid_path}_deletes.parquet"
+    file_system = get_s3_file_system()
+    pq.write_table(equality_delete_table, deletes_file_path, filesystem=file_system)
+    return f"s3://{deletes_file_path}"
+
+
+def add_equality_data_files(file_paths, table, partition_value):
+    with table.transaction() as tx:
+        if table.metadata.name_mapping() is None:
+            tx.set_properties(
+                **{
+                    "schema.name-mapping.default": table.metadata.schema().name_mapping.model_dump_json()
+                }
+            )
+        with tx.update_snapshot().fast_append() as update_snapshot:
+            data_files = parquet_files_to_equality_data_files(
+                table_metadata=table.metadata,
+                file_paths=file_paths,
+                io=table.io,
+                partition_value=partition_value,
+            )
+            for data_file in data_files:
+                update_snapshot.append_data_file(data_file)
+
+
+def parquet_files_to_equality_data_files(
+    io, table_metadata, file_paths, partition_value
+):
+    from pyiceberg.io.pyarrow import (
+        _check_pyarrow_schema_compatible,
+        data_file_statistics_from_parquet_metadata,
+        compute_statistics_plan,
+        parquet_path_to_id_mapping,
+    )
+    from pyiceberg.manifest import (
+        DataFile,
+        DataFileContent,
+        FileFormat,
+    )
+    import pyarrow.parquet as pq
+
+    for file_path in file_paths:
+        input_file = io.new_input(file_path)
+        with input_file.open() as input_stream:
+            parquet_metadata = pq.read_metadata(input_stream)
+
+        schema = table_metadata.schema()
+        _check_pyarrow_schema_compatible(
+            schema, parquet_metadata.schema.to_arrow_schema()
+        )
+
+        statistics = data_file_statistics_from_parquet_metadata(
+            parquet_metadata=parquet_metadata,
+            stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+            parquet_column_mapping=parquet_path_to_id_mapping(schema),
+        )
+        data_file = DataFile(
+            content=DataFileContent.EQUALITY_DELETES,
+            file_path=file_path,
+            file_format=FileFormat.PARQUET,
+            partition=partition_value,
+            file_size_in_bytes=len(input_file),
+            sort_order_id=None,
+            spec_id=table_metadata.default_spec_id,
+            equality_ids=None,
+            key_metadata=None,
+            **statistics.to_serialized_dict(),
+        )
+
+        yield data_file
+
+
+def commit_equality_delete_to_table(
+    table, file_link_prefix, partition_value, equality_delete_table
+):
+
+    data_files = [
+        write_equality_data_table(
+            table=table,
+            file_link_prefix=file_link_prefix,
+            partition_value=partition_value,
+            equality_delete_table=equality_delete_table,
+        )
+    ]
+
+    add_equality_data_files(
+        file_paths=data_files, partition_value=partition_value, table=table
+    )
+    return data_files
+
+
+def drop_table_if_exists(table, catalog):
+    try:
+        catalog.drop_table(table)
+    except NoSuchTableError:
+        logger.warning(f"table:{table} doesn't exist, not dropping table.")
deltacat/tests/compute/resource_estimation/test_delta.py
@@ -1,6 +1,5 @@
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
-import os
 import pytest
 from deltacat.storage import Delta
 from deltacat.compute.resource_estimation.delta import (
@@ -21,21 +20,6 @@ Function scoped fixtures
 """


-@pytest.fixture(scope="function")
-def local_deltacat_storage_kwargs():
-    DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
-        "db_file_path",
-        "deltacat/tests/local_deltacat_storage/db_test.sqlite",
-    )
-    # see deltacat/tests/local_deltacat_storage/README.md for documentation
-    kwargs_for_local_deltacat_storage = {
-        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
-    }
-    yield kwargs_for_local_deltacat_storage
-    if os.path.exists(DATABASE_FILE_PATH_VALUE):
-        os.remove(DATABASE_FILE_PATH_VALUE)
-
-
 @pytest.fixture(scope="function")
 def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
     """