deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff shows the changes between these publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
+from collections import defaultdict
 import pytest
 import ray
-from typing import List
+from typing import List, Dict, Any, Tuple
 from pyiceberg.catalog.rest import RestCatalog
-from pyiceberg.expressions import EqualTo
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     NestedField,
@@ -25,144 +25,85 @@ from deltacat.compute.converter.utils.converter_session_utils import (
 from deltacat.tests.compute.converter.utils import (
     get_s3_file_system,
     drop_table_if_exists,
+    commit_equality_delete_to_table,
 )
 from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
     commit_append_snapshot,
+    commit_replace_snapshot,
 )

+from pyiceberg.typedef import Record
+from deltacat.compute.converter.utils.convert_task_options import BASE_MEMORY_BUFFER
+from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from pyiceberg.catalog import load_catalog
+import os
+import pyarrow.parquet as pq
+from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+from pyiceberg.io.pyarrow import (
+    data_file_statistics_from_parquet_metadata,
+    compute_statistics_plan,
+    parquet_path_to_id_mapping,
+)
+from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
+from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchTableError
+from pyiceberg.io.pyarrow import schema_to_pyarrow

-def run_spark_commands(spark, sqls: List[str]) -> None:
-    for sql in sqls:
-        spark.sql(sql)
+# Task memory in bytes for testing
+TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER


-@pytest.mark.integration
-def test_pyiceberg_spark_setup_sanity(spark, session_catalog: RestCatalog) -> None:
-    """
-    This Test was copied over from Pyiceberg integ test: https://github.com/apache/iceberg-python/blob/main/tests/integration/test_deletes.py#L62
-    First sanity check to ensure all integration with Pyiceberg and Spark are working as expected.
-    """
-    identifier = "default.table_partitioned_delete"
-
-    run_spark_commands(
-        spark,
-        [
-            f"DROP TABLE IF EXISTS {identifier}",
-            f"""
-            CREATE TABLE {identifier} (
-                number_partitioned int,
-                number int
-            )
-            USING iceberg
-            PARTITIONED BY (number_partitioned)
-            TBLPROPERTIES('format-version' = 2)
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (10, 20), (10, 30)
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (11, 20), (11, 30)
-            """,
-        ],
+# Test data fixtures
+@pytest.fixture
+def base_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
     )

-    tbl = session_catalog.load_table(identifier)
-    tbl.delete(EqualTo("number_partitioned", 10))
-
-    # No overwrite operation
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
-        "append",
-        "append",
-        "delete",
-    ]
-    assert tbl.scan().to_arrow().to_pydict() == {
-        "number_partitioned": [11, 11],
-        "number": [20, 30],
-    }
-
-
-@pytest.mark.integration
-def test_spark_position_delete_production_sanity(
-    spark, session_catalog: RestCatalog
-) -> None:
-    """
-    Sanity test to ensure Spark position delete production is successful with `merge-on-read` spec V2.
-    Table has two partition levels. 1. BucketTransform on primary key
-    """
-    identifier = "default.table_spark_position_delete_production_sanity"
-
-    run_spark_commands(
-        spark,
-        [
-            f"DROP TABLE IF EXISTS {identifier}",
-            f"""
-            CREATE TABLE {identifier} (
-                number_partitioned INT,
-                primary_key STRING
-            )
-            USING iceberg
-            PARTITIONED BY (bucket(3, primary_key), number_partitioned)
-            TBLPROPERTIES(
-                'format-version' = 2,
-                'write.delete.mode'='merge-on-read',
-                'write.update.mode'='merge-on-read',
-                'write.merge.mode'='merge-on-read'
-            )
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (0, 'pk1'), (0, 'pk2'), (0, 'pk3')
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (1, 'pk1'), (1, 'pk2'), (1, 'pk3')
-            """,
-        ],
-    )

-    run_spark_commands(
-        spark,
-        [
-            f"""
-            DELETE FROM {identifier} WHERE primary_key in ("pk1")
-            """,
-        ],
+@pytest.fixture
+def base_schema_without_metadata():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        schema_id=0,
     )

-    tbl = session_catalog.load_table(identifier)
-    tbl.refresh()
-
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
-        "append",
-        "append",
-        "delete",
-    ]
-
-    assert tbl.scan().to_arrow().to_pydict() == {
-        "number_partitioned": [1, 1, 0, 0],
-        "primary_key": ["pk2", "pk3", "pk2", "pk3"],
-    }
-
-
-@pytest.mark.integration
-def test_converter_drop_duplicates_success(
-    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
-) -> None:
-    """
-    Test for convert compute remote function happy case. Download file results are mocked.
-    """
-
-    # 1. Create Iceberg table
-    namespace = "default"
-    table_name = "table_converter_ray_pos_delete_drop_duplicates_compute"
-    identifier = f"{namespace}.{table_name}"

-    schema = Schema(
+@pytest.fixture
+def multi_key_schema():
+    return Schema(
         NestedField(
             field_id=1, name="number_partitioned", field_type=LongType(), required=False
         ),
         NestedField(
-            field_id=2, name="primary_key", field_type=StringType(), required=False
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
         ),
-        # Explicitly define "file_path" and "pos" for assertion of deterministic record after dedupe
         NestedField(
             field_id=2147483546,
             name="file_path",
@@ -175,21 +116,55 @@ def test_converter_drop_duplicates_success(
         schema_id=0,
     )

+
+@pytest.fixture
+def multi_key_schema_without_file_path():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_partition_spec():
     partition_field_identity = PartitionField(
         source_id=1,
         field_id=101,
         transform=IdentityTransform(),
         name="number_partitioned",
     )
-    partition_spec = PartitionSpec(partition_field_identity)
+    return PartitionSpec(partition_field_identity)

-    properties = dict()
-    properties["write.format.default"] = "parquet"
-    properties["write.delete.mode"] = "merge-on-read"
-    properties["write.update.mode"] = "merge-on-read"
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"

+@pytest.fixture
+def table_properties():
+    return {
+        "write.format.default": "parquet",
+        "write.delete.mode": "merge-on-read",
+        "write.update.mode": "merge-on-read",
+        "write.merge.mode": "merge-on-read",
+        "format-version": "2",
+    }
+
+
+def create_test_table(
+    session_catalog: RestCatalog,
+    namespace: str,
+    table_name: str,
+    schema: Schema,
+    partition_spec: PartitionSpec,
+    properties: Dict[str, str],
+) -> str:
+    """Helper function to create a test table"""
+    identifier = f"{namespace}.{table_name}"
     drop_table_if_exists(identifier, session_catalog)
     session_catalog.create_table(
         identifier,
@@ -197,204 +172,323 @@ def test_converter_drop_duplicates_success(
         partition_spec=partition_spec,
         properties=properties,
     )
+    return identifier

-    # 2. Use Spark to generate initial data files
-    tbl = session_catalog.load_table(identifier)
-    tbl.refresh()
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 1), (0, "pk2", "path2", 2), (0, "pk3", "path3", 3)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 4), (0, "pk2", "path2", 5), (0, "pk3", "path3", 6)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4", "path4", 7), (0, "pk2", "path2", 8), (0, "pk3", "path3", 9)
-            """
-        ],
-    )

-    tbl = session_catalog.load_table(identifier)
-    # 3. Use convert.remote() function to compute position deletes
-    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+def create_mock_data_tables(test_case: Dict[str, Any]) -> Tuple[daft.DataFrame, ...]:
+    """Helper function to create mock data tables based on test case"""
+    tables = []
+    for data in test_case["mock_data"]:
+        if "primary_key2" in data:  # Multi-key case
+            names = ["primary_key1", "primary_key2"]
+            table = pa.Table.from_arrays(
+                [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                names=names,
+            )
+        else:  # Single key case
+            names = ["primary_key"]
+            table = pa.Table.from_arrays([pa.array(data["primary_key"])], names=names)
+        tables.append(daft.from_arrow(table))
+    if "equality_delete_data_mock" in test_case:
+        for data in test_case["equality_delete_data_mock"]:
+            if "primary_key2" in data:  # Multi-key case
+                names = ["primary_key1", "primary_key2"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                    names=names,
+                )
+            else:  # Single key case
+                names = ["primary_key"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key"])], names=names
+                )
+            tables.append(daft.from_arrow(table))
+    return tuple(tables)

-    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
-        data_file_dict=data_file_dict,
-        equality_delete_dict=equality_delete_dict,
-        pos_delete_dict=pos_delete_dict,
-    )

-    s3_file_system = get_s3_file_system()
+def run_spark_commands(spark, sqls: List[str]) -> None:
+    """Helper function to run Spark SQL commands"""
+    for sql in sqls:
+        spark.sql(sql)
+
+
+def insert_test_data(spark, identifier: str, test_case: Dict[str, Any]) -> None:
+    """Helper function to insert test data into the table"""
+    if "primary_key2" in test_case["mock_data"][0]:
+        # Multi-key case
+        for data in test_case["mock_data"]:
+            values = ", ".join(
+                f"(0, '{pk1}', {pk2})"
+                for pk1, pk2 in zip(data["primary_key1"], data["primary_key2"])
+            )
+            run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+    else:
+        # Single key case
+        if test_case["schema"] == "base_schema":
+            # For drop duplicates test, use file_path and pos from mock_data
+            for data in test_case["mock_data"]:
+                values = ", ".join(
+                    f"(0, '{pk}', '{path}', {pos})"
+                    for pk, path, pos in zip(
+                        data["primary_key"], data["file_path"], data["pos"]
+                    )
+                )
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+        else:
+            # For other tests, just include the basic columns
+            for data in test_case["mock_data"]:
+                values = ", ".join(f"(0, '{pk}')" for pk in data["primary_key"])
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+
+
+def create_convert_input(
+    tbl,
+    convert_input_files_for_all_buckets: List[Any],
+    test_case: Dict[str, Any],
+    s3_file_system: Any,
+) -> List[ConvertInput]:
+    """Helper function to create convert inputs"""
+    convert_inputs = []
     for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
         convert_input = ConvertInput.of(
             convert_input_files=one_bucket_files,
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
-            identifier_fields=["primary_key"],
+            identifier_fields=test_case["identifier_fields"],
             table_io=tbl.io,
             table_metadata=tbl.metadata,
             compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
-            s3_file_system=s3_file_system,
+            filesystem=s3_file_system,
             s3_client_kwargs={},
+            task_memory=TASK_MEMORY_BYTES,
         )
+        convert_inputs.append(convert_input)
+    return convert_inputs

-    number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_1 = pa.Table.from_arrays(
-        [number_partitioned_array_1, primary_key_array_1], names=names
-    )
-
-    number_partitioned_array_2 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_2 = pa.Table.from_arrays(
-        [number_partitioned_array_2, primary_key_array_2], names=names
-    )
-
-    number_partitioned_array_3 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_3 = pa.Table.from_arrays(
-        [number_partitioned_array_3, primary_key_array_3], names=names
-    )
-
-    daft_df_1 = daft.from_arrow(data_table_1)
-    daft_df_2 = daft.from_arrow(data_table_2)
-    daft_df_3 = daft.from_arrow(data_table_3)

-    download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.daft_read_parquet"
-    )
-    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+def process_convert_result(convert_result: Any) -> Tuple[List[Any], List[Any]]:
+    """Helper function to process convert results

-    convert_ref = convert.remote(convert_input)
+    Args:
+        convert_result: The result from convert_session

+    Returns:
+        Tuple[List[Any], List[Any]]: Lists of files to be deleted and added
+    """
     to_be_deleted_files_list = []
-
-    convert_result = ray.get(convert_ref)
-
     to_be_added_files_list = []
-    # Check if there're files to delete
     if convert_result.to_be_deleted_files:
         to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
     if convert_result.to_be_added_files:
         to_be_added_files_list.extend(convert_result.to_be_added_files)
+    return to_be_deleted_files_list, to_be_added_files_list

-    commit_append_snapshot(
-        iceberg_table=tbl,
-        new_position_delete_files=to_be_added_files_list,
-    )
-    tbl.refresh()

-    # 5. Only primary key 2 and 3 should exist in table, as primary key 1 is deleted.
-    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+def verify_result(result, expected_result, verify_pos_index=False):
+    """Verify the result matches the expected result.
+
+    Args:
+        result: The result to verify
+        expected_result: The expected result
+        verify_pos_index: Whether to verify position values for primary keys
+    """
+    if "primary_keys" in expected_result and "primary_key" in result:
+        # Single key case
+        assert set(result["primary_key"]) == set(expected_result["primary_keys"])
+        if verify_pos_index and "pk_to_pos" in expected_result:
+            for index in range(len(result["primary_key"])):
+                assert (
+                    result["pos"][index]
+                    == expected_result["pk_to_pos"][result["primary_key"][index]]
+                )
+    elif "pk_tuples" in expected_result:
+        pk_combined_res = []
+        for pk1, pk2 in zip(
+            result["primary_key1"],
+            result["primary_key2"],
+        ):
+            pk_combined_res.append((pk1, pk2))
+
+        # Multi-key case
+        assert set(pk_combined_res) == set(expected_result["pk_tuples"])
+    else:
+        assert set(result) == set(expected_result["primary_keys"])
+
+
+def verify_spark_read_results(spark, identifier, expected_result):
+    spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+    all_pk = [
+        spark_read_pos_delete[row_idx][1]
+        for row_idx in range(len(spark_read_pos_delete))
+    ]
+    verify_result(all_pk, expected_result, verify_pos_index=False)
+
+
+def get_file_prefix(tbl):
+    """Get the file prefix from a table's data files.

-    # Only one unique record for each pk exists
-    all_pk = sorted(pyiceberg_scan_table_rows["primary_key"])
-    assert all_pk == ["pk1", "pk2", "pk3", "pk4"]
+    Args:
+        tbl: The table to get the file prefix from

-    # Expected unique record to keep for each pk
-    expected_pk_to_pos_mapping = {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7}
-    for pk, pos in zip(
-        pyiceberg_scan_table_rows["primary_key"], pyiceberg_scan_table_rows["pos"]
-    ):
-        assert pos == expected_pk_to_pos_mapping[pk]
+    Returns:
+        str: The file prefix
+    """
+    df = tbl.inspect.entries()
+    data_files = df.to_pydict()["data_file"]
+    file_link = data_files[0]["file_path"]
+    file_prefix = "/".join(file_link.split("/")[:-1])
+    return file_prefix.split("//")[1]
+
+
+# Test cases configuration
+TEST_CASES = [
+    {
+        "name": "single_key_drop_duplicates",
+        "table_name": "table_converter_ray_drop_duplicates_success",
+        "schema": "base_schema",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [1, 2, 3],
+            },
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [4, 5, 6],
+            },
+            {
+                "primary_key": ["pk4", "pk2", "pk3"],
+                "file_path": ["path4", "path2", "path3"],
+                "pos": [7, 8, 9],
+            },
+        ],
+        "expected_result": {
+            "primary_keys": ["pk1", "pk2", "pk3", "pk4"],
+            "pk_to_pos": {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7},
+        },
+    },
+    {
+        "name": "multi_key_drop_duplicates",
+        "table_name": "table_converter_ray_pos_delete_multiple_identifier_fields",
+        "schema": "multi_key_schema_without_file_path",
+        "identifier_fields": ["primary_key1", "primary_key2"],
+        "mock_data": [
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk4", "pk2", "pk3"], "primary_key2": [1, 3, 4]},
+        ],
+        "expected_result": {
+            "pk_tuples": [
+                ("pk1", 1),
+                ("pk2", 2),
+                ("pk2", 3),
+                ("pk3", 3),
+                ("pk3", 4),
+                ("pk4", 1),
+            ]
+        },
+    },
+    {
+        "name": "equality_delete",
+        "table_name": "table_converter_ray_equality_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "equality_delete_data_mock": [{"primary_key": ["pk1"]}],
+        "equality_delete_data": pa.Table.from_arrays(["pk1"], names=["primary_key"]),
+        "verify_spark_read": True,
+        "expected_result": {"primary_keys": ["pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete",
+        "table_name": "table_converter_ray_position_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete_read_by_spark",
+        "table_name": "table_converter_ray_pos_delete_read_by_spark_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+        "verify_spark_read": True,
+        "expected_spark_count": 4,
+    },
+]


+@pytest.mark.parametrize("test_case", TEST_CASES)
 @pytest.mark.integration
-def test_converter_pos_delete_read_by_spark_success(
-    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+def test_converter(
+    test_case: Dict[str, Any],
+    spark,
+    session_catalog: RestCatalog,
+    setup_ray_cluster,
+    mocker,
+    request,
 ) -> None:
     """
-    Test for convert compute remote function happy case. Download file results are mocked.
+    Parameterized test for converter functionality.
+    Tests drop duplicates, equality delete, and position delete scenarios.
     """
-
-    # 1. Create Iceberg table
-    namespace = "default"
-    table_name = "table_converter_ray_pos_delete_read_by_spark_success"
-    identifier = f"{namespace}.{table_name}"
-
-    schema = Schema(
-        NestedField(
-            field_id=1, name="number_partitioned", field_type=LongType(), required=False
-        ),
-        NestedField(
-            field_id=2, name="primary_key", field_type=StringType(), required=False
-        ),
-        schema_id=0,
-    )
-
-    partition_field_identity = PartitionField(
-        source_id=1,
-        field_id=101,
-        transform=IdentityTransform(),
-        name="number_partitioned",
-    )
-    partition_spec = PartitionSpec(partition_field_identity)
-
-    properties = dict()
-    properties["write.format.default"] = "parquet"
-    properties["write.delete.mode"] = "merge-on-read"
-    properties["write.update.mode"] = "merge-on-read"
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"
-
-    drop_table_if_exists(identifier, session_catalog)
-    session_catalog.create_table(
-        identifier,
+    # Get schema fixture based on test case
+    schema = request.getfixturevalue(test_case["schema"])
+
+    # Create test table
+    identifier = create_test_table(
+        session_catalog=session_catalog,
+        namespace="default",
+        table_name=test_case["table_name"],
         schema=schema,
-        partition_spec=partition_spec,
-        properties=properties,
+        partition_spec=request.getfixturevalue("base_partition_spec"),
+        properties=request.getfixturevalue("table_properties"),
     )

-    # 2. Use Spark to generate initial data files
-    tbl = session_catalog.load_table(identifier)
-
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    tbl.refresh()
+    # Insert test data
+    insert_test_data(spark, identifier, test_case)

-    # 3. Use convert.remote() function to compute position deletes
+    # Get files and create convert input
+    tbl = session_catalog.load_table(identifier)
     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)

+    # Handle equality delete if present
+    if "equality_delete_data" in test_case:
+        tbl = session_catalog.load_table(identifier)
+        file_prefix = get_file_prefix(tbl)
+        partition_value = Record(number_partitioned=0)
+
+        # Note: Just upload to S3 to mock input data here.
+        # NOT committing to Iceberg metadata as equality delete write path not implemented in Pyiceberg/Spark.
+        equality_file_list = commit_equality_delete_to_table(
+            table=tbl,
+            partition_value=partition_value,
+            equality_delete_table=test_case["equality_delete_data"],
+            file_link_prefix=file_prefix,
+        )
+        # Mock equality delete input to converter with latest file sequence, so equality delete can be applied to all data before
+        equality_delete_dict = defaultdict()
+        equality_delete_dict[partition_value] = [(4, equality_file_list[0])]
+
     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
         data_file_dict=data_file_dict,
         equality_delete_dict=equality_delete_dict,
  equality_delete_dict=equality_delete_dict,
@@ -402,240 +496,331 @@ def test_converter_pos_delete_read_by_spark_success(
402
496
  )
403
497
 
404
498
  s3_file_system = get_s3_file_system()
405
- for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
406
- convert_input = ConvertInput.of(
407
- convert_input_files=one_bucket_files,
408
- convert_task_index=i,
409
- iceberg_table_warehouse_prefix="warehouse/default",
410
- identifier_fields=["primary_key"],
411
- table_io=tbl.io,
412
- table_metadata=tbl.metadata,
413
- compact_previous_position_delete_files=False,
414
- enforce_primary_key_uniqueness=True,
415
- position_delete_for_multiple_data_files=True,
416
- max_parallel_data_file_download=10,
417
- s3_file_system=s3_file_system,
418
- s3_client_kwargs={},
419
- )
420
-
421
- primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
422
- names = ["primary_key"]
423
- data_table_1 = pa.Table.from_arrays([primary_key_array_1], names=names)
424
-
425
- primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
426
- names = ["primary_key"]
427
- data_table_2 = pa.Table.from_arrays([primary_key_array_2], names=names)
428
-
429
- primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
430
- names = ["primary_key"]
431
- data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)
432
-
433
- daft_df_1 = daft.from_arrow(data_table_1)
434
- daft_df_2 = daft.from_arrow(data_table_2)
435
- daft_df_3 = daft.from_arrow(data_table_3)
499
+ convert_inputs = create_convert_input(
500
+ tbl, convert_input_files_for_all_buckets, test_case, s3_file_system
501
+ )
436
502
 
503
+ # Create and set up mock data
504
+ mock_data_tables = create_mock_data_tables(test_case)
437
505
  download_data_mock = mocker.patch(
438
506
  "deltacat.compute.converter.utils.io.daft_read_parquet"
439
507
  )
440
- download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
441
508
 
442
- convert_ref = convert.remote(convert_input)
509
+ download_data_mock.side_effect = mock_data_tables
443
510
 
444
- to_be_deleted_files_list = []
445
- to_be_added_files_list = []
511
+ # Run conversion
512
+ convert_ref = convert.remote(convert_inputs[0])
446
513
  convert_result = ray.get(convert_ref)
447
514
 
448
- if convert_result.to_be_deleted_files:
449
- to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
450
- if convert_result.to_be_added_files:
451
- to_be_added_files_list.extend(convert_result.to_be_added_files)
452
-
453
- # 4. Commit position delete, delete equality deletes from table
454
- commit_append_snapshot(
455
- iceberg_table=tbl,
456
- new_position_delete_files=to_be_added_files_list,
515
+ # Process results
516
+ to_be_deleted_files_list, to_be_added_files_list = process_convert_result(
517
+ convert_result
457
518
  )
519
+
520
+ if not to_be_deleted_files_list:
521
+ # Commit changes
522
+ commit_append_snapshot(
523
+ iceberg_table=tbl,
524
+ new_position_delete_files=to_be_added_files_list,
525
+ )
526
+ else:
527
+ commit_replace_snapshot(
528
+ iceberg_table=tbl,
529
+ to_be_deleted_files=to_be_deleted_files_list[0],
530
+ new_position_delete_files=to_be_added_files_list,
531
+ )
458
532
  tbl.refresh()
459
533
 
460
- # 5. Result assertion: Spark read table contains unique primary key
461
- spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
462
- all_pk = [
463
- spark_read_pos_delete[row_idx][1]
464
- for row_idx in range(len(spark_read_pos_delete))
465
- ]
466
- all_pk_sorted = sorted(all_pk)
467
- assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
534
+ # Verify results
535
+ pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
468
536
 
537
+ # Verify Spark read if required
538
+ if test_case.get("verify_spark_read", False):
539
+ verify_spark_read_results(spark, identifier, test_case["expected_result"])
540
+ else:
541
+ verify_result(
542
+ pyiceberg_scan_table_rows,
543
+ test_case["expected_result"],
544
+ verify_pos_index=test_case.get("verify_pos_index", False),
545
+ )
469
546
 
470
- @pytest.mark.integration
471
- def test_converter_pos_delete_multiple_identifier_fields_success(
472
- spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
547
+
548
+ def test_converter_session_with_local_filesystem_and_duplicate_ids(
549
+ setup_ray_cluster,
473
550
  ) -> None:
474
551
  """
475
- Test for convert compute remote function happy case. Download file results are mocked.
552
+ Test converter_session functionality with local PyArrow filesystem using duplicate IDs.
553
+ This test simulates the pattern where duplicate IDs represent updates to existing records.
554
+ The converter should merge these updates by creating position delete files.
476
555
  """
556
+ with temp_dir_autocleanup() as temp_catalog_dir:
557
+ # Create warehouse directory
558
+ warehouse_path = os.path.join(temp_catalog_dir, "iceberg_warehouse")
559
+ os.makedirs(warehouse_path, exist_ok=True)
560
+
561
+ # Set up local in-memory catalog
562
+ local_catalog = load_catalog(
563
+ "local_sql_catalog",
564
+ **{
565
+ "type": "in-memory",
566
+ "warehouse": warehouse_path,
567
+ },
568
+ )
477
569
 
478
- # 1. Create Iceberg table
479
- namespace = "default"
480
- table_name = "table_converter_ray_pos_delete_multiple_identifier_fields"
481
-
482
- identifier = f"{namespace}.{table_name}"
483
-
484
- schema = Schema(
485
- NestedField(
486
- field_id=1, name="number_partitioned", field_type=LongType(), required=False
487
- ),
488
- NestedField(
489
- field_id=2, name="primary_key1", field_type=StringType(), required=False
490
- ),
491
- NestedField(
492
- field_id=3, name="primary_key2", field_type=LongType(), required=False
493
- ),
494
- schema_id=0,
495
- )
496
-
497
- partition_field_identity = PartitionField(
498
- source_id=1,
499
- field_id=101,
500
- transform=IdentityTransform(),
501
- name="number_partitioned",
502
- )
503
- partition_spec = PartitionSpec(partition_field_identity)
504
-
505
- properties = dict()
506
- properties["write.format.default"] = "parquet"
507
- properties["write.delete.mode"] = "merge-on-read"
508
- properties["write.update.mode"] = "merge-on-read"
509
- properties["write.merge.mode"] = "merge-on-read"
510
- properties["format-version"] = "2"
511
-
512
- drop_table_if_exists(identifier, session_catalog)
513
- session_catalog.create_table(
514
- identifier,
515
- schema=schema,
516
- partition_spec=partition_spec,
517
- properties=properties,
518
- )
519
-
520
- # 2. Use Spark to generate initial data files
521
- tbl = session_catalog.load_table(identifier)
522
-
523
- run_spark_commands(
524
- spark,
525
- [
526
- f"""
527
- INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
528
- """
529
- ],
530
- )
531
- run_spark_commands(
532
- spark,
533
- [
534
- f"""
535
- INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
536
- """
537
- ],
538
- )
539
- run_spark_commands(
540
- spark,
541
- [
542
- f"""
543
- INSERT INTO {identifier} VALUES (0, "pk4", 1), (0, "pk2", 3), (0, "pk3", 4)
544
- """
545
- ],
546
- )
547
- tbl.refresh()
548
-
549
- # 3. Use convert.remote() function to compute position deletes
550
- data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
551
-
552
- convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
553
- data_file_dict=data_file_dict,
554
- equality_delete_dict=equality_delete_dict,
555
- pos_delete_dict=pos_delete_dict,
556
- )
557
-
558
- s3_file_system = get_s3_file_system()
559
- for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
560
- convert_input = ConvertInput.of(
561
- convert_input_files=one_bucket_files,
562
- convert_task_index=i,
563
- iceberg_table_warehouse_prefix="warehouse/default",
564
- identifier_fields=["primary_key1", "primary_key2"],
565
- table_io=tbl.io,
566
- table_metadata=tbl.metadata,
567
- compact_previous_position_delete_files=False,
568
- enforce_primary_key_uniqueness=True,
569
- position_delete_for_multiple_data_files=True,
570
- max_parallel_data_file_download=10,
571
- s3_file_system=s3_file_system,
572
- s3_client_kwargs={},
570
+ # Create local PyArrow filesystem
571
+ import pyarrow.fs as pafs
572
+
573
+ local_filesystem = pafs.LocalFileSystem()
574
+
575
+ # Define schema (id, name, value, version)
576
+ schema = Schema(
577
+ NestedField(field_id=1, name="id", field_type=LongType(), required=True),
578
+ NestedField(
579
+ field_id=2, name="name", field_type=StringType(), required=False
580
+ ),
581
+ NestedField(
582
+ field_id=3, name="value", field_type=LongType(), required=False
583
+ ),
584
+ NestedField(
585
+ field_id=4, name="version", field_type=LongType(), required=False
586
+ ),
587
+ schema_id=0,
573
588
  )
574
589
 
575
- names = ["primary_key1", "primary_key2"]
590
+ # Create table properties for merge-on-read
591
+ properties = {
592
+ "write.format.default": "parquet",
593
+ "write.delete.mode": "merge-on-read",
594
+ "write.update.mode": "merge-on-read",
595
+ "write.merge.mode": "merge-on-read",
596
+ "format-version": "2",
597
+ }
598
+
599
+ # Create the table
600
+ table_identifier = "default.test_duplicate_ids"
601
+ try:
602
+ local_catalog.create_namespace("default")
603
+ except NamespaceAlreadyExistsError:
604
+ pass # Namespace may already exist
605
+ try:
606
+ local_catalog.drop_table(table_identifier)
607
+ except NoSuchTableError:
608
+ pass # Table may not exist
609
+
610
+ local_catalog.create_table(
611
+ table_identifier,
612
+ schema=schema,
613
+ properties=properties,
614
+ )
615
+ tbl = local_catalog.load_table(table_identifier)
576
616
 
577
- primary_key1_array_1 = pa.array(["pk1", "pk2", "pk3"])
578
- primary_key2_array_1 = pa.array([1, 2, 3])
579
- data_table_1 = pa.Table.from_arrays(
580
- [primary_key1_array_1, primary_key2_array_1], names=names
581
- )
617
+ # Set the name mapping property so Iceberg can read parquet files without field IDs
618
+ with tbl.transaction() as tx:
619
+ tx.set_properties(
620
+ **{"schema.name-mapping.default": schema.name_mapping.model_dump_json()}
621
+ )
582
 
- primary_key1_array_2 = pa.array(["pk1", "pk2", "pk3"])
- primary_key2_array_2 = pa.array([1, 2, 3])
- data_table_2 = pa.Table.from_arrays(
- [primary_key1_array_2, primary_key2_array_2], names=names
- )
+ # Step 1: Write initial data
+ # Create PyArrow table with explicit schema to match Iceberg schema
+ arrow_schema = schema_to_pyarrow(schema)
+
+ initial_data = pa.table(
+ {
+ "id": [1, 2, 3, 4],
+ "name": ["Alice", "Bob", "Charlie", "David"],
+ "value": [100, 200, 300, 400],
+ "version": [1, 1, 1, 1],
+ },
+ schema=arrow_schema,
+ )
 
- primary_key1_array_3 = pa.array(["pk4", "pk2", "pk3"])
- primary_key2_array_3 = pa.array([1, 3, 4])
- data_table_3 = pa.Table.from_arrays(
- [primary_key1_array_3, primary_key2_array_3], names=names
- )
+ # Step 2: Write additional data
+ additional_data = pa.table(
+ {
+ "id": [5, 6, 7, 8],
+ "name": ["Eve", "Frank", "Grace", "Henry"],
+ "value": [500, 600, 700, 800],
+ "version": [1, 1, 1, 1],
+ },
+ schema=arrow_schema,
+ )
 
- daft_df_1 = daft.from_arrow(data_table_1)
- daft_df_2 = daft.from_arrow(data_table_2)
- daft_df_3 = daft.from_arrow(data_table_3)
+ # Step 3: Write updates to existing records (this creates duplicates by ID)
+ # These should overwrite the original records with same IDs
+ updated_data = pa.table(
+ {
+ "id": [2, 3, 9], # IDs 2 and 3 are duplicates, 9 is new
+ "name": [
+ "Robert",
+ "Charles",
+ "Ivan",
+ ], # Updated names for Bob and Charlie
+ "value": [201, 301, 900], # Updated values
+ "version": [2, 2, 1], # Higher version numbers for updates
+ },
+ schema=arrow_schema,
+ )
 
- download_data_mock = mocker.patch(
- "deltacat.compute.converter.utils.io.daft_read_parquet"
- )
- download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+ # Write all data to separate parquet files to simulate multiple writes
+ data_files_to_commit = []
 
- convert_ref = convert.remote(convert_input)
+ for i, data in enumerate([initial_data, additional_data, updated_data]):
+ data_file_path = os.path.join(warehouse_path, f"data_{i}.parquet")
+ pq.write_table(data, data_file_path)
 
- to_be_deleted_files_list = []
- to_be_added_files_list = []
- convert_result = ray.get(convert_ref)
+ # Create DataFile objects for Iceberg
+ parquet_metadata = pq.read_metadata(data_file_path)
+ file_size = os.path.getsize(data_file_path)
 
- if convert_result.to_be_deleted_files:
- to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
- if convert_result.to_be_added_files:
- to_be_added_files_list.extend(convert_result.to_be_added_files)
+ # Check schema compatibility
+ _check_pyarrow_schema_compatible(
+ schema, parquet_metadata.schema.to_arrow_schema()
+ )
 
- # 4. Commit position delete, delete equality deletes from table
+ # Calculate statistics
+ statistics = data_file_statistics_from_parquet_metadata(
+ parquet_metadata=parquet_metadata,
+ stats_columns=compute_statistics_plan(schema, tbl.metadata.properties),
+ parquet_column_mapping=parquet_path_to_id_mapping(schema),
+ )
 
- commit_append_snapshot(
- iceberg_table=tbl,
- new_position_delete_files=to_be_added_files_list,
- )
- tbl.refresh()
+ data_file = DataFile(
+ content=DataFileContent.DATA,
+ file_path=data_file_path,
+ file_format=FileFormat.PARQUET,
+ partition={}, # No partitioning
+ file_size_in_bytes=file_size,
+ sort_order_id=None,
+ spec_id=tbl.metadata.default_spec_id,
+ key_metadata=None,
+ equality_ids=None,
+ **statistics.to_serialized_dict(),
+ )
+ data_files_to_commit.append(data_file)
+
+ # Commit all data files to the table
+ with tbl.transaction() as tx:
+ with tx.update_snapshot().fast_append() as update_snapshot:
+ for data_file in data_files_to_commit:
+ update_snapshot.append_data_file(data_file)
+
+ tbl.refresh()
+
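As an aside, the manual DataFile construction and fast_append commit above give the test full control over the written files and their statistics; for simpler cases, recent pyiceberg releases can append a PyArrow table directly and write the parquet files themselves. A minimal sketch of that alternative, assuming pyiceberg >= 0.6 and the same `tbl` and in-memory tables from this hunk:

# Hypothetical simpler write path: let pyiceberg write and commit each batch.
tbl.append(initial_data)
tbl.append(additional_data)
tbl.append(updated_data)  # still leaves duplicate ids 2 and 3 until the converter runs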
+ # Verify we have duplicate IDs before conversion
+ initial_scan = tbl.scan().to_arrow().to_pydict()
+ print(f"Before conversion - Records with IDs: {sorted(initial_scan['id'])}")
+
+ # There should be duplicates: [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+ expected_duplicate_ids = [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+ assert (
+ sorted(initial_scan["id"]) == expected_duplicate_ids
+ ), f"Expected duplicate IDs {expected_duplicate_ids}, got {sorted(initial_scan['id'])}"
+
+ # Now call converter_session to resolve the duplicate merge keys by writing position deletes
+ converter_params = ConverterSessionParams.of(
+ {
+ "catalog": local_catalog,
+ "iceberg_table_name": table_identifier,
+ "iceberg_warehouse_bucket_name": warehouse_path, # Local warehouse path
+ "merge_keys": ["id"], # Use ID as the merge key
+ "enforce_primary_key_uniqueness": True,
+ "task_max_parallelism": 1, # Single task for local testing
+ "filesystem": local_filesystem,
+ "location_provider_prefix_override": None, # Let the system auto-generate the prefix on the local filesystem
+ }
+ )
 
- # 5. Result assertion: Expected unique primary keys to be kept
- pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
- expected_result_tuple_list = [
- ("pk1", 1),
- ("pk2", 2),
- ("pk2", 3),
- ("pk3", 3),
- ("pk3", 4),
- ("pk4", 1),
- ]
- pk_combined_res = []
- for pk1, pk2 in zip(
- pyiceberg_scan_table_rows["primary_key1"],
- pyiceberg_scan_table_rows["primary_key2"],
- ):
- pk_combined_res.append((pk1, pk2))
-
- # Assert elements are same disregard ordering in list
- assert sorted(pk_combined_res) == sorted(expected_result_tuple_list)
+ print(f"Running converter_session with local filesystem...")
735
+ print(f"Warehouse path: {warehouse_path}")
736
+ print(f"Merge keys: ['id']")
737
+ print(f"Enforce uniqueness: True")
738
+
739
+ # Run the converter
740
+ converter_session(params=converter_params)
741
+
742
+ # Refresh table and scan again
743
+ tbl.refresh()
744
+ final_scan = tbl.scan().to_arrow().to_pydict()
745
+
746
+ print(f"After conversion - Records with IDs: {sorted(final_scan['id'])}")
747
+ print(f"Final data: {final_scan}")
748
+
749
+ # Verify position delete files were created by checking table metadata
750
+ latest_snapshot = tbl.metadata.current_snapshot()
751
+ if latest_snapshot:
752
+ manifests = latest_snapshot.manifests(tbl.io)
753
+ position_delete_files = []
754
+
755
+ for manifest in manifests:
756
+ entries = manifest.fetch_manifest_entry(tbl.io)
757
+ for entry in entries:
758
+ if entry.data_file.content == DataFileContent.POSITION_DELETES:
759
+ position_delete_files.append(entry.data_file.file_path)
760
+
761
+ print(f"Position delete files found: {position_delete_files}")
762
+ assert (
763
+ len(position_delete_files) > 0
764
+ ), "No position delete files were created by converter_session"
765
+
+ # Verify the final result has unique IDs (duplicates should be resolved)
+ # Expected: Latest values for each ID based on the updates
+ expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9] # All unique IDs
+ actual_ids = sorted(final_scan["id"])
+
+ print(f"Expected unique IDs: {expected_unique_ids}")
+ print(f"Actual IDs after conversion: {actual_ids}")
+
+ assert (
+ actual_ids == expected_unique_ids
+ ), f"Expected unique IDs {expected_unique_ids}, got {actual_ids}"
+
+ # Verify the updated values are present (higher version should win)
+ final_data_by_id = {}
+ for i, id_val in enumerate(final_scan["id"]):
+ final_data_by_id[id_val] = {
+ "name": final_scan["name"][i],
+ "value": final_scan["value"][i],
+ "version": final_scan["version"][i],
+ }
+
+ # Check that ID 2 has updated value (Robert, 201, version 2)
+ assert (
+ final_data_by_id[2]["name"] == "Robert"
+ ), f"ID 2 should have updated name 'Robert', got '{final_data_by_id[2]['name']}'"
+ assert (
+ final_data_by_id[2]["value"] == 201
+ ), f"ID 2 should have updated value 201, got {final_data_by_id[2]['value']}"
+ assert (
+ final_data_by_id[2]["version"] == 2
+ ), f"ID 2 should have version 2, got {final_data_by_id[2]['version']}"
+
+ # Check that ID 3 has updated value (Charles, 301, version 2)
+ assert (
+ final_data_by_id[3]["name"] == "Charles"
+ ), f"ID 3 should have updated name 'Charles', got '{final_data_by_id[3]['name']}'"
+ assert (
+ final_data_by_id[3]["value"] == 301
+ ), f"ID 3 should have updated value 301, got {final_data_by_id[3]['value']}"
+ assert (
+ final_data_by_id[3]["version"] == 2
+ ), f"ID 3 should have version 2, got {final_data_by_id[3]['version']}"
+
+ # Check that new ID 9 is present
+ assert (
+ final_data_by_id[9]["name"] == "Ivan"
+ ), f"ID 9 should have name 'Ivan', got '{final_data_by_id[9]['name']}'"
+ assert (
+ final_data_by_id[9]["value"] == 900
+ ), f"ID 9 should have value 900, got {final_data_by_id[9]['value']}"
+
+ print(f"✅ Test completed successfully!")
+ print(
+ f"✅ Position delete files were created: {len(position_delete_files)} files"
+ )
+ print(f"✅ Duplicate IDs were resolved correctly")
+ print(
+ f"✅ Updated values were applied (ID 2: Bob->Robert, ID 3: Charlie->Charles)"
+ )
+ print(f"✅ Final table has {len(actual_ids)} unique records")
+ print(f"✅ Temporary warehouse cleaned up at: {temp_catalog_dir}")