deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,13 @@
1
- from typing import Optional, List
1
+ from typing import List, Dict
2
+ from collections import defaultdict
2
3
  import uuid
4
+ from pyiceberg.table import Table
5
+ from pyiceberg.table.metadata import TableMetadata
3
6
  from pyiceberg.table.snapshots import (
4
7
  Operation,
5
8
  )
6
9
  from pyiceberg.manifest import (
10
+ DataFile,
7
11
  DataFileContent,
8
12
  ManifestContent,
9
13
  ManifestEntry,
@@ -13,71 +17,116 @@ from pyiceberg.manifest import (
13
17
  )
14
18
  import itertools
15
19
  from pyiceberg.utils.concurrent import ExecutorFactory
16
- from pyiceberg.table.update.snapshot import UpdateSnapshot, _SnapshotProducer
20
+ from pyiceberg.table.update.snapshot import _SnapshotProducer, UpdateSnapshot
17
21
 
18
22
 
19
- class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
20
- """Overwrites data from the table. This will produce an OVERWRITE snapshot.
23
+ def replace_delete_files_override(
24
+ update_snapshot: UpdateSnapshot,
25
+ ) -> "_ReplaceDeleteFilesOverride":
26
+ commit_uuid = uuid.uuid4()
27
+ return _ReplaceDeleteFilesOverride(
28
+ commit_uuid=commit_uuid,
29
+ operation=Operation.OVERWRITE,
30
+ transaction=update_snapshot._transaction,
31
+ io=update_snapshot._io,
32
+ snapshot_properties=update_snapshot._snapshot_properties,
33
+ )
34
+
35
+
36
+ class _ReplaceDeleteFilesOverride(_SnapshotProducer):
37
+ def _manifests(self) -> List[ManifestFile]:
38
+ def _write_added_manifest() -> List[ManifestFile]:
39
+ if self._added_data_files:
40
+ with write_manifest(
41
+ format_version=self._transaction.table_metadata.format_version,
42
+ spec=self._transaction.table_metadata.spec(),
43
+ schema=self._transaction.table_metadata.schema(),
44
+ output_file=self.new_manifest_output(),
45
+ snapshot_id=self._snapshot_id,
46
+ ) as writer:
47
+ for data_file in self._added_data_files:
48
+ writer.add(
49
+ ManifestEntry(
50
+ status=ManifestEntryStatus.ADDED,
51
+ snapshot_id=self._snapshot_id,
52
+ sequence_number=None,
53
+ file_sequence_number=None,
54
+ data_file=data_file,
55
+ )
56
+ )
57
+ writer.content = self.writer_content
58
+ return [writer.to_manifest_file()]
59
+ else:
60
+ return []
61
+
62
+ def _write_delete_manifest() -> List[ManifestFile]:
63
+ # Check if we need to mark the files as deleted
64
+ deleted_entries = self._deleted_entries()
65
+ if len(deleted_entries) > 0:
66
+ deleted_manifests = []
67
+ partition_groups: Dict[int, List[ManifestEntry]] = defaultdict(list)
68
+ for deleted_entry in deleted_entries:
69
+ partition_groups[deleted_entry.data_file.spec_id].append(
70
+ deleted_entry
71
+ )
72
+ for spec_id, entries in partition_groups.items():
73
+ with write_manifest(
74
+ format_version=self._transaction.table_metadata.format_version,
75
+ spec=self._transaction.table_metadata.specs()[spec_id],
76
+ schema=self._transaction.table_metadata.schema(),
77
+ output_file=self.new_manifest_output(),
78
+ snapshot_id=self._snapshot_id,
79
+ ) as writer:
80
+ for entry in entries:
81
+ writer.add_entry(entry)
82
+ deleted_manifests.append(writer.to_manifest_file())
83
+ return deleted_manifests
84
+ else:
85
+ return []
21
86
 
22
- Data and delete files were added and removed in a logical overwrite operation.
23
- """
87
+ executor = ExecutorFactory.get_or_create()
88
+
89
+ added_manifests = executor.submit(_write_added_manifest)
90
+ existing_manifests = executor.submit(self._existing_manifests)
91
+ delete_manifests = executor.submit(_write_delete_manifest)
92
+ return self._process_manifests(
93
+ added_manifests.result()
94
+ + existing_manifests.result()
95
+ + delete_manifests.result()
96
+ )
97
+
98
+ def writer_content(self) -> ManifestContent:
99
+ return ManifestContent.DELETES
24
100
 
25
101
  def _existing_manifests(self) -> List[ManifestFile]:
26
- """Determine if there are any existing manifest files."""
27
- existing_files = []
28
- snapshot = self._transaction.table_metadata.current_snapshot()
29
- if snapshot:
30
- for manifest_file in snapshot.manifests(io=self._io):
31
- entries = manifest_file.fetch_manifest_entry(
32
- io=self._io, discard_deleted=True
102
+ """To determine if there are any existing manifest files.
103
+
104
+ A fast append will add another ManifestFile to the ManifestList.
105
+ All the existing manifest files are considered existing.
106
+ """
107
+ existing_manifests = []
108
+
109
+ if self._parent_snapshot_id is not None:
110
+ previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
111
+ self._parent_snapshot_id
112
+ )
113
+
114
+ if previous_snapshot is None:
115
+ raise ValueError(
116
+ f"Snapshot could not be found: {self._parent_snapshot_id}"
33
117
  )
34
118
 
35
- found_deleted_data_files = [
36
- entry.data_file
37
- for entry in entries
38
- if entry.data_file in self._deleted_data_files
39
- ]
119
+ for manifest in previous_snapshot.manifests(io=self._io):
120
+ if (
121
+ manifest.has_added_files()
122
+ or manifest.has_existing_files()
123
+ or manifest.added_snapshot_id == self._snapshot_id
124
+ ):
125
+ existing_manifests.append(manifest)
40
126
 
41
- if len(found_deleted_data_files) == 0:
42
- existing_files.append(manifest_file)
43
- else:
44
- # We have to replace the manifest file without the deleted data files
45
- if any(
46
- entry.data_file not in found_deleted_data_files
47
- for entry in entries
48
- ):
49
- with write_manifest(
50
- format_version=self._transaction.table_metadata.format_version,
51
- spec=self._transaction.table_metadata.specs()[
52
- manifest_file.partition_spec_id
53
- ],
54
- schema=self._transaction.table_metadata.schema(),
55
- output_file=self.new_manifest_output(),
56
- snapshot_id=self._snapshot_id,
57
- ) as writer:
58
- [
59
- writer.add_entry(
60
- ManifestEntry(
61
- status=ManifestEntryStatus.EXISTING,
62
- snapshot_id=entry.snapshot_id,
63
- sequence_number=entry.sequence_number,
64
- file_sequence_number=entry.file_sequence_number,
65
- data_file=entry.data_file,
66
- )
67
- )
68
- for entry in entries
69
- if entry.data_file not in found_deleted_data_files
70
- ]
71
- existing_files.append(writer.to_manifest_file())
72
- return existing_files
127
+ return existing_manifests
73
128
 
74
129
  def _deleted_entries(self) -> List[ManifestEntry]:
75
- """To determine if we need to record any deleted entries.
76
-
77
- With a full overwrite all the entries are considered deleted.
78
- With partial overwrites we have to use the predicate to evaluate
79
- which entries are affected.
80
- """
81
130
  if self._parent_snapshot_id is not None:
82
131
  previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
83
132
  self._parent_snapshot_id
@@ -102,7 +151,7 @@ class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
102
151
  for entry in manifest.fetch_manifest_entry(
103
152
  self._io, discard_deleted=True
104
153
  )
105
- if entry.data_file.content == DataFileContent.DATA
154
+ if entry.data_file.content == DataFileContent.EQUALITY_DELETES
106
155
  and entry.data_file in self._deleted_data_files
107
156
  ]
108
157
 
@@ -114,45 +163,30 @@ class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
114
163
  return []
115
164
 
116
165
 
117
- def replace(
118
- self,
119
- commit_uuid: Optional[uuid.UUID] = None,
120
- using_starting_sequence: Optional[bool] = False,
121
- ) -> _ReplaceFiles:
122
- return _ReplaceFiles(
123
- commit_uuid=commit_uuid,
124
- operation=Operation.REPLACE
125
- if self._transaction.table_metadata.current_snapshot() is not None
126
- else Operation.APPEND,
127
- transaction=self._transaction,
128
- io=self._io,
129
- snapshot_properties=self._snapshot_properties,
130
- using_starting_sequence=using_starting_sequence,
131
- )
132
-
133
-
134
- UpdateSnapshot.replace = replace
135
-
136
-
137
- def commit_replace_snapshot(
138
- iceberg_table, to_be_deleted_files_list, new_position_delete_files
139
- ):
166
+ def commit_append_snapshot(
167
+ iceberg_table: Table, new_position_delete_files: List[DataFile]
168
+ ) -> TableMetadata:
140
169
  tx = iceberg_table.transaction()
141
- snapshot_properties = {}
142
- commit_uuid = uuid.uuid4()
143
- update_snapshot = tx.update_snapshot(snapshot_properties=snapshot_properties)
144
- replace_snapshot = replace(
145
- self=update_snapshot, commit_uuid=commit_uuid, using_starting_sequence=False
146
- )
147
- for to_be_deleted_file in to_be_deleted_files_list:
148
- replace_snapshot.append_data_file(to_be_deleted_file)
149
- for to_be_added_file in new_position_delete_files:
150
- replace_snapshot.delete_data_file(to_be_added_file)
151
- replace_snapshot._commit()
152
- tx.commit_transaction()
170
+ try:
171
+ if iceberg_table.metadata.name_mapping() is None:
172
+ tx.set_properties(
173
+ **{
174
+ "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
175
+ }
176
+ )
177
+ with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
178
+ if new_position_delete_files:
179
+ for data_file in new_position_delete_files:
180
+ append_snapshot.append_data_file(data_file)
181
+ except Exception as e:
182
+ raise e
183
+ else:
184
+ return tx.commit_transaction().metadata
153
185
 
154
186
 
155
- def append_delete_files_override(update_snapshot):
187
+ def append_delete_files_override(
188
+ update_snapshot: UpdateSnapshot,
189
+ ) -> "_AppendDeleteFilesOverride":
156
190
  commit_uuid = uuid.uuid4()
157
191
  return _AppendDeleteFilesOverride(
158
192
  commit_uuid=commit_uuid,
@@ -164,8 +198,8 @@ def append_delete_files_override(update_snapshot):
164
198
 
165
199
 
166
200
  class _AppendDeleteFilesOverride(_SnapshotProducer):
167
- def _manifests(self):
168
- def _write_added_manifest():
201
+ def _manifests(self) -> List[ManifestFile]:
202
+ def _write_added_manifest() -> List[ManifestFile]:
169
203
  if self._added_data_files:
170
204
  with write_manifest(
171
205
  format_version=self._transaction.table_metadata.format_version,
@@ -198,7 +232,7 @@ class _AppendDeleteFilesOverride(_SnapshotProducer):
198
232
  added_manifests.result() + existing_manifests.result()
199
233
  )
200
234
 
201
- def writer_content(self):
235
+ def writer_content(self) -> ManifestContent:
202
236
  return ManifestContent.DELETES
203
237
 
204
238
  def _existing_manifests(self) -> List[ManifestFile]:
@@ -237,15 +271,29 @@ class _AppendDeleteFilesOverride(_SnapshotProducer):
237
271
  return []
238
272
 
239
273
 
240
- def commit_append_snapshot(iceberg_table, new_position_delete_files):
241
- with iceberg_table.transaction() as tx:
274
+ def commit_replace_snapshot(
275
+ iceberg_table: Table,
276
+ new_position_delete_files: List[DataFile],
277
+ to_be_deleted_files: List[DataFile],
278
+ ) -> TableMetadata:
279
+ tx = iceberg_table.transaction()
280
+ try:
242
281
  if iceberg_table.metadata.name_mapping() is None:
243
282
  tx.set_properties(
244
283
  **{
245
284
  "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
246
285
  }
247
286
  )
248
- with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
287
+ with replace_delete_files_override(
288
+ tx.update_snapshot()
289
+ ) as replace_delete_snapshot:
249
290
  if new_position_delete_files:
250
291
  for data_file in new_position_delete_files:
251
- append_snapshot.append_data_file(data_file)
292
+ replace_delete_snapshot.append_data_file(data_file)
293
+ if to_be_deleted_files:
294
+ for delete_file in to_be_deleted_files:
295
+ replace_delete_snapshot.delete_data_file(delete_file)
296
+ except Exception as e:
297
+ raise e
298
+ else:
299
+ return tx.commit_transaction().metadata