deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,251 @@
+ from typing import Optional, List
+ import uuid
+ from pyiceberg.table.snapshots import (
+     Operation,
+ )
+ from pyiceberg.manifest import (
+     DataFileContent,
+     ManifestContent,
+     ManifestEntry,
+     ManifestEntryStatus,
+     ManifestFile,
+     write_manifest,
+ )
+ import itertools
+ from pyiceberg.utils.concurrent import ExecutorFactory
+ from pyiceberg.table.update.snapshot import UpdateSnapshot, _SnapshotProducer
+
+
+ class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
+     """Overwrites data from the table. This will produce an OVERWRITE snapshot.
+
+     Data and delete files were added and removed in a logical overwrite operation.
+     """
+
+     def _existing_manifests(self) -> List[ManifestFile]:
+         """Determine if there are any existing manifest files."""
+         existing_files = []
+         snapshot = self._transaction.table_metadata.current_snapshot()
+         if snapshot:
+             for manifest_file in snapshot.manifests(io=self._io):
+                 entries = manifest_file.fetch_manifest_entry(
+                     io=self._io, discard_deleted=True
+                 )
+
+                 found_deleted_data_files = [
+                     entry.data_file
+                     for entry in entries
+                     if entry.data_file in self._deleted_data_files
+                 ]
+
+                 if len(found_deleted_data_files) == 0:
+                     existing_files.append(manifest_file)
+                 else:
+                     # We have to replace the manifest file without the deleted data files
+                     if any(
+                         entry.data_file not in found_deleted_data_files
+                         for entry in entries
+                     ):
+                         with write_manifest(
+                             format_version=self._transaction.table_metadata.format_version,
+                             spec=self._transaction.table_metadata.specs()[
+                                 manifest_file.partition_spec_id
+                             ],
+                             schema=self._transaction.table_metadata.schema(),
+                             output_file=self.new_manifest_output(),
+                             snapshot_id=self._snapshot_id,
+                         ) as writer:
+                             [
+                                 writer.add_entry(
+                                     ManifestEntry(
+                                         status=ManifestEntryStatus.EXISTING,
+                                         snapshot_id=entry.snapshot_id,
+                                         sequence_number=entry.sequence_number,
+                                         file_sequence_number=entry.file_sequence_number,
+                                         data_file=entry.data_file,
+                                     )
+                                 )
+                                 for entry in entries
+                                 if entry.data_file not in found_deleted_data_files
+                             ]
+                         existing_files.append(writer.to_manifest_file())
+         return existing_files
+
+     def _deleted_entries(self) -> List[ManifestEntry]:
+         """To determine if we need to record any deleted entries.
+
+         With a full overwrite all the entries are considered deleted.
+         With partial overwrites we have to use the predicate to evaluate
+         which entries are affected.
+         """
+         if self._parent_snapshot_id is not None:
+             previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                 self._parent_snapshot_id
+             )
+             if previous_snapshot is None:
+                 # This should never happen since you cannot overwrite an empty table
+                 raise ValueError(
+                     f"Could not find the previous snapshot: {self._parent_snapshot_id}"
+                 )
+
+             executor = ExecutorFactory.get_or_create()
+
+             def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]:
+                 return [
+                     ManifestEntry(
+                         status=ManifestEntryStatus.DELETED,
+                         snapshot_id=entry.snapshot_id,
+                         sequence_number=entry.sequence_number,
+                         file_sequence_number=entry.file_sequence_number,
+                         data_file=entry.data_file,
+                     )
+                     for entry in manifest.fetch_manifest_entry(
+                         self._io, discard_deleted=True
+                     )
+                     if entry.data_file.content == DataFileContent.DATA
+                     and entry.data_file in self._deleted_data_files
+                 ]
+
+             list_of_entries = executor.map(
+                 _get_entries, previous_snapshot.manifests(self._io)
+             )
+             return list(itertools.chain(*list_of_entries))
+         else:
+             return []
+
+
+ def replace(
+     self,
+     commit_uuid: Optional[uuid.UUID] = None,
+     using_starting_sequence: Optional[bool] = False,
+ ) -> _ReplaceFiles:
+     return _ReplaceFiles(
+         commit_uuid=commit_uuid,
+         operation=Operation.REPLACE
+         if self._transaction.table_metadata.current_snapshot() is not None
+         else Operation.APPEND,
+         transaction=self._transaction,
+         io=self._io,
+         snapshot_properties=self._snapshot_properties,
+         using_starting_sequence=using_starting_sequence,
+     )
+
+
+ UpdateSnapshot.replace = replace
+
+
+ def commit_replace_snapshot(
+     iceberg_table, to_be_deleted_files_list, new_position_delete_files
+ ):
+     tx = iceberg_table.transaction()
+     snapshot_properties = {}
+     commit_uuid = uuid.uuid4()
+     update_snapshot = tx.update_snapshot(snapshot_properties=snapshot_properties)
+     replace_snapshot = replace(
+         self=update_snapshot, commit_uuid=commit_uuid, using_starting_sequence=False
+     )
+     for to_be_deleted_file in to_be_deleted_files_list:
+         replace_snapshot.append_data_file(to_be_deleted_file)
+     for to_be_added_file in new_position_delete_files:
+         replace_snapshot.delete_data_file(to_be_added_file)
+     replace_snapshot._commit()
+     tx.commit_transaction()
+
+
+ def append_delete_files_override(update_snapshot):
+     commit_uuid = uuid.uuid4()
+     return _AppendDeleteFilesOverride(
+         commit_uuid=commit_uuid,
+         operation=Operation.APPEND,
+         transaction=update_snapshot._transaction,
+         io=update_snapshot._io,
+         snapshot_properties=update_snapshot._snapshot_properties,
+     )
+
+
+ class _AppendDeleteFilesOverride(_SnapshotProducer):
+     def _manifests(self):
+         def _write_added_manifest():
+             if self._added_data_files:
+                 with write_manifest(
+                     format_version=self._transaction.table_metadata.format_version,
+                     spec=self._transaction.table_metadata.spec(),
+                     schema=self._transaction.table_metadata.schema(),
+                     output_file=self.new_manifest_output(),
+                     snapshot_id=self._snapshot_id,
+                 ) as writer:
+                     for data_file in self._added_data_files:
+                         writer.add(
+                             ManifestEntry(
+                                 status=ManifestEntryStatus.ADDED,
+                                 snapshot_id=self._snapshot_id,
+                                 sequence_number=None,
+                                 file_sequence_number=None,
+                                 data_file=data_file,
+                             )
+                         )
+                     writer.content = self.writer_content
+                 return [writer.to_manifest_file()]
+             else:
+                 return []
+
+         executor = ExecutorFactory.get_or_create()
+
+         added_manifests = executor.submit(_write_added_manifest)
+         existing_manifests = executor.submit(self._existing_manifests)
+
+         return self._process_manifests(
+             added_manifests.result() + existing_manifests.result()
+         )
+
+     def writer_content(self):
+         return ManifestContent.DELETES
+
+     def _existing_manifests(self) -> List[ManifestFile]:
+         """To determine if there are any existing manifest files.
+
+         A fast append will add another ManifestFile to the ManifestList.
+         All the existing manifest files are considered existing.
+         """
+         existing_manifests = []
+
+         if self._parent_snapshot_id is not None:
+             previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                 self._parent_snapshot_id
+             )
+
+             if previous_snapshot is None:
+                 raise ValueError(
+                     f"Snapshot could not be found: {self._parent_snapshot_id}"
+                 )
+
+             for manifest in previous_snapshot.manifests(io=self._io):
+                 if (
+                     manifest.has_added_files()
+                     or manifest.has_existing_files()
+                     or manifest.added_snapshot_id == self._snapshot_id
+                 ):
+                     existing_manifests.append(manifest)
+
+         return existing_manifests
+
+     def _deleted_entries(self) -> List[ManifestEntry]:
+         """To determine if we need to record any deleted manifest entries.
+
+         In case of an append, nothing is deleted.
+         """
+         return []
+
+
+ def commit_append_snapshot(iceberg_table, new_position_delete_files):
+     with iceberg_table.transaction() as tx:
+         if iceberg_table.metadata.name_mapping() is None:
+             tx.set_properties(
+                 **{
+                     "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
+                 }
+             )
+         with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
+             if new_position_delete_files:
+                 for data_file in new_position_delete_files:
+                     append_snapshot.append_data_file(data_file)
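
Taken together, these overrides let the converter retire superseded files and register freshly written position delete files in a single Iceberg snapshot. A minimal usage sketch, assuming a PyIceberg catalog named "demo" is configured and that old_files and new_pos_deletes are hypothetical, pre-built lists of DataFile objects:

    from pyiceberg.catalog import load_catalog
    from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
        commit_append_snapshot,
        commit_replace_snapshot,
    )

    catalog = load_catalog("demo")           # hypothetical catalog name
    table = catalog.load_table("db.events")  # hypothetical table identifier

    # Swap out superseded files and commit new position deletes atomically.
    commit_replace_snapshot(
        iceberg_table=table,
        to_be_deleted_files_list=old_files,         # hypothetical DataFile list
        new_position_delete_files=new_pos_deletes,  # hypothetical DataFile list
    )

    # When nothing needs replacing, fast-append the new delete files instead.
    commit_append_snapshot(
        iceberg_table=table,
        new_position_delete_files=new_pos_deletes,
    )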
File without changes
@@ -0,0 +1,211 @@
+ import pyarrow.compute as pc
+
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ import pyarrow as pa
+
+ from collections import defaultdict
+ import ray
+ import logging
+ from deltacat.compute.converter.model.convert_input import ConvertInput
+ from deltacat.compute.converter.steps.dedupe import dedupe_data_files
+ from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+ from deltacat.compute.converter.utils.io import (
+     download_data_table_and_append_iceberg_columns,
+ )
+ from deltacat.compute.converter.utils.converter_session_utils import (
+     partition_value_record_to_partition_value_string,
+ )
+
+ from deltacat import logs
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ @ray.remote
+ def convert(convert_input: ConvertInput):
+     convert_input_files = convert_input.convert_input_files
+     convert_task_index = convert_input.convert_task_index
+     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
+     identifier_fields = convert_input.identifier_fields
+     compact_small_files = convert_input.compact_small_files
+     position_delete_for_multiple_data_files = (
+         convert_input.position_delete_for_multiple_data_files
+     )
+     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
+     s3_file_system = convert_input.s3_file_system
+     if not position_delete_for_multiple_data_files:
+         raise NotImplementedError(
+             "Distributed file-level position delete compute is not supported yet"
+         )
+     if compact_small_files:
+         raise NotImplementedError(
+             "Compacting previous position deletes is not supported yet"
+         )
+
+     logger.info(f"Starting convert task index: {convert_task_index}")
+
+     applicable_data_files = convert_input_files.applicable_data_files
+     applicable_equality_delete_files = (
+         convert_input_files.applicable_equality_delete_files
+     )
+     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
+
+     partition_value_str = partition_value_record_to_partition_value_string(
+         convert_input_files.partition_value
+     )
+     partition_value = convert_input_files.partition_value
+     iceberg_table_warehouse_prefix_with_partition = (
+         f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+     )
+     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
+     total_pos_delete_table = []
+     if applicable_equality_delete_files:
+         (
+             pos_delete_after_converting_equality_delete
+         ) = compute_pos_delete_with_limited_parallelism(
+             data_files_list=applicable_data_files,
+             identifier_columns=identifier_fields,
+             equality_delete_files_list=applicable_equality_delete_files,
+             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+             max_parallel_data_file_download=max_parallel_data_file_download,
+             s3_file_system=s3_file_system,
+         )
+         if pos_delete_after_converting_equality_delete:
+             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
+
+     if enforce_primary_key_uniqueness:
+         data_files_to_dedupe = get_additional_applicable_data_files(
+             all_data_files=all_data_files_for_this_bucket,
+             data_files_downloaded=applicable_data_files,
+         )
+         pos_delete_after_dedupe = dedupe_data_files(
+             data_file_to_dedupe=data_files_to_dedupe,
+             identify_column_name_concatenated=identifier_fields[0],
+             identifier_columns=identifier_fields,
+             merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+         )
+         total_pos_delete_table.append(pos_delete_after_dedupe)
+
+     total_pos_delete = pa.concat_tables(total_pos_delete_table)
+     to_be_added_files_list = upload_table_with_retry(
+         table=total_pos_delete,
+         s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
+         s3_table_writer_kwargs={},
+         s3_file_system=s3_file_system,
+     )
+
+     to_be_delete_files_dict = defaultdict()
+     if applicable_equality_delete_files:
+         to_be_delete_files_dict[partition_value] = [
+             equality_delete_file[1]
+             for equality_delete_file in applicable_equality_delete_files
+         ]
+     to_be_added_files_dict = defaultdict()
+     to_be_added_files_dict[partition_value] = to_be_added_files_list
+     return (to_be_delete_files_dict, to_be_added_files_dict)
+
+
+ def get_additional_applicable_data_files(all_data_files, data_files_downloaded):
+     data_file_to_dedupe = all_data_files
+     if data_files_downloaded:
+         data_file_to_dedupe = list(set(all_data_files) - set(data_files_downloaded))
+     return data_file_to_dedupe
+
+
+ def filter_rows_to_be_deleted(
+     equality_delete_table, data_file_table, identifier_columns
+ ):
+     identifier_column = identifier_columns[0]
+     if equality_delete_table and data_file_table:
+         equality_deletes = pc.is_in(
+             data_file_table[identifier_column],
+             equality_delete_table[identifier_column],
+         )
+         position_delete_table = data_file_table.filter(equality_deletes)
+         logger.info(f"positional_delete_table: {position_delete_table.to_pydict()}")
+         logger.info(f"data_file_table: {data_file_table.to_pydict()}")
+         logger.info(
+             f"length_pos_delete_table: {len(position_delete_table)}, length_data_table: {len(data_file_table)}"
+         )
+         return position_delete_table
+
+
+ def compute_pos_delete_converting_equality_deletes(
+     equality_delete_table,
+     data_file_table,
+     identifier_columns,
+     iceberg_table_warehouse_prefix_with_partition,
+     s3_file_system,
+ ):
+     new_position_delete_table = filter_rows_to_be_deleted(
+         data_file_table=data_file_table,
+         equality_delete_table=equality_delete_table,
+         identifier_columns=identifier_columns,
+     )
+     if new_position_delete_table:
+         logger.info(
+             f"Length of position delete table after converting from equality deletes: {len(new_position_delete_table)}"
+         )
+     else:
+         return None
+     return new_position_delete_table
+
+
+ def download_bucketed_table(data_files, equality_delete_files):
+     from deltacat.utils.pyarrow import s3_file_to_table
+
+     compacted_table = s3_file_to_table(
+         [data_file.file_path for data_file in data_files]
+     )
+     equality_delete_table = s3_file_to_table(
+         [eq_file.file_path for eq_file in equality_delete_files]
+     )
+     return compacted_table, equality_delete_table
+
+
+ def compute_pos_delete_with_limited_parallelism(
+     data_files_list,
+     identifier_columns,
+     equality_delete_files_list,
+     iceberg_table_warehouse_prefix_with_partition,
+     max_parallel_data_file_download,
+     s3_file_system,
+ ):
+     for data_files, equality_delete_files in zip(
+         data_files_list, equality_delete_files_list
+     ):
+         data_table_total = []
+         for data_file in data_files:
+             data_table = download_data_table_and_append_iceberg_columns(
+                 data_files=data_file[1],
+                 columns_to_download=identifier_columns,
+                 additional_columns_to_append=[
+                     sc._FILE_PATH_COLUMN_NAME,
+                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+                 ],
+                 sequence_number=data_file[0],
+             )
+             data_table_total.append(data_table)
+         data_table_total = pa.concat_tables(data_table_total)
+
+         equality_delete_table_total = []
+         for equality_delete in equality_delete_files:
+             equality_delete_table = download_data_table_and_append_iceberg_columns(
+                 data_files=equality_delete[1],
+                 columns_to_download=identifier_columns,
+             )
+             equality_delete_table_total.append(equality_delete_table)
+         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
+
+         new_pos_delete_table = compute_pos_delete_converting_equality_deletes(
+             equality_delete_table=equality_delete_table_total,
+             data_file_table=data_table_total,
+             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+             identifier_columns=identifier_columns,
+             s3_file_system=s3_file_system,
+         )
+         if not new_pos_delete_table:
+             logger.info("No records deleted based on equality delete conversion")
+         else:
+             logger.info(
+                 f"Number of records to delete based on equality delete conversion: {len(new_pos_delete_table)}"
+             )
+     return new_pos_delete_table
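
The core of the conversion is filter_rows_to_be_deleted: any data-file row whose identifier value appears in an equality delete file becomes a position delete record, via a single pc.is_in filter. A self-contained sketch of the same pattern on toy tables (the column names here are illustrative, not the converter's actual metadata columns):

    import pyarrow as pa
    import pyarrow.compute as pc

    data_file_table = pa.table({"pk": ["a", "b", "c", "d"], "pos": [0, 1, 2, 3]})
    equality_delete_table = pa.table({"pk": ["b", "d"]})

    # Mark every data-file row whose key appears in the equality deletes.
    mask = pc.is_in(data_file_table["pk"], value_set=equality_delete_table["pk"])
    position_delete_table = data_file_table.filter(mask)

    print(position_delete_table.to_pydict())
    # {'pk': ['b', 'd'], 'pos': [1, 3]}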
@@ -0,0 +1,60 @@
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ from deltacat.compute.converter.utils.io import (
+     download_data_table_and_append_iceberg_columns,
+ )
+
+
+ def dedupe_data_files(
+     data_file_to_dedupe,
+     identify_column_name_concatenated,
+     identifier_columns,
+     merge_sort_column,
+ ):
+     data_file_table = []
+
+     # Sort data files by file sequence number first
+     data_file_to_dedupe = sorted(data_file_to_dedupe, key=lambda f: f[0])
+     for file_tuple in data_file_to_dedupe:
+         sequence_number = file_tuple[0]
+         data_file = file_tuple[1]
+         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
+             file=data_file,
+             columns_to_download=identifier_columns,
+             additional_columns_to_append=[
+                 sc._FILE_PATH_COLUMN_NAME,
+                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+             ],
+             sequence_number=sequence_number,
+         )
+         data_file_table.append(data_file_to_dedupe_table)
+
+     final_data_to_dedupe = pa.concat_tables(data_file_table)
+
+     record_idx_iterator = iter(range(len(final_data_to_dedupe)))
+
+     # Append a global record index to be used as the aggregate column
+     final_data_to_dedupe = sc.append_global_record_idx_column(
+         final_data_to_dedupe, record_idx_iterator
+     )
+
+     final_data_table_indices = final_data_to_dedupe.group_by(
+         identify_column_name_concatenated, use_threads=False
+     ).aggregate([(sc._GLOBAL_RECORD_IDX_COLUMN_NAME, "max")])
+
+     pos_delete_indices = pc.invert(
+         pc.is_in(
+             final_data_to_dedupe[sc._GLOBAL_RECORD_IDX_COLUMN_NAME],
+             value_set=final_data_table_indices[
+                 f"{sc._GLOBAL_RECORD_IDX_COLUMN_NAME}_max"
+             ],
+         )
+     )
+
+     final_data_table_to_delete = final_data_to_dedupe.filter(pos_delete_indices)
+
+     final_data_table_to_delete = final_data_table_to_delete.drop(
+         [identify_column_name_concatenated, sc._GLOBAL_RECORD_IDX_COLUMN_NAME]
+     )
+     return final_data_table_to_delete
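
Because files are concatenated in sequence-number order, a higher global record index means a more recently written row; the dedupe step keeps only the highest index per key and emits every other occurrence as a row to delete. The same group-by/max pattern on a toy table (illustrative column names):

    import pyarrow as pa
    import pyarrow.compute as pc

    t = pa.table({"pk": ["a", "b", "a", "b"], "global_idx": [0, 1, 2, 3]})

    # The highest (most recently written) index per primary key survives.
    survivors = t.group_by("pk", use_threads=False).aggregate(
        [("global_idx", "max")]
    )

    # Every row not in the survivor set becomes a position delete candidate.
    to_delete = t.filter(
        pc.invert(pc.is_in(t["global_idx"], value_set=survivors["global_idx_max"]))
    )
    print(to_delete.to_pydict())
    # {'pk': ['a', 'b'], 'global_idx': [0, 1]}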
File without changes
@@ -0,0 +1,88 @@
+ from typing import Optional, Dict
+ from deltacat.exceptions import RetryableError
+
+ AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
+ AVERAGE_POS_COLUMN_SIZE_BYTES = 4
+ XXHASH_BYTE_PER_RECORD = 8
+ MEMORY_BUFFER_RATE = 1.2
+
+
+ def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
+     return hash_value_size_bytes_per_record * total_record_count
+
+
+ def get_total_record_from_iceberg_files(iceberg_files_list):
+     total_record_count = 0
+     for iceberg_files in iceberg_files_list:
+         total_record_count += sum(file.record_count for file in iceberg_files)
+     return total_record_count
+
+
+ def estimate_iceberg_pos_delete_additional_columns(
+     include_columns, num_of_record_count
+ ):
+     total_additional_columns_sizes = 0
+     if "file_path" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     elif "pos" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_POS_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     return total_additional_columns_sizes
+
+
+ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
+     data_file_record_count = get_total_record_from_iceberg_files(data_files)
+     equality_delete_record_count = get_total_record_from_iceberg_files(
+         equality_delete_files
+     )
+     hash_column_sizes = estimate_fixed_hash_columns(
+         XXHASH_BYTE_PER_RECORD, data_file_record_count + equality_delete_record_count
+     )
+     pos_delete_sizes = estimate_iceberg_pos_delete_additional_columns(
+         ["file_path", "pos"], data_file_record_count + equality_delete_record_count
+     )
+     total_memory_required = hash_column_sizes + pos_delete_sizes
+     return total_memory_required * MEMORY_BUFFER_RATE
+
+
+ def _get_task_options(
+     memory: float,
+     ray_custom_resources: Optional[Dict] = None,
+     scheduling_strategy: str = "SPREAD",
+ ) -> Dict:
+
+     # NOTE: With the DEFAULT scheduling strategy in Ray 2.20.0, the autoscaler
+     # does not spin up enough nodes quickly, so we see only approximately 20
+     # of 100 queued tasks get scheduled. Hence we use SPREAD, which is also
+     # ideal for merge and hash bucket tasks.
+     # https://docs.ray.io/en/latest/ray-core/scheduling/index.html
+     task_opts = {
+         "memory": memory,
+         "scheduling_strategy": scheduling_strategy,
+     }
+
+     if ray_custom_resources:
+         task_opts["resources"] = ray_custom_resources
+
+     task_opts["max_retries"] = 3
+
+     # A list of possible botocore exceptions is available at
+     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
+     task_opts["retry_exceptions"] = [RetryableError]
+
+     return task_opts
+
+
+ def convert_resource_options_provider(index, files_for_each_bucket):
+     (
+         data_files_list,
+         equality_delete_files_list,
+         position_delete_files_list,
+     ) = files_for_each_bucket[1]
+     memory_requirement = estimate_convert_remote_option_resources(
+         data_files_list, equality_delete_files_list
+     )
+     return _get_task_options(memory=memory_requirement)
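
The memory estimate is a straight per-record calculation: each record contributes an 8-byte xxhash value plus an estimated 80-byte file path, padded by the 1.2 buffer rate. (Note that because "file_path" matches the if branch, the elif for the 4-byte "pos" column never fires when both are requested.) A worked example under these constants, for a hypothetical bucket of 1,000,000 data-file records and 200,000 equality-delete records:

    records = 1_000_000 + 200_000  # data-file + equality-delete records
    hash_bytes = 8 * records       # XXHASH_BYTE_PER_RECORD
    path_bytes = 80 * records      # "file_path" branch; "pos" is shadowed by elif
    memory = (hash_bytes + path_bytes) * 1.2  # MEMORY_BUFFER_RATE
    print(memory)  # 126720000.0 bytes, i.e. roughly 127 MB reserved for the task

The resulting figure is passed to Ray as the task's memory requirement, which is how the converter right-sizes each bucket's convert task before scheduling it.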