deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/pyiceberg/overrides.py
@@ -0,0 +1,135 @@
+ from collections import defaultdict
+ import logging
+ from deltacat import logs
+ import pyarrow.parquet as pq
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict_list):
+     from pyiceberg.io.pyarrow import (
+         _check_pyarrow_schema_compatible,
+         data_file_statistics_from_parquet_metadata,
+         compute_statistics_plan,
+         parquet_path_to_id_mapping,
+     )
+     from pyiceberg.manifest import (
+         DataFile,
+         DataFileContent,
+         FileFormat,
+     )
+
+     data_file_content_type = DataFileContent.POSITION_DELETES
+     iceberg_files = []
+     schema = table_metadata.schema()
+     for files_dict in files_dict_list:
+         for partition_value, file_paths in files_dict.items():
+             for file_path in file_paths:
+                 input_file = io.new_input(file_path)
+                 with input_file.open() as input_stream:
+                     parquet_metadata = pq.read_metadata(input_stream)
+                 _check_pyarrow_schema_compatible(
+                     schema, parquet_metadata.schema.to_arrow_schema()
+                 )
+
+                 statistics = data_file_statistics_from_parquet_metadata(
+                     parquet_metadata=parquet_metadata,
+                     stats_columns=compute_statistics_plan(
+                         schema, table_metadata.properties
+                     ),
+                     parquet_column_mapping=parquet_path_to_id_mapping(schema),
+                 )
+
+                 data_file = DataFile(
+                     content=data_file_content_type,
+                     file_path=file_path,
+                     file_format=FileFormat.PARQUET,
+                     partition=partition_value,
+                     # partition=Record(**{"pk": "111", "bucket": 2}),
+                     file_size_in_bytes=len(input_file),
+                     sort_order_id=None,
+                     spec_id=table_metadata.default_spec_id,
+                     equality_ids=None,
+                     key_metadata=None,
+                     **statistics.to_serialized_dict(),
+                 )
+                 iceberg_files.append(data_file)
+     return iceberg_files
+
+
+ def fetch_all_bucket_files(table):
+     # step 1: filter manifests using partition summaries
+     # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id
+     from pyiceberg.typedef import (
+         KeyDefaultDict,
+     )
+
+     data_scan = table.scan()
+     snapshot = data_scan.snapshot()
+     if not snapshot:
+         # no snapshot yet, so there are no files to bucket
+         return defaultdict(list), defaultdict(list), defaultdict(list)
+     manifest_evaluators = KeyDefaultDict(data_scan._build_manifest_evaluator)
+
+     manifests = [
+         manifest_file
+         for manifest_file in snapshot.manifests(data_scan.io)
+         if manifest_evaluators[manifest_file.partition_spec_id](manifest_file)
+     ]
+
+     # step 2: filter the data files in each manifest
+     # this filter depends on the partition spec used to write the manifest file
+     from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator
+     from pyiceberg.types import (
+         strtobool,
+     )
+     from pyiceberg.table import _min_sequence_number, _open_manifest
+     from pyiceberg.utils.concurrent import ExecutorFactory
+     from itertools import chain
+     from pyiceberg.manifest import DataFileContent
+
+     partition_evaluators = KeyDefaultDict(data_scan._build_partition_evaluator)
+     metrics_evaluator = _InclusiveMetricsEvaluator(
+         data_scan.table_metadata.schema(),
+         data_scan.row_filter,
+         data_scan.case_sensitive,
+         strtobool(data_scan.options.get("include_empty_files", "false")),
+     ).eval
+
+     min_sequence_number = _min_sequence_number(manifests)
+
+     # {"bucket_index": List[DataFile]}
+     data_entries = defaultdict(list)
+     equality_data_entries = defaultdict(list)
+     positional_delete_entries = defaultdict(list)
+
+     executor = ExecutorFactory.get_or_create()
+     for manifest_entry in chain(
+         *executor.map(
+             lambda args: _open_manifest(*args),
+             [
+                 (
+                     data_scan.io,
+                     manifest,
+                     partition_evaluators[manifest.partition_spec_id],
+                     metrics_evaluator,
+                 )
+                 for manifest in manifests
+                 if data_scan._check_sequence_number(min_sequence_number, manifest)
+             ],
+         )
+     ):
+         data_file = manifest_entry.data_file
+         file_sequence_number = manifest_entry.sequence_number
+         data_file_tuple = (file_sequence_number, data_file)
+         partition_value = data_file.partition
+         if data_file.content == DataFileContent.DATA:
+             data_entries[partition_value].append(data_file_tuple)
+         elif data_file.content == DataFileContent.POSITION_DELETES:
+             positional_delete_entries[partition_value].append(data_file_tuple)
+         elif data_file.content == DataFileContent.EQUALITY_DELETES:
+             equality_data_entries[partition_value].append(data_file_tuple)
+         else:
+             logger.warning(
+                 f"Unknown DataFileContent ({data_file.content}): {manifest_entry}"
+             )
+     return data_entries, equality_data_entries, positional_delete_entries
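
These two helpers follow pyiceberg's add_files statistics path: fetch_all_bucket_files groups a table's data, equality delete, and position delete files by partition value, and parquet_files_dict_to_iceberg_data_files turns freshly written position delete parquet files into DataFile entries carrying column statistics. A minimal usage sketch, not part of this diff; the catalog name, table identifier, and parquet path are invented for illustration:

from pyiceberg.catalog import load_catalog

# Assumes a pyiceberg catalog named "demo" is configured (e.g. in ~/.pyiceberg.yaml).
catalog = load_catalog("demo")
table = catalog.load_table("db.events")

# Partition-bucketed views of the table's current snapshot.
data_entries, equality_delete_entries, pos_delete_entries = fetch_all_bucket_files(table)

# Hypothetical mapping of partition values to newly written position delete files.
files_dict_list = [{pv: ["s3://my-bucket/pos-deletes/00001.parquet"]} for pv in data_entries]
new_data_files = parquet_files_dict_to_iceberg_data_files(
    io=table.io,
    table_metadata=table.metadata,
    files_dict_list=files_dict_list,
)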
deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py
@@ -0,0 +1,251 @@
+ from typing import Optional, List
+ import uuid
+ from pyiceberg.table.snapshots import (
+     Operation,
+ )
+ from pyiceberg.manifest import (
+     DataFileContent,
+     ManifestContent,
+     ManifestEntry,
+     ManifestEntryStatus,
+     ManifestFile,
+     write_manifest,
+ )
+ import itertools
+ from pyiceberg.utils.concurrent import ExecutorFactory
+ from pyiceberg.table.update.snapshot import UpdateSnapshot, _SnapshotProducer
+
+
+ class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
+     """Overwrites data from the table. This will produce an OVERWRITE snapshot.
+
+     Data and delete files were added and removed in a logical overwrite operation.
+     """
+
+     def _existing_manifests(self) -> List[ManifestFile]:
+         """Determine if there are any existing manifest files."""
+         existing_files = []
+         snapshot = self._transaction.table_metadata.current_snapshot()
+         if snapshot:
+             for manifest_file in snapshot.manifests(io=self._io):
+                 entries = manifest_file.fetch_manifest_entry(
+                     io=self._io, discard_deleted=True
+                 )
+
+                 found_deleted_data_files = [
+                     entry.data_file
+                     for entry in entries
+                     if entry.data_file in self._deleted_data_files
+                 ]
+
+                 if len(found_deleted_data_files) == 0:
+                     existing_files.append(manifest_file)
+                 else:
+                     # We have to replace the manifest file without the deleted data files
+                     if any(
+                         entry.data_file not in found_deleted_data_files
+                         for entry in entries
+                     ):
+                         with write_manifest(
+                             format_version=self._transaction.table_metadata.format_version,
+                             spec=self._transaction.table_metadata.specs()[
+                                 manifest_file.partition_spec_id
+                             ],
+                             schema=self._transaction.table_metadata.schema(),
+                             output_file=self.new_manifest_output(),
+                             snapshot_id=self._snapshot_id,
+                         ) as writer:
+                             for entry in entries:
+                                 if entry.data_file not in found_deleted_data_files:
+                                     writer.add_entry(
+                                         ManifestEntry(
+                                             status=ManifestEntryStatus.EXISTING,
+                                             snapshot_id=entry.snapshot_id,
+                                             sequence_number=entry.sequence_number,
+                                             file_sequence_number=entry.file_sequence_number,
+                                             data_file=entry.data_file,
+                                         )
+                                     )
+                         existing_files.append(writer.to_manifest_file())
+         return existing_files
+
+     def _deleted_entries(self) -> List[ManifestEntry]:
+         """To determine if we need to record any deleted entries.
+
+         With a full overwrite all the entries are considered deleted.
+         With partial overwrites we have to use the predicate to evaluate
+         which entries are affected.
+         """
+         if self._parent_snapshot_id is not None:
+             previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                 self._parent_snapshot_id
+             )
+             if previous_snapshot is None:
+                 # This should never happen since you cannot overwrite an empty table
+                 raise ValueError(
+                     f"Could not find the previous snapshot: {self._parent_snapshot_id}"
+                 )
+
+             executor = ExecutorFactory.get_or_create()
+
+             def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]:
+                 return [
+                     ManifestEntry(
+                         status=ManifestEntryStatus.DELETED,
+                         snapshot_id=entry.snapshot_id,
+                         sequence_number=entry.sequence_number,
+                         file_sequence_number=entry.file_sequence_number,
+                         data_file=entry.data_file,
+                     )
+                     for entry in manifest.fetch_manifest_entry(
+                         self._io, discard_deleted=True
+                     )
+                     if entry.data_file.content == DataFileContent.DATA
+                     and entry.data_file in self._deleted_data_files
+                 ]
+
+             list_of_entries = executor.map(
+                 _get_entries, previous_snapshot.manifests(self._io)
+             )
+             return list(itertools.chain(*list_of_entries))
+         else:
+             return []
+
+
+ def replace(
+     self,
+     commit_uuid: Optional[uuid.UUID] = None,
+     using_starting_sequence: Optional[bool] = False,
+ ) -> _ReplaceFiles:
+     return _ReplaceFiles(
+         commit_uuid=commit_uuid,
+         operation=Operation.REPLACE
+         if self._transaction.table_metadata.current_snapshot() is not None
+         else Operation.APPEND,
+         transaction=self._transaction,
+         io=self._io,
+         snapshot_properties=self._snapshot_properties,
+         using_starting_sequence=using_starting_sequence,
+     )
+
+
+ UpdateSnapshot.replace = replace
+
+
+ def commit_replace_snapshot(
+     iceberg_table, to_be_deleted_files_list, new_position_delete_files
+ ):
+     tx = iceberg_table.transaction()
+     snapshot_properties = {}
+     commit_uuid = uuid.uuid4()
+     update_snapshot = tx.update_snapshot(snapshot_properties=snapshot_properties)
+     replace_snapshot = replace(
+         self=update_snapshot, commit_uuid=commit_uuid, using_starting_sequence=False
+     )
+     # Mark the superseded files as deleted and register the new position delete files.
+     for to_be_deleted_file in to_be_deleted_files_list:
+         replace_snapshot.delete_data_file(to_be_deleted_file)
+     for to_be_added_file in new_position_delete_files:
+         replace_snapshot.append_data_file(to_be_added_file)
+     replace_snapshot._commit()
+     tx.commit_transaction()
+
+
+ def append_delete_files_override(update_snapshot):
+     commit_uuid = uuid.uuid4()
+     return _AppendDeleteFilesOverride(
+         commit_uuid=commit_uuid,
+         operation=Operation.APPEND,
+         transaction=update_snapshot._transaction,
+         io=update_snapshot._io,
+         snapshot_properties=update_snapshot._snapshot_properties,
+     )
+
+
+ class _AppendDeleteFilesOverride(_SnapshotProducer):
+     def _manifests(self):
+         def _write_added_manifest():
+             if self._added_data_files:
+                 with write_manifest(
+                     format_version=self._transaction.table_metadata.format_version,
+                     spec=self._transaction.table_metadata.spec(),
+                     schema=self._transaction.table_metadata.schema(),
+                     output_file=self.new_manifest_output(),
+                     snapshot_id=self._snapshot_id,
+                 ) as writer:
+                     for data_file in self._added_data_files:
+                         writer.add(
+                             ManifestEntry(
+                                 status=ManifestEntryStatus.ADDED,
+                                 snapshot_id=self._snapshot_id,
+                                 sequence_number=None,
+                                 file_sequence_number=None,
+                                 data_file=data_file,
+                             )
+                         )
+                     # Route the content type through writer_content so the new
+                     # manifest is written as a delete manifest.
+                     writer.content = self.writer_content
+                 return [writer.to_manifest_file()]
+             else:
+                 return []
+
+         executor = ExecutorFactory.get_or_create()
+
+         added_manifests = executor.submit(_write_added_manifest)
+         existing_manifests = executor.submit(self._existing_manifests)
+
+         return self._process_manifests(
+             added_manifests.result() + existing_manifests.result()
+         )
+
+     def writer_content(self):
+         return ManifestContent.DELETES
+
+     def _existing_manifests(self) -> List[ManifestFile]:
+         """To determine if there are any existing manifest files.
+
+         A fast append will add another ManifestFile to the ManifestList.
+         All the existing manifest files are considered existing.
+         """
+         existing_manifests = []
+
+         if self._parent_snapshot_id is not None:
+             previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                 self._parent_snapshot_id
+             )
+
+             if previous_snapshot is None:
+                 raise ValueError(
+                     f"Snapshot could not be found: {self._parent_snapshot_id}"
+                 )
+
+             for manifest in previous_snapshot.manifests(io=self._io):
+                 if (
+                     manifest.has_added_files()
+                     or manifest.has_existing_files()
+                     or manifest.added_snapshot_id == self._snapshot_id
+                 ):
+                     existing_manifests.append(manifest)
+
+         return existing_manifests
+
+     def _deleted_entries(self) -> List[ManifestEntry]:
+         """To determine if we need to record any deleted manifest entries.
+
+         In case of an append, nothing is deleted.
+         """
+         return []
+
+
+ def commit_append_snapshot(iceberg_table, new_position_delete_files):
+     with iceberg_table.transaction() as tx:
+         if iceberg_table.metadata.name_mapping() is None:
+             tx.set_properties(
+                 **{
+                     "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
+                 }
+             )
+         with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
+             if new_position_delete_files:
+                 for data_file in new_position_delete_files:
+                     append_snapshot.append_data_file(data_file)
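
Taken together, these overrides extend pyiceberg's UpdateSnapshot with a REPLACE-style commit and a delete-manifest append. A hedged sketch of how a caller might drive them; the catalog name, table identifier, and the two file lists are placeholders assumed to come from the converter session:

from pyiceberg.catalog import load_catalog

catalog = load_catalog("demo")  # assumed configured pyiceberg catalog
table = catalog.load_table("db.events")

# new_position_delete_files: DataFile entries produced by the convert step;
# superseded_files: the equality delete files they replace (both assumed here).
if superseded_files:
    commit_replace_snapshot(
        iceberg_table=table,
        to_be_deleted_files_list=superseded_files,
        new_position_delete_files=new_position_delete_files,
    )
else:
    commit_append_snapshot(
        iceberg_table=table,
        new_position_delete_files=new_position_delete_files,
    )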
deltacat/compute/converter/steps/__init__.py
File without changes
deltacat/compute/converter/steps/convert.py
@@ -0,0 +1,211 @@
+ import pyarrow.compute as pc
+
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ import pyarrow as pa
+
+ from collections import defaultdict
+ import ray
+ import logging
+ from deltacat.compute.converter.model.convert_input import ConvertInput
+ from deltacat.compute.converter.steps.dedupe import dedupe_data_files
+ from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+ from deltacat.compute.converter.utils.io import (
+     download_data_table_and_append_iceberg_columns,
+ )
+ from deltacat.compute.converter.utils.converter_session_utils import (
+     partition_value_record_to_partition_value_string,
+ )
+
+ from deltacat import logs
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ @ray.remote
+ def convert(convert_input: ConvertInput):
+     convert_input_files = convert_input.convert_input_files
+     convert_task_index = convert_input.convert_task_index
+     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
+     identifier_fields = convert_input.identifier_fields
+     compact_small_files = convert_input.compact_small_files
+     position_delete_for_multiple_data_files = (
+         convert_input.position_delete_for_multiple_data_files
+     )
+     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
+     s3_file_system = convert_input.s3_file_system
+     if not position_delete_for_multiple_data_files:
+         raise NotImplementedError(
+             "Distributed file level position delete compute is not supported yet"
+         )
+     if compact_small_files:
+         raise NotImplementedError("Compact previous position delete not supported yet")
+
+     logger.info(f"Starting convert task index: {convert_task_index}")
+
+     applicable_data_files = convert_input_files.applicable_data_files
+     applicable_equality_delete_files = (
+         convert_input_files.applicable_equality_delete_files
+     )
+     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
+
+     partition_value_str = partition_value_record_to_partition_value_string(
+         convert_input_files.partition_value
+     )
+     partition_value = convert_input_files.partition_value
+     iceberg_table_warehouse_prefix_with_partition = (
+         f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+     )
+     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
+     total_pos_delete_table = []
+     if applicable_equality_delete_files:
+         pos_delete_after_converting_equality_delete = (
+             compute_pos_delete_with_limited_parallelism(
+                 data_files_list=applicable_data_files,
+                 identifier_columns=identifier_fields,
+                 equality_delete_files_list=applicable_equality_delete_files,
+                 iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+                 max_parallel_data_file_download=max_parallel_data_file_download,
+                 s3_file_system=s3_file_system,
+             )
+         )
+         if pos_delete_after_converting_equality_delete:
+             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
+
+     if enforce_primary_key_uniqueness:
+         data_files_to_dedupe = get_additional_applicable_data_files(
+             all_data_files=all_data_files_for_this_bucket,
+             data_files_downloaded=applicable_data_files,
+         )
+         pos_delete_after_dedupe = dedupe_data_files(
+             data_file_to_dedupe=data_files_to_dedupe,
+             identify_column_name_concatenated=identifier_fields[0],
+             identifier_columns=identifier_fields,
+             merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+         )
+         total_pos_delete_table.append(pos_delete_after_dedupe)
+
+     total_pos_delete = pa.concat_tables(total_pos_delete_table)
+     to_be_added_files_list = upload_table_with_retry(
+         table=total_pos_delete,
+         s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
+         s3_table_writer_kwargs={},
+         s3_file_system=s3_file_system,
+     )
+
+     to_be_delete_files_dict = defaultdict()
+     if applicable_equality_delete_files:
+         to_be_delete_files_dict[partition_value] = [
+             equality_delete_file[1]
+             for equality_delete_file in applicable_equality_delete_files
+         ]
+     to_be_added_files_dict = defaultdict()
+     to_be_added_files_dict[partition_value] = to_be_added_files_list
+     return (to_be_delete_files_dict, to_be_added_files_dict)
+
+
+ def get_additional_applicable_data_files(all_data_files, data_files_downloaded):
+     data_file_to_dedupe = all_data_files
+     if data_files_downloaded:
+         data_file_to_dedupe = list(set(all_data_files) - set(data_files_downloaded))
+     return data_file_to_dedupe
+
+
+ def filter_rows_to_be_deleted(
+     equality_delete_table, data_file_table, identifier_columns
+ ):
+     identifier_column = identifier_columns[0]
+     if equality_delete_table and data_file_table:
+         equality_deletes = pc.is_in(
+             data_file_table[identifier_column],
+             equality_delete_table[identifier_column],
+         )
+         position_delete_table = data_file_table.filter(equality_deletes)
+         logger.info(f"positional_delete_table:{position_delete_table.to_pydict()}")
+         logger.info(f"data_file_table:{data_file_table.to_pydict()}")
+         logger.info(
+             f"length_pos_delete_table, {len(position_delete_table)}, length_data_table:{len(data_file_table)}"
+         )
+         return position_delete_table
+
+
+ def compute_pos_delete_converting_equality_deletes(
+     equality_delete_table,
+     data_file_table,
+     identifier_columns,
+     iceberg_table_warehouse_prefix_with_partition,
+     s3_file_system,
+ ):
+     new_position_delete_table = filter_rows_to_be_deleted(
+         data_file_table=data_file_table,
+         equality_delete_table=equality_delete_table,
+         identifier_columns=identifier_columns,
+     )
+     if new_position_delete_table:
+         logger.info(
+             f"Length of position delete table after converting from equality deletes: {len(new_position_delete_table)}"
+         )
+     else:
+         return None
+     return new_position_delete_table
+
+
+ def download_bucketed_table(data_files, equality_delete_files):
+     from deltacat.utils.pyarrow import s3_file_to_table
+
+     compacted_table = s3_file_to_table(
+         [data_file.file_path for data_file in data_files]
+     )
+     equality_delete_table = s3_file_to_table(
+         [eq_file.file_path for eq_file in equality_delete_files]
+     )
+     return compacted_table, equality_delete_table
+
+
+ def compute_pos_delete_with_limited_parallelism(
+     data_files_list,
+     identifier_columns,
+     equality_delete_files_list,
+     iceberg_table_warehouse_prefix_with_partition,
+     max_parallel_data_file_download,
+     s3_file_system,
+ ):
+     for data_files, equality_delete_files in zip(
+         data_files_list, equality_delete_files_list
+     ):
+         data_table_total = []
+         for data_file in data_files:
+             data_table = download_data_table_and_append_iceberg_columns(
+                 data_files=data_file[1],
+                 columns_to_download=identifier_columns,
+                 additional_columns_to_append=[
+                     sc._FILE_PATH_COLUMN_NAME,
+                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+                 ],
+                 sequence_number=data_file[0],
+             )
+             data_table_total.append(data_table)
+         data_table_total = pa.concat_tables(data_table_total)
+
+         equality_delete_table_total = []
+         for equality_delete in equality_delete_files:
+             equality_delete_table = download_data_table_and_append_iceberg_columns(
+                 data_files=equality_delete[1],
+                 columns_to_download=identifier_columns,
+             )
+             equality_delete_table_total.append(equality_delete_table)
+         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
+
+         new_pos_delete_table = compute_pos_delete_converting_equality_deletes(
+             equality_delete_table=equality_delete_table_total,
+             data_file_table=data_table_total,
+             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+             identifier_columns=identifier_columns,
+             s3_file_system=s3_file_system,
+         )
+         if not new_pos_delete_table:
+             logger.info("No records deleted based on equality delete conversion")
+         else:
+             logger.info(
+                 f"Number of records to delete based on equality delete conversion: {len(new_pos_delete_table)}"
+             )
+         return new_pos_delete_table
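
convert() is a Ray task, so the converter session fans one invocation out per partition bucket. Below is a hedged sketch of a single dispatch; it assumes ConvertInput exposes an of() constructor accepting the fields the task reads above, and that convert_input_files (a ConvertInputFiles for one bucket) and the S3 filesystem were prepared by the converter session:

import ray
from deltacat.compute.converter.model.convert_input import ConvertInput
from deltacat.compute.converter.steps.convert import convert

ray.init(ignore_reinit_error=True)

convert_input = ConvertInput.of(  # assumed constructor; field names taken from convert() above
    convert_input_files=convert_input_files,  # assumed prebuilt for one bucket
    convert_task_index=0,
    iceberg_table_warehouse_prefix="s3://my-bucket/warehouse/db/events",  # invented path
    identifier_fields=["order_id"],  # invented primary key column
    compact_small_files=False,  # True raises NotImplementedError above
    position_delete_for_multiple_data_files=True,  # False raises NotImplementedError above
    enforce_primary_key_uniqueness=True,
    max_parallel_data_file_download=10,
    s3_file_system=s3_file_system,  # assumed pyarrow S3 filesystem
)
to_be_deleted, to_be_added = ray.get(convert.remote(convert_input))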
deltacat/compute/converter/steps/dedupe.py
@@ -0,0 +1,60 @@
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ from deltacat.compute.converter.utils.io import (
+     download_data_table_and_append_iceberg_columns,
+ )
+
+
+ def dedupe_data_files(
+     data_file_to_dedupe,
+     identify_column_name_concatenated,
+     identifier_columns,
+     merge_sort_column,
+ ):
+     data_file_table = []
+
+     # Sort data files by file sequence number first
+     data_file_to_dedupe = sorted(data_file_to_dedupe, key=lambda f: f[0])
+     for file_tuple in data_file_to_dedupe:
+         sequence_number = file_tuple[0]
+         data_file = file_tuple[1]
+         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
+             file=data_file,
+             columns_to_download=identifier_columns,
+             additional_columns_to_append=[
+                 sc._FILE_PATH_COLUMN_NAME,
+                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+             ],
+             sequence_number=sequence_number,
+         )
+         data_file_table.append(data_file_to_dedupe_table)
+
+     final_data_to_dedupe = pa.concat_tables(data_file_table)
+
+     record_idx_iterator = iter(range(len(final_data_to_dedupe)))
+
+     # Append a global record index to use as the aggregation column
+     final_data_to_dedupe = sc.append_global_record_idx_column(
+         final_data_to_dedupe, record_idx_iterator
+     )
+
+     final_data_table_indices = final_data_to_dedupe.group_by(
+         identify_column_name_concatenated, use_threads=False
+     ).aggregate([(sc._GLOBAL_RECORD_IDX_COLUMN_NAME, "max")])
+
+     pos_delete_indices = pc.invert(
+         pc.is_in(
+             final_data_to_dedupe[sc._GLOBAL_RECORD_IDX_COLUMN_NAME],
+             value_set=final_data_table_indices[
+                 f"{sc._GLOBAL_RECORD_IDX_COLUMN_NAME}_max"
+             ],
+         )
+     )
+
+     final_data_table_to_delete = final_data_to_dedupe.filter(pos_delete_indices)
+
+     final_data_table_to_delete = final_data_table_to_delete.drop(
+         [identify_column_name_concatenated, sc._GLOBAL_RECORD_IDX_COLUMN_NAME]
+     )
+     return final_data_table_to_delete
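
The core of dedupe_data_files is a keep-the-newest-row dedupe: rows are globally indexed in file sequence order, the maximum index per primary key survives, and every other index becomes a position delete. A self-contained toy of that trick using only pyarrow (table and column names invented):

import pyarrow as pa
import pyarrow.compute as pc

# Three rows, two sharing primary key "a"; a higher idx means a newer row.
t = pa.table({"pk": ["a", "b", "a"], "idx": [0, 1, 2]})

# Newest surviving index per key: a -> 2, b -> 1.
latest = t.group_by("pk").aggregate([("idx", "max")])

# Everything not in the survivor set is superseded and must be position-deleted.
superseded = pc.invert(pc.is_in(t["idx"], value_set=latest["idx_max"]))
print(t.filter(superseded).to_pydict())  # {'pk': ['a'], 'idx': [0]}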
deltacat/compute/converter/utils/__init__.py
File without changes