deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,709 @@
1
+ from typing import Optional, Dict, List, Union, Tuple
2
+
3
+ import pyarrow as pa
4
+ from pyiceberg.catalog.rest import NAMESPACE_SEPARATOR
5
+
6
+ from pyiceberg.io import load_file_io
7
+ from pyiceberg.io.pyarrow import pyarrow_to_schema, schema_to_pyarrow
8
+ from pyiceberg.catalog import Catalog
9
+ from pyiceberg.partitioning import PartitionField, PartitionSpec
10
+ from pyiceberg.schema import (
11
+ INITIAL_SCHEMA_ID,
12
+ NestedField,
13
+ Schema as IcebergSchema,
14
+ )
15
+ from pyiceberg.serializers import FromInputFile
16
+ from pyiceberg.table import (
17
+ Table as IcebergTable,
18
+ Namespace as IcebergNamespace,
19
+ TableIdentifier,
20
+ )
21
+ from pyiceberg.table.metadata import TableMetadata
22
+ from pyiceberg.table.snapshots import MetadataLogEntry, Snapshot
23
+ from pyiceberg.table.sorting import (
24
+ SortField,
25
+ SortDirection,
26
+ NullOrder as IcebergNullOrder,
27
+ SortOrder as IcebergSortOrder,
28
+ )
29
+ from pyiceberg.transforms import (
30
+ BucketTransform as IcebergBucketTransform,
31
+ HourTransform as IcebergHourTransform,
32
+ DayTransform as IcebergDayTransform,
33
+ MonthTransform as IcebergMonthTransform,
34
+ YearTransform as IcebergYearTransform,
35
+ IdentityTransform as IcebergIdentityTransform,
36
+ TruncateTransform as IcebergTruncateTransform,
37
+ VoidTransform as IcebergIcebergVoidTransform,
38
+ UnknownTransform as IcebergUnknownTransform,
39
+ Transform as IcebergTransform,
40
+ )
41
+ from pyiceberg.typedef import Identifier, EMPTY_DICT
42
+
43
+ from deltacat.exceptions import (
44
+ NamespaceNotFoundError,
45
+ TableVersionNotFoundError,
46
+ StreamNotFoundError,
47
+ TableNotFoundError,
48
+ )
49
+ from deltacat.storage import (
50
+ BucketingStrategy,
51
+ BucketTransform,
52
+ BucketTransformParameters,
53
+ DayTransform,
54
+ Field,
55
+ HourTransform,
56
+ IdentityTransform,
57
+ MonthTransform,
58
+ Namespace,
59
+ NamespaceLocator,
60
+ Schema,
61
+ StreamLocator,
62
+ Stream,
63
+ Table,
64
+ TableLocator,
65
+ TableVersion,
66
+ TableVersionLocator,
67
+ Transform,
68
+ TransformName,
69
+ TruncateTransform,
70
+ TruncateTransformParameters,
71
+ UnknownTransform,
72
+ VoidTransform,
73
+ YearTransform,
74
+ SortOrder,
75
+ NullOrder,
76
+ )
77
+ from deltacat.storage.model.interop import ModelMapper, OneWayModelMapper
78
+ from deltacat.storage.model.partition import PartitionKey, PartitionScheme
79
+ from deltacat.storage.model.sort_key import (
80
+ SortKey,
81
+ SortScheme,
82
+ )
83
+ from deltacat.storage.model.types import StreamFormat, CommitState
84
+
85
+
86
def _get_snapshot_for_meta(
    meta: TableMetadata,
    snapshot_id: int,
) -> Snapshot:
    """Return the snapshot in ``meta`` whose ID equals ``snapshot_id``.

    Raises:
        ValueError: If no snapshot with the given ID exists.
    """
    matches = (s for s in meta.snapshots if s.snapshot_id == snapshot_id)
    try:
        return next(matches)
    except StopIteration as e:
        raise ValueError(f"No table snapshot with ID: {snapshot_id}") from e
95
+
96
+
97
def _resolve_stream_snapshot(
    meta: TableMetadata,
    snapshot_id: Optional[int],
) -> Snapshot:
    """Resolve ``snapshot_id`` against ``meta``, defaulting to the current snapshot.

    Args:
        meta: Iceberg table metadata to search.
        snapshot_id: Explicit snapshot ID, or None to use the table's
            current snapshot ID.

    Returns:
        The matching snapshot.

    Raises:
        StreamNotFoundError: If no snapshot with the resolved ID exists.
    """
    # `is not None` (not truthiness) so a legitimate snapshot ID of 0 is
    # not silently replaced by the current snapshot ID.
    sid = snapshot_id if snapshot_id is not None else meta.current_snapshot_id
    try:
        return _get_snapshot_for_meta(meta, sid)
    except ValueError as e:
        # `sid` is a snapshot ID, not a timestamp (fixed misleading message).
        err_msg = f"No snapshot with ID: {sid}.\nTable Metadata: {meta}"
        raise StreamNotFoundError(err_msg) from e
107
+
108
+
109
def _get_metadata_for_timestamp(
    timestamp: int,
    meta_log: List[MetadataLogEntry],
    catalog_properties: Dict[str, str] = EMPTY_DICT,
) -> TableMetadata:
    """Load the table metadata whose log entry matches ``timestamp`` (ms).

    Args:
        timestamp: Millisecond timestamp of the metadata log entry to load.
        meta_log: Metadata log entries to search.
        catalog_properties: Properties forwarded to the file IO loader.

    Raises:
        ValueError: If no metadata log entry has the given timestamp.
    """
    matching = (entry for entry in meta_log if entry.timestamp_ms == timestamp)
    try:
        log_entry = next(matching)
    except StopIteration as e:
        raise ValueError(f"No table metadata log with timestamp: {timestamp}") from e
    file_io = load_file_io(
        properties=catalog_properties,
        location=log_entry.metadata_file,
    )
    input_file = file_io.new_input(log_entry.metadata_file)
    return FromInputFile.table_metadata(input_file)
127
+
128
+
129
def _resolve_table_version_metadata(
    table: Optional[IcebergTable],
    timestamp: Optional[int] = None,
    catalog_properties: Dict[str, str] = EMPTY_DICT,
) -> TableMetadata:
    """Return the table metadata for the version identified by ``timestamp``.

    Falls back to the table's latest metadata when ``timestamp`` is None or
    equals the latest metadata's ``last_updated_ms``.

    Args:
        table: Iceberg table to resolve metadata for. NOTE(review): despite
            the Optional annotation, passing None raises an uncaught
            AttributeError at ``table.metadata`` — confirm callers never
            pass None.
        timestamp: Millisecond timestamp of the metadata log entry to load,
            or None for the latest metadata.
        catalog_properties: Properties forwarded to the file IO loader.

    Raises:
        TableVersionNotFoundError: If no metadata log entry matches
            ``timestamp`` (translated from the ValueError raised by
            ``_get_metadata_for_timestamp``).
    """
    try:
        latest = table.metadata
        return (
            _get_metadata_for_timestamp(
                timestamp,
                table.metadata.metadata_log,
                catalog_properties,
            )
            # Skip the log lookup when the requested version is already the
            # latest one (the log may not contain the current metadata).
            if timestamp is not None and timestamp != latest.last_updated_ms
            else latest
        )
    except ValueError as e:
        raise TableVersionNotFoundError(
            f"Table version `{timestamp}` not found."
        ) from e
149
+
150
+
151
def _resolve_table_version(
    meta: TableMetadata,
    timestamp: Optional[int] = None,
) -> int:
    """Resolve ``timestamp`` to a known table version timestamp.

    Args:
        meta: Iceberg table metadata whose metadata log is searched.
        timestamp: Millisecond timestamp to validate against the metadata
            log, or None to use the latest update time.

    Returns:
        The matching metadata-log timestamp, or ``meta.last_updated_ms``
        when ``timestamp`` is None.

    Raises:
        TableVersionNotFoundError: If ``timestamp`` is given but absent
            from the metadata log.
    """
    # `is not None` (not truthiness) so a timestamp of 0 is looked up in
    # the metadata log instead of silently resolving to the latest version.
    try:
        return (
            next(
                entry.timestamp_ms
                for entry in meta.metadata_log
                if entry.timestamp_ms == timestamp
            )
            if timestamp is not None
            else meta.last_updated_ms
        )
    except StopIteration as e:
        err_msg = f"Table version `{timestamp}` not found."
        raise TableVersionNotFoundError(err_msg) from e
168
+
169
+
170
def _get_current_schema_for_meta(meta: TableMetadata) -> IcebergSchema:
    """Return the schema in ``meta`` whose ID is the current schema ID.

    Raises:
        ValueError: If no schema matches the current schema ID.
    """
    target_id = meta.current_schema_id
    candidates = (s for s in meta.schemas if s.schema_id == target_id)
    try:
        return next(candidates)
    except StopIteration as e:
        raise ValueError(f"No table schema with ID: {target_id}") from e
177
+
178
+
179
def _get_current_spec_for_meta(meta: TableMetadata) -> PartitionSpec:
    """Return the partition spec in ``meta`` matching its default spec ID.

    Raises:
        ValueError: If no partition spec matches the default spec ID.
    """
    target_id = meta.default_spec_id
    candidates = (s for s in meta.partition_specs if s.spec_id == target_id)
    try:
        return next(candidates)
    except StopIteration as e:
        raise ValueError(f"No table partition spec with ID: {target_id}") from e
186
+
187
+
188
def _get_current_sort_order_for_meta(meta: TableMetadata) -> SortOrder:
    """Return the sort order in ``meta`` matching its default sort order ID.

    Raises:
        ValueError: If no sort order matches the default sort order ID.
    """
    target_id = meta.default_sort_order_id
    candidates = (so for so in meta.sort_orders if so.order_id == target_id)
    try:
        return next(candidates)
    except StopIteration as e:
        raise ValueError(f"No table sort order with ID: {target_id}") from e
199
+
200
+
201
class TransformMapper(ModelMapper[IcebergTransform, Transform]):
    """Maps between Iceberg transforms and DeltaCAT transforms."""

    # Ordered (Iceberg type, DeltaCAT factory) pairs for parameterless
    # transforms; checked in the same order as an isinstance chain.
    _SIMPLE_MAP = (
        (IcebergIdentityTransform, IdentityTransform.of),
        (IcebergHourTransform, HourTransform.of),
        (IcebergDayTransform, DayTransform.of),
        (IcebergMonthTransform, MonthTransform.of),
        (IcebergYearTransform, YearTransform.of),
        (IcebergIcebergVoidTransform, VoidTransform.of),
    )

    @staticmethod
    def map(
        obj: Optional[IcebergTransform],
        **kwargs,
    ) -> Optional[Transform]:
        """Convert an Iceberg transform to its DeltaCAT equivalent.

        Unrecognized transforms map to ``UnknownTransform``.
        """
        if obj is None:
            return None
        for iceberg_type, factory in TransformMapper._SIMPLE_MAP:
            if isinstance(obj, iceberg_type):
                return factory()
        if isinstance(obj, IcebergBucketTransform):
            return BucketTransform.of(
                BucketTransformParameters.of(
                    num_buckets=obj.num_buckets,
                    bucketing_strategy=BucketingStrategy.ICEBERG,
                ),
            )
        if isinstance(obj, IcebergTruncateTransform):
            return TruncateTransform.of(
                TruncateTransformParameters.of(width=obj.width),
            )
        return UnknownTransform.of()

    @staticmethod
    def unmap(
        obj: Optional[Transform],
        **kwargs,
    ) -> Optional[IcebergTransform]:
        """Convert a DeltaCAT transform back to its Iceberg equivalent.

        Raises:
            ValueError: For a bucket transform with a non-Iceberg
                bucketing strategy.
        """
        if obj is None:
            return None
        parameterless = {
            TransformName.IDENTITY: IcebergIdentityTransform,
            TransformName.HOUR: IcebergHourTransform,
            TransformName.DAY: IcebergDayTransform,
            TransformName.MONTH: IcebergMonthTransform,
            TransformName.YEAR: IcebergYearTransform,
            TransformName.VOID: IcebergIcebergVoidTransform,
        }
        iceberg_type = parameterless.get(obj.name)
        if iceberg_type is not None:
            return iceberg_type()
        if obj.name == TransformName.BUCKET:
            parameters = BucketTransformParameters(obj.parameters)
            strategy = parameters.bucketing_strategy
            if strategy != BucketingStrategy.ICEBERG:
                err_msg = f"Unsupported Iceberg Bucketing Strategy: {strategy}."
                raise ValueError(err_msg)
            return IcebergBucketTransform(parameters.num_buckets)
        if obj.name == TransformName.TRUNCATE:
            parameters = TruncateTransformParameters(obj.parameters)
            return IcebergTruncateTransform(parameters.width)
        return IcebergUnknownTransform(obj.name)
265
+
266
+
267
class PartitionKeyMapper(ModelMapper[PartitionField, PartitionKey]):
    """Maps between Iceberg partition fields and DeltaCAT partition keys."""

    @staticmethod
    def map(
        obj: Optional[PartitionField],
        schema: IcebergSchema = IcebergSchema(),
        **kwargs,
    ) -> Optional[PartitionKey]:
        """Convert an Iceberg partition field to a DeltaCAT partition key.

        Args:
            obj: Iceberg partition field to convert, or None.
            schema: Iceberg schema the field's source ID refers to.

        Raises:
            ValueError: If no schema is given.
        """
        if obj is None:
            return None
        if not schema:
            err_msg = "Schema is required for Partition Field conversion."
            raise ValueError(err_msg)
        field = schema.find_field(name_or_id=obj.source_id)
        return PartitionKey.of(
            key=[field.name],
            name=obj.name,
            field_id=obj.field_id,
            transform=TransformMapper.map(obj.transform),
            native_object=obj,
        )

    @staticmethod
    def unmap(
        obj: Optional[PartitionKey],
        schema: IcebergSchema = IcebergSchema(),
        case_sensitive: bool = True,
    ) -> Optional[PartitionField]:
        """Convert a DeltaCAT partition key back to an Iceberg partition field.

        Args:
            obj: DeltaCAT partition key to convert, or None.
            schema: Iceberg schema used to resolve the key's field name.
            case_sensitive: Whether the field-name lookup is case-sensitive.

        Raises:
            ValueError: If no schema is given, or the key spans more than
                one field (Iceberg transforms exactly one field).
        """
        if obj is None:
            return None
        if not schema:
            err_msg = "Schema is required for Partition Key conversion."
            raise ValueError(err_msg)
        if len(obj.key) > 1:
            # Plain literal: the original f-string had no placeholders (F541).
            err_msg = "Iceberg only supports transforming 1 partition field."
            raise ValueError(err_msg)
        field = schema.find_field(
            name_or_id=obj.key[0],
            case_sensitive=case_sensitive,
        )
        return PartitionField(
            source_id=field.field_id,
            field_id=obj.id if obj.id else None,
            transform=TransformMapper.unmap(obj.transform),
            name=obj.name,
        )
312
+
313
+
314
class PartitionSchemeMapper(ModelMapper[PartitionSpec, PartitionScheme]):
    """Maps between Iceberg partition specs and DeltaCAT partition schemes."""

    @staticmethod
    def map(
        obj: Optional[PartitionSpec],
        schema: IcebergSchema = IcebergSchema(),
        name: Optional[str] = None,
    ) -> Optional[PartitionScheme]:
        """Convert an Iceberg partition spec into a DeltaCAT partition scheme.

        Raises:
            ValueError: If no schema is given.
        """
        if obj is None:
            return None
        if not schema:
            raise ValueError("Schema is required for Partition Spec conversion.")
        mapped_keys = [
            PartitionKeyMapper.map(spec_field, schema) for spec_field in obj.fields
        ]
        return PartitionScheme.of(
            keys=mapped_keys,
            name=name,
            scheme_id=str(obj.spec_id),
            native_object=obj,
        )

    @staticmethod
    def unmap(
        obj: Optional[PartitionScheme],
        schema: IcebergSchema = IcebergSchema(),
        case_sensitive: bool = True,
    ) -> Optional[PartitionSpec]:
        """Convert a DeltaCAT partition scheme back into an Iceberg partition spec.

        Raises:
            ValueError: If no schema is given.
        """
        if obj is None:
            return None
        if not schema:
            raise ValueError("Schema is required for Partition Scheme conversion.")
        unmapped_fields = [
            PartitionKeyMapper.unmap(scheme_key, schema, case_sensitive)
            for scheme_key in obj.keys
        ]
        return PartitionSpec(
            fields=unmapped_fields,
            spec_id=int(obj.id),
        )
352
+
353
+
354
class SortKeyMapper(ModelMapper[SortField, SortKey]):
    """Maps between Iceberg sort fields and DeltaCAT sort keys."""

    @staticmethod
    def unmap(
        obj: Optional[SortKey],
        schema: IcebergSchema = IcebergSchema(),
        case_sensitive: bool = True,
    ) -> Optional[SortField]:
        """Convert a DeltaCAT sort key to an Iceberg sort field.

        Args:
            obj: DeltaCAT sort key to convert, or None.
            schema: Iceberg schema used to resolve the key's field name.
            case_sensitive: Whether the field-name lookup is case-sensitive.

        Raises:
            ValueError: If no schema is given, or the key spans more than
                one field (Iceberg transforms exactly one field).
        """
        if obj is None:
            return None
        if not schema:
            err_msg = "Schema is required for Sort Key conversion."
            raise ValueError(err_msg)
        if len(obj.key) > 1:
            # Plain literal: the original f-string had no placeholders (F541).
            err_msg = "Iceberg only supports transforming 1 sort field."
            raise ValueError(err_msg)
        field = schema.find_field(
            name_or_id=obj.key[0],
            case_sensitive=case_sensitive,
        )
        # Explicit branches replace the original chained conditional
        # expressions; unknown orders map to None in both cases.
        if obj.sort_order is SortOrder.ASCENDING:
            direction = SortDirection.ASC
        elif obj.sort_order is SortOrder.DESCENDING:
            direction = SortDirection.DESC
        else:
            direction = None
        if obj.null_order is NullOrder.AT_START:
            null_order = IcebergNullOrder.NULLS_FIRST
        elif obj.null_order is NullOrder.AT_END:
            null_order = IcebergNullOrder.NULLS_LAST
        else:
            null_order = None
        return SortField(
            source_id=field.field_id,
            transform=TransformMapper.unmap(obj.transform),
            direction=direction,
            null_order=null_order,
        )

    @staticmethod
    def map(
        obj: Optional[SortField],
        schema: IcebergSchema = IcebergSchema(),
        **kwargs,
    ) -> Optional[SortKey]:
        """Convert an Iceberg sort field to a DeltaCAT sort key.

        Raises:
            ValueError: If no schema is given.
        """
        if obj is None:
            return None
        if not schema:
            err_msg = "Schema is required for Sort Field conversion."
            raise ValueError(err_msg)
        field = schema.find_field(name_or_id=obj.source_id)
        # NOTE(review): assumes obj.direction and obj.null_order are never
        # None — `x.value or "default"` only falls back when .value is
        # falsy, not when the attribute itself is None. Confirm upstream.
        return SortKey.of(
            key=[field.name],
            sort_order=SortOrder(obj.direction.value or "ascending"),
            null_order=NullOrder(obj.null_order.value or "first"),
            transform=TransformMapper.map(obj.transform),
            native_object=obj,
        )
413
+
414
+
415
class SortSchemeMapper(ModelMapper[IcebergSortOrder, SortScheme]):
    """Maps between Iceberg sort orders and DeltaCAT sort schemes."""

    @staticmethod
    def map(
        obj: Optional[IcebergSortOrder],
        schema: IcebergSchema = IcebergSchema(),
        name: Optional[str] = None,
        id: Optional[str] = None,
    ) -> Optional[SortScheme]:
        """Convert an Iceberg sort order to a DeltaCAT sort scheme.

        Args:
            obj: Iceberg sort order to convert, or None.
            schema: Iceberg schema the sort order's fields refer to.
            name: Optional name for the resulting sort scheme.
            id: Optional scheme ID for the resulting sort scheme.

        Raises:
            ValueError: If no schema is given.
        """
        if obj is None:
            return None
        elif not schema:
            err_msg = "Schema is required for Sort Order conversion."
            raise ValueError(err_msg)
        keys = [SortKeyMapper.map(field, schema) for field in obj.fields]
        return SortScheme.of(
            keys=keys,
            name=name,
            scheme_id=id,
            native_object=obj,
        )

    @staticmethod
    def unmap(
        obj: Optional[SortScheme],
        schema: IcebergSchema = IcebergSchema(),
        case_sensitive: bool = True,
    ) -> Optional[IcebergSortOrder]:
        """Convert a DeltaCAT sort scheme back to an Iceberg sort order.

        Raises:
            ValueError: If no schema is given.
        """
        if obj is None:
            return None
        if not schema:
            err_msg = "Schema is required for Sort Scheme conversion."
            raise ValueError(err_msg)
        # BUG FIX: iterate the scheme's sort keys, not the scheme object
        # itself. Iterating `obj` directly yields the model's dict keys
        # rather than SortKey objects; PartitionSchemeMapper.unmap iterates
        # `obj.keys` for the analogous conversion.
        fields = [SortKeyMapper.unmap(key, schema, case_sensitive) for key in obj.keys]
        return IcebergSortOrder(fields=fields)
449
+
450
+
451
class SchemaMapper(ModelMapper[IcebergSchema, Schema]):
    """Maps between Iceberg schemas and DeltaCAT schemas via PyArrow."""

    @staticmethod
    def map(
        obj: Optional[IcebergSchema],
        stream_locator: Optional[StreamLocator] = None,
        **kwargs,
    ) -> Optional[Schema]:
        """Convert an Iceberg schema to a DeltaCAT schema.

        Goes through PyArrow (``schema_to_pyarrow``) so that DeltaCAT
        fields can recover field IDs from the PyArrow schema metadata,
        then re-attaches per-field Iceberg docs and defaults.

        Args:
            obj: Iceberg schema to convert, or None.
            stream_locator: Currently unused by this implementation.

        Returns:
            The equivalent DeltaCAT schema, or None if ``obj`` is None.
        """
        if obj is None:
            return None
        schema: pa.Schema = schema_to_pyarrow(obj)
        # use DeltaCAT fields to extract field IDs from PyArrow schema metadata
        fields = [Field.of(field) for field in schema]
        final_fields = []
        for field in fields:
            # Look the field back up in the Iceberg schema to copy doc,
            # defaults, and merge-key status onto the DeltaCAT field.
            iceberg_field = obj.find_field(field.id)
            final_field = Field.of(
                field=field.arrow,
                field_id=field.id,
                is_merge_key=field.id in obj.identifier_field_ids,
                doc=iceberg_field.doc,
                past_default=iceberg_field.initial_default,
                future_default=iceberg_field.write_default,
                native_object=iceberg_field,
            )
            final_fields.append(final_field)
        # TODO(pdames): Traverse DeltaCAT schemas to find one already related
        # to this Iceberg schema.
        return Schema.of(
            schema=final_fields,
            native_object=obj,
        )

    @staticmethod
    def unmap(
        obj: Optional[Schema], stream_locator: Optional[StreamLocator] = None, **kwargs
    ) -> Optional[IcebergSchema]:
        """Convert a DeltaCAT schema back to an Iceberg schema.

        Only PyArrow-backed DeltaCAT schemas are supported; the PyArrow
        schema is converted via ``pyarrow_to_schema`` and then each field
        is rebuilt with the DeltaCAT field's doc and defaults.

        Args:
            obj: DeltaCAT schema to convert, or None.
            stream_locator: Currently unused by this implementation.

        Raises:
            TypeError: If ``obj.arrow`` is not a ``pa.Schema``.
        """
        if obj is None:
            return None
        if isinstance(obj.arrow, pa.Schema):
            schema = pyarrow_to_schema(obj.arrow)
            final_fields = []
            for field in obj.field_ids_to_fields.values():
                iceberg_field = schema.find_field(field.id)
                # Rebuild each Iceberg field, preferring the DeltaCAT
                # field's doc and default values over the converted ones.
                final_field = NestedField(
                    field_id=iceberg_field.field_id,
                    name=iceberg_field.name,
                    field_type=iceberg_field.field_type,
                    required=iceberg_field.required,
                    doc=field.doc,
                    initial_default=field.past_default,
                    write_default=field.future_default,
                )
                final_fields.append(final_field)
            # TODO (pmingshi): this code was changed as a hack to get schema conversion working
            # it still needs more testing
            # NOTE(review): merge keys are intentionally dropped here
            # (identifier_field_ids is hard-coded empty) — see the
            # commented-out line below; confirm before relying on
            # round-trip fidelity of merge keys.
            iceberg_schema = IcebergSchema(
                fields=final_fields,
                schema_id=INITIAL_SCHEMA_ID,
                # identifier_field_ids=obj.merge_keys,
                identifier_field_ids=[],
            )
        else:
            err_msg = (
                f"unsupported schema type: `{type(obj.arrow)}`. "
                f"expected schema type: {pa.Schema}"
            )
            raise TypeError(err_msg)
        return iceberg_schema
519
+
520
+
521
class NamespaceLocatorMapper(
    ModelMapper[Union[Identifier, IcebergNamespace], NamespaceLocator]
):
    """Maps between Iceberg namespaces/identifiers and DeltaCAT namespace locators."""

    @staticmethod
    def map(
        obj: Optional[Union[Identifier, IcebergNamespace]], **kwargs
    ) -> Optional[NamespaceLocator]:
        """Build a DeltaCAT namespace locator from an Iceberg namespace or
        identifier tuple.

        Raises:
            NamespaceNotFoundError: If no namespace can be extracted.
        """
        namespace = None
        if obj is None:
            return None
        elif isinstance(obj, IcebergNamespace):
            # NOTE(review): skips the first element of the namespace root —
            # presumably a reserved prefix; confirm against pyiceberg.
            namespace = NAMESPACE_SEPARATOR.join(obj.namespace.root[1:])
        elif isinstance(obj, tuple):
            # Fix: check against the builtin `tuple`; using `typing.Tuple`
            # in isinstance checks is deprecated.
            # In Iceberg, tuple identifiers are of the form (namespace) or
            # (namespace, table), so just take the first element.
            namespace = obj[0]
        if not namespace:
            err_msg = f"No namespace in identifier: {obj}"
            raise NamespaceNotFoundError(err_msg)
        return NamespaceLocator.of(namespace)

    @staticmethod
    def unmap(obj: Optional[NamespaceLocator], **kwargs) -> Optional[Identifier]:
        """Convert a namespace locator back to an Iceberg identifier tuple.

        NOTE(review): splits on "." while map() joins with
        NAMESPACE_SEPARATOR — confirm these are intended to be symmetric.
        """
        if obj is None:
            return None
        return tuple(obj.namespace.split("."))
547
+
548
+
549
class NamespaceMapper(ModelMapper[Union[Identifier, IcebergNamespace], Namespace]):
    """Maps between Iceberg namespaces/identifiers and DeltaCAT namespaces."""

    @staticmethod
    def map(
        obj: Optional[Union[Identifier, IcebergNamespace]], **kwargs
    ) -> Optional[Namespace]:
        """Wrap an Iceberg namespace/identifier in a DeltaCAT namespace model."""
        if obj is None:
            return None
        return Namespace.of(
            locator=NamespaceLocatorMapper.map(obj),
            properties=None,
        )

    @staticmethod
    def unmap(
        obj: Optional[Namespace],
        **kwargs,
    ) -> Optional[Identifier]:
        """Convert a DeltaCAT namespace back into an Iceberg identifier."""
        return None if obj is None else NamespaceLocatorMapper.unmap(obj.locator)
567
+
568
+
569
class TableLocatorMapper(ModelMapper[Union[Identifier, TableIdentifier], TableLocator]):
    """Maps between Iceberg table identifiers and DeltaCAT TableLocators."""

    @staticmethod
    def map(
        obj: Optional[Union[Identifier, TableIdentifier]], **kwargs
    ) -> Optional[TableLocator]:
        """Build a TableLocator from an Iceberg table identifier.

        Raises TableNotFoundError when no table name is present in obj.
        """
        if obj is None:
            return None
        namespace_locator = NamespaceLocatorMapper.map(obj)
        if isinstance(obj, TableIdentifier):
            table_name = obj.name
        else:
            table_name = Catalog.table_name_from(obj)
        if not table_name:
            raise TableNotFoundError(f"No table name in identifier: {obj}")
        return TableLocator.of(namespace_locator, table_name)

    @staticmethod
    def unmap(
        obj: Optional[TableLocator], catalog_name: Optional[str] = None, **kwargs
    ) -> Optional[Union[Identifier, TableIdentifier]]:
        """Rebuild a tuple identifier of (*namespace parts, table name)."""
        if obj is None:
            return None
        # catalog_name is accepted for interface parity but not used here.
        return (*obj.namespace.split("."), obj.table_name)
+
595
+
596
class TableMapper(OneWayModelMapper[IcebergTable, Table]):
    """One-way mapping from a native Iceberg table to a DeltaCAT Table."""

    @staticmethod
    def map(
        obj: Optional[IcebergTable],
        **kwargs,
    ) -> Optional[Table]:
        """Wrap obj in a Table, retaining it as the native object."""
        if obj is None:
            return None
        return Table.of(
            locator=TableLocatorMapper.map(obj.name()),
            description=None,
            properties=None,
            native_object=obj,
        )
+
612
+
613
class TableVersionLocatorMapper(OneWayModelMapper[IcebergTable, TableVersionLocator]):
    """One-way mapping from an Iceberg table to a DeltaCAT TableVersionLocator."""

    @staticmethod
    def map(
        obj: Optional[IcebergTable], timestamp: Optional[int] = None, **kwargs
    ) -> Optional[TableVersionLocator]:
        """Resolve the table version (optionally as of timestamp) into a locator."""
        if obj is None:
            return None
        version = _resolve_table_version(obj.metadata, timestamp)
        table_locator = TableLocatorMapper.map(obj.name())
        return TableVersionLocator.of(
            table_locator=table_locator,
            table_version=str(version),
        )
+
626
+
627
class TableVersionMapper(OneWayModelMapper[IcebergTable, TableVersion]):
    """One-way mapping from an Iceberg table to a DeltaCAT TableVersion."""

    @staticmethod
    def map(
        obj: Optional[IcebergTable],
        timestamp: Optional[int] = None,
        catalog_properties: Dict[str, str] = EMPTY_DICT,
        **kwargs,
    ) -> Optional[TableVersion]:
        """Resolve version metadata and map its schema, partitioning, and sort order.

        The resolved Iceberg table metadata is retained as the native object.
        """
        if obj is None:
            return None
        metadata = _resolve_table_version_metadata(obj, timestamp, catalog_properties)
        current_schema = _get_current_schema_for_meta(metadata)
        current_spec = _get_current_spec_for_meta(metadata)
        current_sort_order = _get_current_sort_order_for_meta(metadata)
        return TableVersion.of(
            locator=TableVersionLocatorMapper.map(obj, timestamp),
            schema=SchemaMapper.map(current_schema),
            partition_scheme=PartitionSchemeMapper.map(current_spec, current_schema),
            description=None,
            properties=obj.properties,
            content_types=None,
            sort_scheme=SortSchemeMapper.map(current_sort_order, current_schema),
            native_object=metadata,
        )
+
652
+
653
class StreamLocatorMapper(OneWayModelMapper[IcebergTable, StreamLocator]):
    """One-way mapping from an Iceberg table snapshot to a DeltaCAT StreamLocator."""

    @staticmethod
    def map(
        obj: Optional[IcebergTable],
        metadata_timestamp: Optional[int] = None,
        snapshot_id: Optional[int] = None,
        catalog_properties: Dict[str, str] = EMPTY_DICT,
        **kwargs,
    ) -> Optional[StreamLocator]:
        """Resolve a snapshot and use its snapshot id as the stream id."""
        if obj is None:
            return None
        metadata = _resolve_table_version_metadata(
            obj, metadata_timestamp, catalog_properties
        )
        snapshot = _resolve_stream_snapshot(metadata, snapshot_id)
        table_version_locator = TableVersionLocatorMapper.map(obj, metadata_timestamp)
        return StreamLocator.of(
            table_version_locator=table_version_locator,
            stream_id=str(snapshot.snapshot_id),
            stream_format=StreamFormat.ICEBERG.value,
        )
+
676
+
677
class StreamMapper(OneWayModelMapper[IcebergTable, Stream]):
    """One-way mapping from an Iceberg table snapshot to a DeltaCAT Stream."""

    @staticmethod
    def map(
        obj: Optional[IcebergTable],
        # TODO (pdames): infer state from Iceberg metadata?
        state: Optional[CommitState] = CommitState.COMMITTED,
        metadata_timestamp: Optional[int] = None,
        snapshot_id: Optional[int] = None,
        catalog_properties: Dict[str, str] = EMPTY_DICT,
        **kwargs,
    ) -> Optional[Stream]:
        """Map the resolved snapshot, schema, and partition spec into a Stream.

        Returns an empty Stream (no locator or partition scheme) when the table
        metadata contains no snapshots yet. The resolved snapshot is retained as
        the native object.
        """
        if obj is None:
            return None
        metadata = _resolve_table_version_metadata(
            obj, metadata_timestamp, catalog_properties
        )
        if not metadata.snapshots:
            # A table with no snapshots has no stream to point at yet.
            return Stream.of(locator=None, partition_scheme=None)
        snapshot = _resolve_stream_snapshot(metadata, snapshot_id)
        current_schema = _get_current_schema_for_meta(metadata)
        current_spec = _get_current_spec_for_meta(metadata)
        previous_stream_id = (
            str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None
        )
        return Stream.of(
            locator=StreamLocatorMapper.map(
                obj, metadata_timestamp, snapshot_id, catalog_properties
            ),
            partition_scheme=PartitionSchemeMapper.map(current_spec, current_schema),
            state=state,
            previous_stream_id=previous_stream_id,
            native_object=snapshot,
        )