deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/storage/model/schema.py
@@ -3,21 +3,45 @@ from __future__ import annotations
 
 import logging
 import copy
+import base64
 
 import msgpack
-from typing import Optional, Any, Dict, Union, List, Callable, Tuple
+from typing import Optional, Any, Dict, Union, List, Callable, Tuple, TYPE_CHECKING
 
 import pyarrow as pa
 from pyarrow import ArrowInvalid
+import pandas as pd
+import numpy as np
+
+# Daft DataFrame support - required for core functionality
+import daft
+from daft import DataFrame as DaftDataFrame
 
 from deltacat.constants import BYTES_PER_KIBIBYTE
+from deltacat.exceptions import (
+    SchemaCompatibilityError,
+    SchemaValidationError,
+)
 from deltacat.storage.model.types import (
     SchemaConsistencyType,
     SortOrder,
     NullOrder,
 )
+from deltacat.types.tables import (
+    get_table_length,
+    to_pyarrow,
+    from_pyarrow,
+    get_dataset_type,
+    SchemaEvolutionMode,
+)
+from deltacat.types.media import DatasetType
+
+if TYPE_CHECKING:
+    from deltacat.storage.model.sort_key import SortKey
+
 from deltacat import logs
 
+
 # PyArrow Field Metadata Key used to set the Field ID when writing to Parquet.
 # See: https://arrow.apache.org/docs/cpp/parquet.html#parquet-field-id
 PARQUET_FIELD_ID_KEY_NAME = b"PARQUET:field_id"
@@ -53,6 +77,52 @@ SUBSCHEMAS_KEY_NAME = b"DELTACAT:subschemas"
 # Apache Iceberg, which sets aside this range for reserved fields
 MAX_FIELD_ID_EXCLUSIVE = 2147483447
 
+
+def _encode_metadata_value(value: Any) -> bytes:
+    """
+    Encode a value for storage in PyArrow field metadata.
+
+    Uses msgpack for efficient serialization, then base64 encoding to ensure
+    UTF-8 compatibility with all Parquet readers (Polars, Daft, etc.).
+
+    Args:
+        value: The value to encode
+
+    Returns:
+        Base64-encoded msgpack bytes that are UTF-8 safe
+    """
+    msgpack_bytes = msgpack.dumps(value)
+    return base64.b64encode(msgpack_bytes)
+
+
+def _decode_metadata_value(encoded_bytes: bytes) -> Any:
+    """
+    Decode a value from PyArrow field metadata.
+
+    Handles both new base64-encoded format and legacy raw msgpack format
+    for backward compatibility.
+
+    Args:
+        encoded_bytes: The encoded bytes from field metadata
+
+    Returns:
+        The decoded value
+
+    Raises:
+        ValueError: If the data cannot be decoded
+    """
+    try:
+        # Try new base64-encoded format first
+        msgpack_bytes = base64.b64decode(encoded_bytes)
+        return msgpack.loads(msgpack_bytes)
+    except Exception:
+        try:
+            # Fall back to legacy raw msgpack format
+            return msgpack.loads(encoded_bytes)
+        except Exception as e:
+            raise ValueError(f"Failed to decode metadata value: {e}") from e
+
+
 # Default name assigned to the base, unnamed single schema when a new named
 # subschema is first added.
 BASE_SCHEMA_NAME = "_base"
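The two helpers above depend only on msgpack and base64, so their round trip, including the legacy fallback that keeps pre-2.0.0b12 field metadata readable, can be illustrated with a small standalone sketch. This mirrors _encode_metadata_value / _decode_metadata_value rather than importing them and is not part of the diff:

import base64
import msgpack

def encode(value):
    # New format: msgpack payload wrapped in base64 so the bytes stay UTF-8 safe.
    return base64.b64encode(msgpack.dumps(value))

def decode(encoded):
    try:
        # New base64-wrapped format first...
        return msgpack.loads(base64.b64decode(encoded))
    except Exception:
        # ...then raw msgpack as written by older releases.
        return msgpack.loads(encoded)

legacy = msgpack.dumps({"x": 1})           # what 2.0.0b10 stored directly
assert decode(encode({"x": 1})) == {"x": 1}
assert decode(legacy) == {"x": 1}          # old metadata still decodes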
@@ -64,6 +134,86 @@ FieldName = str
 NestedFieldName = List[str]
 FieldLocator = Union[FieldName, NestedFieldName, FieldId]
 
+
+class SchemaUpdateOperation(tuple):
+    """
+    Represents a single schema update operation (add, remove, or update field).
+
+    This class inherits from tuple and stores:
+    - operation: str ("add", "remove", "update")
+    - field_locator: Optional[FieldLocator] (name, path, or ID)
+    - field: Optional[Field] (the field data for add/update operations)
+    """
+
+    @staticmethod
+    def add_field(field: Field) -> SchemaUpdateOperation:
+        """Create an operation to add a new field."""
+        return SchemaUpdateOperation(("add", None, field))
+
+    @staticmethod
+    def remove_field(field_locator: FieldLocator) -> SchemaUpdateOperation:
+        """Create an operation to remove an existing field."""
+        return SchemaUpdateOperation(("remove", field_locator, None))
+
+    @staticmethod
+    def update_field(
+        field_locator: FieldLocator, field: Field
+    ) -> SchemaUpdateOperation:
+        """Create an operation to update an existing field."""
+        return SchemaUpdateOperation(("update", field_locator, field))
+
+    @property
+    def operation(self) -> str:
+        """The operation type: 'add', 'remove', or 'update'."""
+        return self[0]
+
+    @property
+    def field_locator(self) -> Optional[FieldLocator]:
+        """The field locator (name, path, or ID)."""
+        return self[1]
+
+    @property
+    def field(self) -> Optional[Field]:
+        """The field data (None for remove operations)."""
+        return self[2]
+
+    def field_locator_matches(self, other_locator: FieldLocator) -> bool:
+        """Check if this operation's field_locator matches the given field_locator."""
+        return SchemaUpdate._field_locators_match(self.field_locator, other_locator)
+
+
+class SchemaUpdateOperations(List[SchemaUpdateOperation]):
+    """
+    A list of schema update operations that can be applied to a schema.
+
+    This class inherits from List[SchemaUpdateOperation] and provides convenience
+    methods for creating and managing schema update operations.
+    """
+
+    @staticmethod
+    def of(operations: List[SchemaUpdateOperation]) -> SchemaUpdateOperations:
+        """Create a SchemaUpdateOperations list from a list of operations."""
+        typed_operations = SchemaUpdateOperations()
+        for operation in operations:
+            if operation is not None and not isinstance(
+                operation, SchemaUpdateOperation
+            ):
+                operation = SchemaUpdateOperation(operation)
+            typed_operations.append(operation)
+        return typed_operations
+
+    def __getitem__(self, item):
+        """Override to ensure items are properly typed as SchemaUpdateOperation."""
+        val = super().__getitem__(item)
+        if val is not None and not isinstance(val, SchemaUpdateOperation):
+            self[item] = val = SchemaUpdateOperation(val)
+        return val
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self[i]  # This triggers __getitem__ conversion
+
+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
@@ -224,6 +374,10 @@ class Field(dict):
     def merge_order(self) -> Optional[MergeOrder]:
         return Field._merge_order(self.arrow)
 
+    @property
+    def is_event_time(self) -> Optional[bool]:
+        return Field._is_event_time(self.arrow)
+
     @property
     def doc(self) -> Optional[str]:
         return Field._doc(self.arrow)
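A hedged usage sketch for the SchemaUpdateOperation / SchemaUpdateOperations containers introduced earlier in this file; it assumes they are importable from deltacat.storage.model.schema (the module patched by this diff) and is illustrative rather than a verified API surface:

import pyarrow as pa
from deltacat.storage.model.schema import (
    Field,
    SchemaUpdateOperation,
    SchemaUpdateOperations,
)

name_field = Field.of(pa.field("name", pa.string()))
ops = SchemaUpdateOperations.of(
    [
        SchemaUpdateOperation.add_field(name_field),
        SchemaUpdateOperation.remove_field("legacy_column"),
    ]
)
# __iter__/__getitem__ re-wrap plain tuples as SchemaUpdateOperation instances.
for op in ops:
    print(op.operation, op.field_locator)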
@@ -273,7 +427,7 @@
         merge_order = None
         if field.metadata:
             bytes_val = field.metadata.get(FIELD_MERGE_ORDER_KEY_NAME)
-            merge_order = msgpack.loads(bytes_val) if bytes_val else None
+            merge_order = _decode_metadata_value(bytes_val) if bytes_val else None
         return merge_order
 
     @staticmethod
@@ -289,7 +443,7 @@
         default = None
         if field.metadata:
             bytes_val = field.metadata.get(FIELD_PAST_DEFAULT_KEY_NAME)
-            default = msgpack.loads(bytes_val) if bytes_val else None
+            default = _decode_metadata_value(bytes_val) if bytes_val else None
         return default
 
     @staticmethod
@@ -297,7 +451,7 @@
         default = None
         if field.metadata:
             bytes_val = field.metadata.get(FIELD_FUTURE_DEFAULT_KEY_NAME)
-            default = msgpack.loads(bytes_val) if bytes_val else None
+            default = _decode_metadata_value(bytes_val) if bytes_val else None
         return default
 
     @staticmethod
@@ -309,19 +463,53 @@
         return t
 
     @staticmethod
-    def _validate_merge_key(field: pa.Field):
-        if not (pa.types.is_string(field.type) or pa.types.is_primitive(field.type)):
-            raise ValueError(f"Merge key {field} must be a primitive type.")
+    def _validate_merge_key(
+        field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
+    ):
+        # Note: large_strings were explicitly allowed for compatibility with PyIceberg Iceberg Schema to PyArrow converter
+        if not (
+            pa.types.is_string(field.type)
+            or pa.types.is_primitive(field.type)
+            or pa.types.is_large_string(field.type)
+        ):
+            raise ValueError(
+                f"Merge key {field} must be a primitive type or large string."
+            )
+
+        # Merge key fields must have VALIDATE consistency type to prevent type promotion
+        if (
+            consistency_type is not None
+            and consistency_type != SchemaConsistencyType.VALIDATE
+        ):
+            raise ValueError(
+                f"Merge key field '{field.name}' must have VALIDATE consistency type, "
+                f"got {consistency_type}. Type promotion is not allowed for merge keys."
+            )
+
         if pa.types.is_floating(field.type):
             raise ValueError(f"Merge key {field} cannot be floating point.")
 
     @staticmethod
-    def _validate_merge_order(field: pa.Field):
+    def _validate_merge_order(
+        field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
+    ):
         if not pa.types.is_primitive(field.type):
             raise ValueError(f"Merge order {field} must be a primitive type.")
 
+        # Merge order fields must have VALIDATE consistency type to prevent type promotion
+        if (
+            consistency_type is not None
+            and consistency_type != SchemaConsistencyType.VALIDATE
+        ):
+            raise ValueError(
+                f"Merge order field '{field.name}' must have VALIDATE consistency type, "
+                f"got {consistency_type}. Type promotion is not allowed for merge order fields."
+            )
+
     @staticmethod
-    def _validate_event_time(field: pa.Field):
+    def _validate_event_time(
+        field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
+    ):
         if (
             not pa.types.is_integer(field.type)
             and not pa.types.is_floating(field.type)
@@ -329,6 +517,16 @@
         ):
             raise ValueError(f"Event time {field} must be numeric or date type.")
 
+        # Event time fields must have VALIDATE consistency type to prevent type promotion
+        if (
+            consistency_type is not None
+            and consistency_type != SchemaConsistencyType.VALIDATE
+        ):
+            raise ValueError(
+                f"Event time field '{field.name}' must have VALIDATE consistency type, "
+                f"got {consistency_type}. Type promotion is not allowed for event time fields."
+            )
+
     @staticmethod
     def _validate_default(
         default: Optional[Any],
@@ -354,22 +552,31 @@
         future_default: Optional[Any],
         consistency_type: Optional[SchemaConsistencyType],
     ) -> pa.Field:
+        # Auto-set future_default to past_default if past_default exists but future_default doesn't
+        if past_default is not None and future_default is None:
+            future_default = past_default
+
+        # Default critical columns (merge key, merge order, event time) to VALIDATE consistency type
+        # to prevent type promotion which could break merge semantics
+        if consistency_type is None and (is_merge_key or merge_order or is_event_time):
+            consistency_type = SchemaConsistencyType.VALIDATE
+
         meta = {}
         if is_merge_key:
-            Field._validate_merge_key(field)
+            Field._validate_merge_key(field, consistency_type)
             meta[FIELD_MERGE_KEY_NAME] = str(is_merge_key)
         if merge_order:
-            Field._validate_merge_order(field)
-            meta[FIELD_MERGE_ORDER_KEY_NAME] = msgpack.dumps(merge_order)
+            Field._validate_merge_order(field, consistency_type)
+            meta[FIELD_MERGE_ORDER_KEY_NAME] = _encode_metadata_value(merge_order)
         if is_event_time:
-            Field._validate_event_time(field)
+            Field._validate_event_time(field, consistency_type)
             meta[FIELD_EVENT_TIME_KEY_NAME] = str(is_event_time)
         if past_default is not None:
             Field._validate_default(past_default, field)
-            meta[FIELD_PAST_DEFAULT_KEY_NAME] = msgpack.dumps(past_default)
+            meta[FIELD_PAST_DEFAULT_KEY_NAME] = _encode_metadata_value(past_default)
         if future_default is not None:
             Field._validate_default(future_default, field)
-            meta[FIELD_FUTURE_DEFAULT_KEY_NAME] = msgpack.dumps(future_default)
+            meta[FIELD_FUTURE_DEFAULT_KEY_NAME] = _encode_metadata_value(future_default)
         if field_id is not None:
            meta[PARQUET_FIELD_ID_KEY_NAME] = str(field_id)
         if doc is not None:
@@ -383,6 +590,217 @@ class Field(dict):
             metadata=meta,
         )
 
+    def validate(
+        self,
+        column_type: pa.DataType,
+    ) -> None:
+        """Validate that data in a column matches this field's type and constraints.
+
+        Args:
+            column_datatype: PyArrow DataType containing the column data to validate
+
+        Raises:
+            ValueError: If data doesn't match field requirements.
+        """
+        # Check if the data type matches the field type
+        if not column_type.equals(self.arrow.type):
+            raise SchemaValidationError(
+                f"Data type mismatch for field '{self.arrow.name}': "
+                f"expected {self.arrow.type}, got {column_type}"
+            )
+
+    def coerce(
+        self,
+        column_data: pa.Array,
+    ) -> pa.Array:
+        """Coerce data in a column to match this field's type.
+
+        Args:
+            column_data: PyArrow Array containing the column data to coerce
+
+        Returns:
+            pa.Array: Coerced data matching this field's type
+
+        Raises:
+            ValueError: If data cannot be coerced to the field type
+        """
+        try:
+            return pa.compute.cast(column_data, self.arrow.type)
+        except (pa.ArrowTypeError, pa.ArrowInvalid) as e:
+            raise SchemaValidationError(
+                f"Cannot coerce data for field '{self.arrow.name}' "
+                f"from {column_data.type} to {self.arrow.type}: {e}"
+            )
+
+    def coerce_daft(
+        self,
+        dataframe: DaftDataFrame,
+        column_name: str,
+        target_type: Optional[pa.DataType] = None,
+    ) -> DaftDataFrame:
+        """Coerce a Daft DataFrame column to match this field's type.
+
+        Args:
+            dataframe: Daft DataFrame containing the column to coerce
+            column_name: Name of the column to coerce
+            target_type: Optional target type to coerce to (defaults to self.arrow.type)
+
+        Returns:
+            DaftDataFrame: DataFrame with the coerced column
+
+        Raises:
+            SchemaValidationError: If data cannot be coerced to the field type
+        """
+        target_arrow_type = target_type or self.arrow.type
+        target_daft_type = daft.DataType.from_arrow_type(target_arrow_type)
+
+        try:
+            # Use Daft's cast expression to coerce the column
+            coerced_dataframe = dataframe.with_column(
+                column_name, daft.col(column_name).cast(target_daft_type)
+            )
+            return coerced_dataframe
+        except Exception as e:
+            raise SchemaValidationError(
+                f"Cannot coerce Daft column '{column_name}' for field '{self.arrow.name}' "
+                f"to type {target_arrow_type}: {e}"
+            )
+
+    def promote_type_if_needed(
+        self,
+        column_data: pa.Array,
+    ) -> Tuple[pa.Array, bool]:
+        """Promote field type to accommodate new data when consistency type is NONE.
+        Use PyArrow's unify_schemas to find the most permissive type that can accommodate both
+        the current and new data types.
+
+        Args:
+            column_data: PyArrow Array containing the column data
+
+        Returns:
+            Tuple[pa.Array, bool]: (data, type_was_promoted)
+            - data: Either original data or data cast to promoted type
+            - type_was_promoted: True if field type should be updated
+
+        Raises:
+            SchemaValidationError: If column data cannot be promoted to a unified type
+        """
+        current_type = self.arrow.type
+        data_type = column_data.type
+
+        # Early return if types are already compatible
+        if current_type.equals(data_type):
+            return column_data, False
+
+        # Find the promoted type that can accommodate both types
+        promoted_type = self._find_promoted_type(current_type, data_type)
+
+        # Handle type coercion vs promotion
+        if promoted_type.equals(current_type):
+            return self._coerce_to_current_type(column_data, current_type)
+        else:
+            return self._promote_to_new_type(column_data, promoted_type)
+
+    def _coerce_to_current_type(
+        self,
+        column_data: pa.Array,
+        current_type: pa.DataType,
+    ) -> Tuple[pa.Array, bool]:
+        """Try to coerce data to current type without promoting the field type."""
+        try:
+            coerced_data = pa.compute.cast(column_data, current_type)
+            return coerced_data, False
+        except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
+            return column_data, False
+
+    def _promote_to_new_type(
+        self,
+        column_data: pa.Array,
+        promoted_type: pa.DataType,
+    ) -> Tuple[pa.Array, bool]:
+        """Try to cast data to the promoted type."""
+        try:
+            promoted_data = pa.compute.cast(column_data, promoted_type)
+            return promoted_data, True
+        except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
+            # If direct cast fails, the promotion is not valid
+            raise SchemaValidationError(
+                f"Cannot cast data for field '{self.arrow.name}' from type {column_data.type} "
+                f"to promoted type {promoted_type}"
+            )
+
+    def _cast_default_to_promoted_type(
+        self,
+        default_value: Any,
+        promoted_type: pa.DataType,
+    ) -> Optional[Any]:
+        """Cast a default value to match a promoted type.
+
+        Args:
+            default_value: The original default value
+            promoted_type: The new promoted type
+
+        Returns:
+            The default value cast to the promoted type.
+
+        Raises:
+            SchemaValidationError: If the default value cannot be cast to the promoted type
+        """
+        if default_value is None:
+            return None
+
+        try:
+            # Create a scalar with the original default value
+            original_scalar = pa.scalar(default_value)
+            # Cast to the promoted type
+            promoted_scalar = pa.compute.cast(original_scalar, promoted_type)
+            # Return the Python value
+            return promoted_scalar.as_py()
+        except (
+            pa.ArrowTypeError,
+            pa.ArrowInvalid,
+            pa.ArrowNotImplementedError,
+            TypeError,
+            ValueError,
+        ):
+            raise SchemaValidationError(
+                f"Cannot cast default value `{default_value}` to promoted type {promoted_type}"
+            )
+
+    def _find_promoted_type(
+        self,
+        current_type: pa.DataType,
+        new_type: pa.DataType,
+    ) -> Optional[pa.DataType]:
+        """Find the most specific type that can accommodate both current and new types
+        using PyArrow's unify_schemas with permissive promotion options.
+
+        Returns:
+            The promoted type.
+
+        Raises:
+            SchemaValidationError: If the types cannot be unified.
+        """
+        try:
+            # Create schemas with the same field name but different types
+            schema1 = pa.schema([("field", current_type)])
+            schema2 = pa.schema([("field", new_type)])
+
+            # Use PyArrow's built-in permissive type promotion
+            unified_schema = pa.unify_schemas(
+                [schema1, schema2], promote_options="permissive"
+            )
+
+            # Return the promoted type
+            return unified_schema.field("field").type
+
+        except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
+            # If unification fails, no promotion is possible
+            raise SchemaValidationError(
+                f"Cannot unify types for field '{self.arrow.name}': "
+                f"current type {current_type} incompatible with new data type {new_type}"
+            )
+
 
 SingleSchema = Union[List[Field], pa.Schema]
 MultiSchema = Union[Dict[SchemaName, List[Field]], Dict[SchemaName, pa.Schema]]
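The promotion path above ultimately leans on PyArrow's permissive schema unification; a minimal pyarrow-only sketch of that mechanism (nothing here is DeltaCAT-specific):

import pyarrow as pa
import pyarrow.compute as pc

current = pa.schema([("field", pa.int32())])
incoming = pa.schema([("field", pa.float64())])

# _find_promoted_type relies on the same call: unify two single-field schemas
# and read back the widened type ("permissive" allows e.g. int32 -> double).
unified = pa.unify_schemas([current, incoming], promote_options="permissive")
promoted_type = unified.field("field").type

# _promote_to_new_type then casts the existing data to the promoted type.
promoted = pc.cast(pa.array([1, 2, 3], type=pa.int32()), promoted_type)
print(promoted_type, promoted.type)  # double double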
@@ -432,6 +850,8 @@ class Schema(dict):
         Returns:
             A new DeltaCAT Schema.
         """
+        if schema_id and schema_id < 0:
+            raise ValueError(f"Schema ID must be non-negative, got {schema_id}")
         # normalize the input as a unified pyarrow schema
         # if the input included multiple subschemas, then also save a mapping
         # from each subschema to its unique field names
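A hedged sketch of the new schema_id guard added above; it assumes Schema and Field are importable from deltacat.storage.model.schema and that Schema.of accepts the schema_id keyword used elsewhere in this file:

import pyarrow as pa
from deltacat.storage.model.schema import Field, Schema

try:
    # Negative IDs are now rejected before any schema normalization happens.
    Schema.of([Field.of(pa.field("id", pa.int64()))], schema_id=-1)
except ValueError as err:
    print(err)  # Schema ID must be non-negative, got -1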
@@ -454,6 +874,8 @@ class Schema(dict):
             visit=Schema._populate_fields,
             visitor_dict=visitor_dict,
         )
+        # recalculate max field ID after field population (in case new field IDs were assigned)
+        max_field_id = max(field_ids_to_fields.keys()) if field_ids_to_fields else 0
         if schema.metadata:
             schema_metadata.update(schema.metadata)
         # populate merge keys
@@ -477,7 +899,9 @@
             schema_metadata[SCHEMA_ID_KEY_NAME] = str(schema_id)
         if schema_metadata.get(SCHEMA_ID_KEY_NAME) is None:
             schema_metadata[SCHEMA_ID_KEY_NAME] = str(0)
-        schema_metadata[SUBSCHEMAS_KEY_NAME] = msgpack.dumps(subschema_to_field_ids)
+        schema_metadata[SUBSCHEMAS_KEY_NAME] = _encode_metadata_value(
+            subschema_to_field_ids
+        )
         final_schema = pyarrow_schema.with_metadata(schema_metadata)
         return Schema(
             {
@@ -546,6 +970,32 @@ class Schema(dict):
             schema_id=self.id + 1,
         )
 
+    def update(self, allow_incompatible_changes: bool = False) -> SchemaUpdate:
+        """
+        Create a SchemaUpdate instance for safely evolving this schema.
+
+        This method provides a convenient way to create a SchemaUpdate for this schema
+        without needing to call SchemaUpdate.of() directly.
+
+        Args:
+            allow_incompatible_changes: If True, allows changes that may break
+                backward compatibility. If False (default), raises SchemaCompatibilityError
+                for incompatible changes.
+
+        Returns:
+            A new SchemaUpdate instance configured for this schema
+
+        Example:
+            >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
+            >>> new_field = Field.of(pa.field("name", pa.string()))
+            >>> updated_schema = (schema.update()
+            ...     .add_field("name", new_field)
+            ...     .apply())
+        """
+        return SchemaUpdate.of(
+            self, allow_incompatible_changes=allow_incompatible_changes
+        )
+
     def field_id(self, name: Union[FieldName, NestedFieldName]) -> FieldId:
         return Schema._field_name_to_field_id(self.arrow, name)
 
@@ -563,125 +1013,570 @@ class Schema(dict):
563
1013
  )
564
1014
  return self.field_ids_to_fields[field_id]
565
1015
 
566
- @property
567
- def fields(self) -> List[Field]:
568
- field_ids_to_fields = self.field_ids_to_fields
569
- return list(field_ids_to_fields.values())
1016
+ def merge_order_sort_keys(self) -> Optional[List[SortKey]]:
1017
+ """Extract sort keys from fields with merge_order defined, or use event_time as fallback.
570
1018
 
571
- @property
572
- def merge_keys(self) -> Optional[List[FieldId]]:
573
- return self.get("mergeKeys")
1019
+ If explicit merge_order fields are defined, they take precedence.
1020
+ If no merge_order fields are defined but an event_time field exists, use event_time
1021
+ with DESCENDING merge_order (keep latest events by default).
574
1022
 
575
- @property
576
- def field_ids_to_fields(self) -> Dict[FieldId, Field]:
577
- return self.get("fieldIdsToFields")
1023
+ Note: The sort order is inverted because deduplication keeps the "last" record
1024
+ after sorting. To keep the record with the smallest merge_order value, we need
1025
+ to sort in DESCENDING order so that record appears last.
578
1026
 
579
- @property
580
- def arrow(self) -> pa.Schema:
581
- return self["arrow"]
1027
+ Returns:
1028
+ List of SortKey objects constructed from fields with merge_order or event_time,
1029
+ or None if neither are defined.
1030
+ """
1031
+ # First priority: explicit merge_order fields
1032
+ fields_with_merge_order = self._get_fields_with_merge_order()
1033
+ if fields_with_merge_order:
1034
+ return self._create_sort_keys_from_merge_order_fields(
1035
+ fields_with_merge_order
1036
+ )
582
1037
 
583
- @property
584
- def max_field_id(self) -> FieldId:
585
- return self["maxFieldId"]
1038
+ # Second priority: event_time field as default merge_order key
1039
+ event_time_fields = self._get_event_time_fields()
1040
+ if event_time_fields:
1041
+ return self._create_sort_keys_from_event_time_fields(event_time_fields)
586
1042
 
587
- @property
588
- def id(self) -> SchemaId:
589
- return Schema._schema_id(self.arrow)
1043
+ return None
590
1044
 
591
- @property
592
- def subschema(self, name: SchemaName) -> Optional[Schema]:
593
- subschemas = self.subschemas
594
- return subschemas.get(name) if subschemas else None
1045
+ def _validate_and_coerce_table(
1046
+ self,
1047
+ table: pa.Table,
1048
+ schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
1049
+ default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
1050
+ ) -> Tuple[pa.Table, Schema]:
1051
+ """Validate and coerce a PyArrow table to match this schema's field types and constraints.
595
1052
 
596
- @property
597
- def subschemas(self) -> Dict[SchemaName, Schema]:
598
- # return cached subschemas first if they exist
599
- subschemas = self.get("subschemas")
600
- if not subschemas:
601
- # retrieve any defined subschemas
602
- subschemas_to_field_ids = self.subschemas_to_field_ids
603
- # rebuild and return the subschema cache
604
- if subschemas_to_field_ids:
605
- subschemas = {
606
- schema_name: Schema.of(
607
- schema=pa.schema(
608
- [self.field(field_id).arrow for field_id in field_ids]
609
- ),
610
- schema_id=self.id,
611
- native_object=self.native_object,
612
- )
613
- for schema_name, field_ids in subschemas_to_field_ids.items()
614
- }
615
- self["subschemas"] = subschemas
616
- return subschemas or {}
1053
+ This method now uses SchemaUpdate for safe schema evolution, ensuring all field
1054
+ protection rules and validation are applied consistently.
617
1055
 
618
- @property
619
- def subschema_field_ids(self, name: SchemaName) -> Optional[List[FieldId]]:
620
- return self.subschemas_to_field_ids.get(name)
1056
+ Args:
1057
+ table: PyArrow Table to validate and coerce
1058
+ schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
1059
+ default_schema_consistency_type: Default consistency type for new fields in AUTO mode
621
1060
 
622
- @property
623
- def subschemas_to_field_ids(self) -> Dict[SchemaName, List[FieldId]]:
624
- return Schema._subschemas(self.arrow)
1061
+ Returns:
1062
+ Tuple[pa.Table, Schema]: Table with data validated/coerced according to schema consistency types,
1063
+ and the (potentially updated) schema
625
1064
 
626
- @property
627
- def native_object(self) -> Optional[Any]:
628
- return self.get("nativeObject")
1065
+ Raises:
1066
+ SchemaValidationError: If validation fails or coercion is not possible
1067
+ SchemaCompatibilityError: If schema evolution would break compatibility
1068
+ """
1069
+ if not self.field_ids_to_fields:
1070
+ # No fields defined in schema, return original table
1071
+ return table, self
629
1072
 
630
- @staticmethod
631
- def _schema_id(schema: pa.Schema) -> SchemaId:
632
- schema_id = None
633
- if schema.metadata:
634
- bytes_val = schema.metadata.get(SCHEMA_ID_KEY_NAME)
635
- schema_id = int(bytes_val.decode()) if bytes_val else None
636
- return schema_id
1073
+ # Setup
1074
+ field_name_to_field = self._create_field_name_mapping()
1075
+ field_updates = {} # field_name -> updated_field
1076
+ new_fields = {} # field_name -> new_field
1077
+ new_columns = []
1078
+ new_schema_fields = []
637
1079
 
638
- @staticmethod
639
- def _subschemas(
640
- schema: pa.Schema,
641
- ) -> Dict[SchemaName, List[FieldId]]:
642
- subschemas = None
643
- if schema.metadata:
644
- bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
645
- subschemas = msgpack.loads(bytes_val) if bytes_val else None
646
- return subschemas
1080
+ # Process each column in the table
1081
+ for column_name in table.column_names:
1082
+ column_data = table.column(column_name)
647
1083
 
648
- @staticmethod
649
- def _field_name_to_field_id(
650
- schema: pa.Schema,
651
- name: Union[FieldName, NestedFieldName],
652
- ) -> FieldId:
653
- if isinstance(name, str):
654
- return Field.of(schema.field(name)).id
655
- if isinstance(name, List):
656
- if not len(name):
657
- raise ValueError(f"Nested field name `{name}` is empty.")
658
- field = schema
659
- for part in name:
660
- field = field[part]
661
- return Field.of(field).id
662
- raise ValueError(f"Unknown field name type: {type(name)}")
1084
+ (
1085
+ processed_data,
1086
+ schema_field,
1087
+ field_update,
1088
+ new_field,
1089
+ ) = self._process_existing_table_column(
1090
+ column_name,
1091
+ column_data,
1092
+ field_name_to_field,
1093
+ schema_evolution_mode,
1094
+ default_schema_consistency_type,
1095
+ )
663
1096
 
664
- @staticmethod
665
- def _visit_fields(
666
- current: Union[pa.Schema, pa.Field],
667
- visit: Callable,
668
- path: NestedFieldName = [],
669
- *args,
670
- **kwargs,
671
- ) -> None:
672
- """
673
- Recursively visit all fields in a PyArrow schema, including nested
674
- fields.
1097
+ new_columns.append(processed_data)
1098
+ new_schema_fields.append(schema_field)
675
1099
 
676
- Args:
677
- current (pa.Schema or pa.Field): The schema or field to visit.
678
- visit (callable): A function that visits the current field.
679
- path (NestedFieldName): The current path to the field.
680
- *args: Additional args to pass to the visit function.
1100
+ if field_update:
1101
+ field_updates[column_name] = field_update
1102
+ if new_field:
1103
+ new_fields[column_name] = new_field
1104
+
1105
+ # Add any missing fields from schema
1106
+ table_column_names = set(table.column_names)
1107
+ self._add_missing_schema_fields(
1108
+ table, table_column_names, new_columns, new_schema_fields
1109
+ )
1110
+
1111
+ # Apply schema updates if any modifications were made
1112
+ updated_schema = self._apply_schema_updates(field_updates, new_fields)
1113
+
1114
+ return (
1115
+ pa.table(new_columns, schema=pa.schema(new_schema_fields)),
1116
+ updated_schema,
1117
+ )
1118
+
1119
+ def validate_and_coerce_dataset(
1120
+ self,
1121
+ dataset: Union[pa.Table, Any],
1122
+ schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
1123
+ default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
1124
+ ) -> Tuple[Union[pa.Table, Any], Schema]:
1125
+ """Validate and coerce a dataset to match this schema's field types and constraints.
1126
+
1127
+ Args:
1128
+ dataset: Dataset to validate and coerce (PyArrow Table, Daft DataFrame, etc.)
1129
+ schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
1130
+ default_schema_consistency_type: Default consistency type for new fields in AUTO mode
1131
+
1132
+ Returns:
1133
+ Tuple[Dataset, Schema]: Dataset with data validated/coerced according to schema consistency types,
1134
+ and the (potentially updated) schema
1135
+
1136
+ Raises:
1137
+ SchemaValidationError: If validation fails or coercion is not possible
1138
+ SchemaCompatibilityError: If schema evolution would break compatibility
1139
+ """
1140
+ # Handle PyArrow tables using existing method
1141
+ if get_dataset_type(dataset) == DatasetType.PYARROW:
1142
+ return self._validate_and_coerce_table(
1143
+ dataset,
1144
+ schema_evolution_mode,
1145
+ default_schema_consistency_type,
1146
+ )
1147
+
1148
+ # Handle Daft DataFrames without collecting to memory
1149
+ if get_dataset_type(dataset) == DatasetType.DAFT:
1150
+ return self._validate_and_coerce_daft_dataframe(
1151
+ dataset,
1152
+ schema_evolution_mode,
1153
+ default_schema_consistency_type,
1154
+ )
1155
+
1156
+ # Handle Ray Datasets by converting to Daft
1157
+ if get_dataset_type(dataset) == DatasetType.RAY_DATASET:
1158
+ daft_dataframe = dataset.to_daft()
1159
+ return self._validate_and_coerce_daft_dataframe(
1160
+ daft_dataframe,
1161
+ schema_evolution_mode,
1162
+ default_schema_consistency_type,
1163
+ )
1164
+
1165
+ # For other types, convert to PyArrow and back
1166
+ # Don't pass schema during conversion as it may contain columns not yet in the dataset
1167
+ pa_table = to_pyarrow(dataset)
1168
+ coerced_table, updated_schema = self._validate_and_coerce_table(
1169
+ pa_table,
1170
+ schema_evolution_mode,
1171
+ default_schema_consistency_type,
1172
+ )
1173
+ return from_pyarrow(coerced_table, get_dataset_type(dataset)), updated_schema
1174
+
1175
+ def coerce(
1176
+ self,
1177
+ dataset: Union[pa.Table, pd.DataFrame, np.ndarray, Any],
1178
+ manifest_entry_schema: Optional[Schema] = None,
1179
+ ) -> Union[pa.Table, pd.DataFrame, np.ndarray, Any]:
1180
+ """Coerce a dataset to match this schema using field type promotion.
1181
+
1182
+ This method processes different dataset types and applies type promotion
1183
+ using the field's promote_type_if_needed method. It handles:
1184
+ - PyArrow Tables
1185
+ - Pandas DataFrames
1186
+ - NumPy arrays (1D and 2D)
1187
+ - Polars DataFrames (if available)
1188
+ - Daft DataFrames (if available)
1189
+ - Other types with to_arrow() method
1190
+
1191
+ For each column, it:
1192
+ - Fields that exist in both dataset and schema: applies type promotion
1193
+ - Fields in dataset but not in schema: preserves as-is
1194
+ - Fields in schema but not in dataset: adds with null or past default values
1195
+ - Reorders columns to match schema order
1196
+
1197
+ Args:
1198
+ dataset: Dataset to coerce to this schema
1199
+ manifest_entry_schema: Original manifest entry schema used to write the dataset.
1200
+
1201
+ Returns:
1202
+ Dataset of the same type, coerced to match this schema.
1203
+
1204
+ Raises:
1205
+ SchemaValidationError: If coercion fails
1206
+ """
1207
+ if not self.field_ids_to_fields:
1208
+ # No fields defined in schema, return original dataset
1209
+ return dataset
1210
+
1211
+ # Convert dataset to PyArrow table for processing
1212
+ pa_table = to_pyarrow(
1213
+ dataset,
1214
+ schema=manifest_entry_schema.arrow if manifest_entry_schema else None,
1215
+ )
1216
+
1217
+ # Process columns using field coercion
1218
+ coerced_columns, coerced_fields = self._coerce_table_columns(pa_table)
1219
+
1220
+ # Reorder columns to match schema order
1221
+ reordered_columns, reordered_fields = self._reorder_columns_to_schema(
1222
+ coerced_columns, coerced_fields, pa_table
1223
+ )
1224
+
1225
+ # Create new table with processed columns
1226
+ coerced_table = pa.table(reordered_columns, schema=pa.schema(reordered_fields))
1227
+
1228
+ # Convert back to original dataset type
1229
+ return from_pyarrow(coerced_table, get_dataset_type(dataset))
1230
+
1231
+ def _validate_and_coerce_daft_dataframe(
1232
+ self,
1233
+ dataframe: Any, # DaftDataFrame type
1234
+ schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
1235
+ default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
1236
+ ) -> Tuple[Any, Schema]:
1237
+ """Validate and coerce a Daft DataFrame without collecting to memory.
1238
+
1239
+ This method processes Daft DataFrames column by column using Daft expressions
1240
+ for validation and coercion, avoiding memory collection.
1241
+
1242
+ Args:
1243
+ dataframe: Daft DataFrame to validate and coerce
1244
+ schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
1245
+ default_schema_consistency_type: Default consistency type for new fields in AUTO mode
1246
+
1247
+ Returns:
1248
+ Tuple[DaftDataFrame, Schema]: Processed DataFrame and updated schema
1249
+
1250
+ Raises:
1251
+ SchemaValidationError: If validation fails or coercion is not possible
1252
+ SchemaCompatibilityError: If schema evolution would break compatibility
1253
+ """
1254
+ if not self.field_ids_to_fields:
1255
+ # No fields defined in schema, return original dataframe
1256
+ return dataframe, self
1257
+
1258
+ # Setup
1259
+ field_name_to_field = self._create_field_name_mapping()
1260
+ field_updates = {} # field_name -> updated_field
1261
+ new_fields = {} # field_name -> new_field
1262
+ processed_dataframe = dataframe
1263
+
1264
+ # Process each column in the dataframe
1265
+ for column_name in dataframe.column_names:
1266
+ column_type = dataframe.schema()[column_name].dtype.to_arrow_dtype()
1267
+
1268
+ (
1269
+ processed_dataframe,
1270
+ schema_field,
1271
+ field_update,
1272
+ new_field,
1273
+ ) = self._process_existing_daft_column(
1274
+ processed_dataframe,
1275
+ column_name,
1276
+ column_type,
1277
+ field_name_to_field,
1278
+ schema_evolution_mode,
1279
+ default_schema_consistency_type,
1280
+ )
1281
+
1282
+ if field_update:
1283
+ field_updates[column_name] = field_update
1284
+ if new_field:
1285
+ new_fields[column_name] = new_field
1286
+
1287
+ # Add any missing fields from schema
1288
+ dataframe_column_names = set(dataframe.column_names)
1289
+ processed_dataframe = self._add_missing_schema_fields_daft(
1290
+ processed_dataframe, dataframe_column_names
1291
+ )
1292
+
1293
+ # Apply schema updates if any modifications were made
1294
+ updated_schema = self._apply_schema_updates(field_updates, new_fields)
1295
+
1296
+ return processed_dataframe, updated_schema
1297
+
1298
+ def _process_existing_daft_column(
1299
+ self,
1300
+ dataframe: Any, # DaftDataFrame type
1301
+ column_name: str,
1302
+ column_type: pa.DataType,
1303
+ field_name_to_field: Dict[str, Field],
1304
+ schema_evolution_mode: Optional[SchemaEvolutionMode],
1305
+ default_schema_consistency_type: Optional[SchemaConsistencyType],
1306
+ ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
1307
+ """Process a Daft DataFrame column that exists in the dataset.
1308
+
1309
+ Args:
1310
+ dataframe: Daft DataFrame to process
1311
+ column_name: Name of the column to process
1312
+ column_type: PyArrow DataType of the column
1313
+ field_name_to_field: Mapping from field names to Field objects
1314
+ schema_evolution_mode: How to handle fields not in schema
1315
+ default_schema_consistency_type: Default consistency type for new fields
1316
+
1317
+ Returns:
1318
+ Tuple of (processed_dataframe, schema_field, field_update, new_field)
1319
+ """
1320
+ if column_name in field_name_to_field:
1321
+ # Field exists in schema - validate/coerce according to consistency type
1322
+ field = field_name_to_field[column_name]
1323
+
1324
+ if field.consistency_type == SchemaConsistencyType.VALIDATE:
1325
+ field.validate(column_type)
1326
+ return dataframe, field.arrow, None, None
1327
+ elif field.consistency_type == SchemaConsistencyType.COERCE:
1328
+ coerced_dataframe = field.coerce_daft(dataframe, column_name)
1329
+ return coerced_dataframe, field.arrow, None, None
1330
+ else:
1331
+ # NONE or no consistency type - use type promotion
1332
+ return self._handle_daft_type_promotion(
1333
+ dataframe, column_name, column_type, field
1334
+ )
1335
+ else:
1336
+ # Field not in schema - handle based on evolution mode
1337
+ return self._handle_new_daft_field(
1338
+ dataframe,
1339
+ column_name,
1340
+ column_type,
1341
+ schema_evolution_mode,
1342
+ default_schema_consistency_type,
1343
+ )
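# Illustrative sketch (editor's addition): the three per-field behaviors the
# branch above dispatches on, expressed with plain PyArrow. The `handle`
# helper and its string modes are hypothetical stand-ins for
# SchemaConsistencyType handling.
import pyarrow as pa

declared = pa.int64()
incoming = pa.array([1, 2, 3], type=pa.int32())

def handle(mode: str) -> pa.Array:
    if mode == "VALIDATE":
        # Reject anything that is not exactly the declared type.
        if not incoming.type.equals(declared):
            raise TypeError(f"expected {declared}, got {incoming.type}")
        return incoming
    if mode == "COERCE":
        # Cast unconditionally to the declared type.
        return incoming.cast(declared)
    # NONE: keep the data and, if needed, widen the declared type instead
    # (type promotion), which is what promote_type_if_needed handles above.
    return incoming

print(handle("COERCE").type)  # int64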
1344
+
1345
+ def _handle_daft_type_promotion(
1346
+ self,
1347
+ dataframe: Any, # DaftDataFrame type
1348
+ column_name: str,
1349
+ column_type: pa.DataType,
1350
+ field: Field,
1351
+ ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
1352
+ """Handle type promotion for a Daft column with NONE consistency type."""
1353
+ # Create a dummy array to check type promotion
1354
+ dummy_array = pa.array([None], type=column_type)
1355
+ promoted_data, type_was_promoted = field.promote_type_if_needed(dummy_array)
1356
+
1357
+ if type_was_promoted:
1358
+ # Cast the Daft column to the promoted type
1359
+ promoted_dataframe = field.coerce_daft(
1360
+ dataframe, column_name, promoted_data.type
1361
+ )
1362
+
1363
+ # Cast default values to match the promoted type
1364
+ promoted_past_default = (
1365
+ field._cast_default_to_promoted_type(
1366
+ field.past_default, promoted_data.type
1367
+ )
1368
+ if field.past_default is not None
1369
+ else None
1370
+ )
1371
+ promoted_future_default = (
1372
+ field._cast_default_to_promoted_type(
1373
+ field.future_default, promoted_data.type
1374
+ )
1375
+ if field.future_default is not None
1376
+ else None
1377
+ )
1378
+
1379
+ # Create updated field with promoted type
1380
+ promoted_field = pa.field(
1381
+ field.arrow.name,
1382
+ promoted_data.type,
1383
+ field.arrow.nullable,
1384
+ field.arrow.metadata,
1385
+ )
1386
+
1387
+ updated_field = Field.of(
1388
+ promoted_field,
1389
+ field_id=field.id,
1390
+ past_default=promoted_past_default,
1391
+ future_default=promoted_future_default,
1392
+ consistency_type=field.consistency_type,
1393
+ path=field.path,
1394
+ native_object=field.native_object,
1395
+ )
1396
+
1397
+ return promoted_dataframe, promoted_field, updated_field, None
1398
+ else:
1399
+ return dataframe, field.arrow, None, None
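# Illustrative sketch (editor's addition): the one-element null "dummy array"
# probe used above checks whether a column's type would be promoted without
# touching any real data. The int32 -> int64 widening rule below is a
# hypothetical stand-in for Field.promote_type_if_needed.
import pyarrow as pa

def probe_promotion(column_type: pa.DataType, declared: pa.DataType):
    dummy = pa.array([None], type=column_type)  # cheap, data-free probe
    if pa.types.is_int32(column_type) and pa.types.is_int64(declared):
        return dummy.cast(declared), True  # promoted
    return dummy, False  # unchanged

_, promoted = probe_promotion(pa.int32(), pa.int64())
assert promoted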
1400
+
1401
+ def _handle_new_daft_field(
1402
+ self,
1403
+ dataframe: Any, # DaftDataFrame type
1404
+ column_name: str,
1405
+ column_type: pa.DataType,
1406
+ schema_evolution_mode: Optional[SchemaEvolutionMode],
1407
+ default_schema_consistency_type: Optional[SchemaConsistencyType],
1408
+ ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
1409
+ """Handle a field that's not in the schema for Daft DataFrames."""
1410
+ if schema_evolution_mode == SchemaEvolutionMode.AUTO:
1411
+ # Create new field with default consistency type
1412
+ next_field_id = self.max_field_id + 1
1413
+ new_field = Field.of(
1414
+ field=pa.field(column_name, column_type),
1415
+ field_id=next_field_id,
1416
+ consistency_type=default_schema_consistency_type
1417
+ or SchemaConsistencyType.NONE,
1418
+ )
1419
+ return dataframe, new_field.arrow, None, new_field
1420
+ else:
1421
+ # MANUAL mode or not specified - raise error
1422
+ raise SchemaValidationError(
1423
+ f"Field '{column_name}' is not present in the schema and schema evolution mode is '{schema_evolution_mode}'"
1424
+ )
1425
+
1426
+ def _add_missing_schema_fields_daft(
1427
+ self,
1428
+ dataframe: Any, # DaftDataFrame type
1429
+ dataframe_column_names: set,
1430
+ ) -> Any:
1431
+ """Add columns for fields that exist in schema but not in Daft DataFrame."""
1432
+ processed_dataframe = dataframe
1433
+
1434
+ for field in self.field_ids_to_fields.values():
1435
+ if field.arrow.name not in dataframe_column_names:
1436
+ # Add column with null values or default value to Daft DataFrame
1437
+ if field.future_default is not None:
1438
+ # Convert default value to Daft literal
1439
+ processed_dataframe = processed_dataframe.with_column(
1440
+ field.arrow.name,
1441
+ daft.lit(field.future_default).cast(
1442
+ daft.DataType.from_arrow_type(field.arrow.type)
1443
+ ),
1444
+ )
1445
+ elif field.arrow.nullable:
1446
+ # Add null column
1447
+ processed_dataframe = processed_dataframe.with_column(
1448
+ field.arrow.name,
1449
+ daft.lit(None).cast(
1450
+ daft.DataType.from_arrow_type(field.arrow.type)
1451
+ ),
1452
+ )
1453
+ else:
1454
+ raise SchemaValidationError(
1455
+ f"Field '{field.arrow.name}' is required but not present and no future_default is set"
1456
+ )
1457
+
1458
+ return processed_dataframe
1459
+
1460
+ @property
1461
+ def fields(self) -> List[Field]:
1462
+ field_ids_to_fields = self.field_ids_to_fields
1463
+ return list(field_ids_to_fields.values())
1464
+
1465
+ @property
1466
+ def merge_keys(self) -> Optional[List[FieldId]]:
1467
+ return self.get("mergeKeys")
1468
+
1469
+ @property
1470
+ def field_ids_to_fields(self) -> Dict[FieldId, Field]:
1471
+ return self.get("fieldIdsToFields")
1472
+
1473
+ @property
1474
+ def arrow(self) -> pa.Schema:
1475
+ return self["arrow"]
1476
+
1477
+ @property
1478
+ def max_field_id(self) -> FieldId:
1479
+ return self["maxFieldId"]
1480
+
1481
+ @property
1482
+ def id(self) -> SchemaId:
1483
+ return Schema._schema_id(self.arrow)
1484
+
1486
+ def subschema(self, name: SchemaName) -> Optional[Schema]:
1487
+ subschemas = self.subschemas
1488
+ return subschemas.get(name) if subschemas else None
1489
+
1490
+ @property
1491
+ def subschemas(self) -> Dict[SchemaName, Schema]:
1492
+ # return cached subschemas first if they exist
1493
+ subschemas = self.get("subschemas")
1494
+ if not subschemas:
1495
+ # retrieve any defined subschemas
1496
+ subschemas_to_field_ids = self.subschemas_to_field_ids
1497
+ # rebuild and return the subschema cache
1498
+ if subschemas_to_field_ids:
1499
+ subschemas = {
1500
+ schema_name: Schema.of(
1501
+ schema=pa.schema(
1502
+ [self.field(field_id).arrow for field_id in field_ids]
1503
+ ),
1504
+ schema_id=self.id,
1505
+ native_object=self.native_object,
1506
+ )
1507
+ for schema_name, field_ids in subschemas_to_field_ids.items()
1508
+ }
1509
+ self["subschemas"] = subschemas
1510
+ return subschemas or {}
1511
+
1513
+ def subschema_field_ids(self, name: SchemaName) -> Optional[List[FieldId]]:
1514
+ return self.subschemas_to_field_ids.get(name)
1515
+
1516
+ @property
1517
+ def subschemas_to_field_ids(self) -> Dict[SchemaName, List[FieldId]]:
1518
+ return Schema._subschemas(self.arrow)
1519
+
1520
+ @property
1521
+ def native_object(self) -> Optional[Any]:
1522
+ return self.get("nativeObject")
1523
+
1524
+ @staticmethod
1525
+ def _schema_id(schema: pa.Schema) -> SchemaId:
1526
+ schema_id = None
1527
+ if schema.metadata:
1528
+ bytes_val = schema.metadata.get(SCHEMA_ID_KEY_NAME)
1529
+ schema_id = int(bytes_val.decode()) if bytes_val else None
1530
+ return schema_id
1531
+
1532
+ @staticmethod
1533
+ def _subschemas(
1534
+ schema: pa.Schema,
1535
+ ) -> Dict[SchemaName, List[FieldId]]:
1536
+ subschemas = None
1537
+ if schema.metadata:
1538
+ bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
1539
+ subschemas = _decode_metadata_value(bytes_val) if bytes_val else None
1540
+ return subschemas
1541
+
1542
+ @staticmethod
1543
+ def _field_name_to_field_id(
1544
+ schema: pa.Schema,
1545
+ name: Union[FieldName, NestedFieldName],
1546
+ ) -> FieldId:
1547
+ if isinstance(name, str):
1548
+ return Field.of(schema.field(name)).id
1549
+ if isinstance(name, List):
1550
+ if not len(name):
1551
+ raise ValueError(f"Nested field name `{name}` is empty.")
1552
+ field = schema
1553
+ for part in name:
1554
+ field = field[part]
1555
+ return Field.of(field).id
1556
+ raise ValueError(f"Unknown field name type: {type(name)}")
1557
+
1558
+ @staticmethod
1559
+ def _visit_fields(
1560
+ current: Union[pa.Schema, pa.Field],
1561
+ visit: Callable,
1562
+ path: Optional[NestedFieldName] = None,
1563
+ *args,
1564
+ **kwargs,
1565
+ ) -> None:
1566
+ """
1567
+ Recursively visit all fields in a PyArrow schema, including nested
1568
+ fields.
1569
+
1570
+ Args:
1571
+ current: The schema or field to visit.
1572
+ visit: A function that visits the current field.
1573
+ path: The current path to the field.
1574
+ *args: Additional args to pass to the visit function.
681
1575
  **kwargs: Additional keyword args to pass to the visit function.
682
1576
  Returns:
683
1577
  None
684
1578
  """
1579
+ path = [] if path is None else path
685
1580
  if isinstance(current, pa.Schema):
686
1581
  for field in current:
687
1582
  Schema._visit_fields(
@@ -750,14 +1645,13 @@ class Schema(dict):
750
1645
  visitor_dict: Dict[str, Any],
751
1646
  ) -> None:
752
1647
  field_ids_to_fields = visitor_dict["fieldIdsToFields"]
753
- max_field_id = (
754
- visitor_dict["maxFieldId"] + len(field_ids_to_fields)
755
- ) % MAX_FIELD_ID_EXCLUSIVE
756
1648
  dc_field = Field.of(field)
757
1649
  if dc_field is not None and dc_field.id is not None:
758
1650
  field_id = dc_field.id
759
1651
  else:
760
- field_id = max_field_id
1652
+ field_id = (
1653
+ visitor_dict["maxFieldId"] + len(field_ids_to_fields)
1654
+ ) % MAX_FIELD_ID_EXCLUSIVE
761
1655
 
762
1656
  if (dupe := field_ids_to_fields.get(field_id)) is not None:
763
1657
  raise ValueError(
@@ -846,47 +1740,1421 @@ class Schema(dict):
846
1740
  return pa.unify_schemas(all_schemas), subschema_to_field_names
847
1741
  return Schema._to_pyarrow_schema(schema), {} # SingleSchema
848
1742
 
849
- @staticmethod
850
- def _del_subschema(
851
- name: SchemaName,
852
- subschemas: Dict[SchemaName, Schema],
853
- ) -> Dict[SchemaName, Schema]:
854
- deleted_subschema = subschemas.pop(name, None)
855
- if deleted_subschema is None:
856
- raise ValueError(f"Subschema `{name}` does not exist.")
857
- return subschemas
1743
+ def _get_fields_with_merge_order(self) -> List[Field]:
1744
+ """Get all fields that have merge_order defined.
858
1745
 
859
- @staticmethod
860
- def _add_subschema(
861
- name: SchemaName,
862
- schema: SingleSchema,
863
- subschemas: Dict[SchemaName, Schema],
864
- ) -> Dict[SchemaName, Schema]:
865
- Schema._validate_schema_name(name)
866
- if name == BASE_SCHEMA_NAME:
867
- raise ValueError(
868
- f"Cannot add subschema with reserved name: {BASE_SCHEMA_NAME}"
1746
+ Returns:
1747
+ List of fields with merge_order defined, or empty list if none
1748
+ """
1749
+ return [field for field in self.fields if field.merge_order is not None]
1750
+
1751
+ def _create_sort_keys_from_merge_order_fields(
1752
+ self, fields_with_merge_order: List[Field]
1753
+ ) -> List[SortKey]:
1754
+ """Create sort keys from fields with explicit merge_order.
1755
+
1756
+ Args:
1757
+ fields_with_merge_order: List of fields with merge_order defined
1758
+
1759
+ Returns:
1760
+ List of SortKey objects with inverted sort order for deduplication
1761
+ """
1762
+ from deltacat.storage.model.sort_key import SortKey
1763
+
1764
+ sort_keys = []
1765
+ for field in fields_with_merge_order:
1766
+ merge_order = field.merge_order
1767
+ desired_sort_order = merge_order[0]
1768
+
1769
+ # Invert the sort order because deduplication keeps the "last" record
1770
+ # ASCENDING merge_order (keep smallest) → DESCENDING sort (smallest appears last)
1771
+ # DESCENDING merge_order (keep largest) → ASCENDING sort (largest appears last)
1772
+ if desired_sort_order == SortOrder.ASCENDING:
1773
+ actual_sort_order = SortOrder.DESCENDING
1774
+ else:
1775
+ actual_sort_order = SortOrder.ASCENDING
1776
+
1777
+ sort_key = SortKey.of(
1778
+ key=[field.arrow.name],
1779
+ sort_order=actual_sort_order,
1780
+ null_order=merge_order[1], # NullOrder (AT_START/AT_END)
869
1781
  )
870
- if name in subschemas:
871
- raise ValueError(f"Subschema `{name}` already exists.")
872
- for key, val in subschemas.items():
873
- subschemas[key] = val.arrow
874
- subschemas[name] = schema
875
- return subschemas
1782
+ sort_keys.append(sort_key)
1783
+ return sort_keys
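# Illustrative sketch (editor's addition): why the sort order is inverted
# above. If deduplication keeps the *last* record per key, honoring an
# ASCENDING merge order ("keep the smallest value") requires a DESCENDING
# sort so that the smallest row lands last. Data below is made up.
import pandas as pd

df = pd.DataFrame({"pk": [1, 1, 2], "version": [3, 1, 5]})
kept = (
    df.sort_values("version", ascending=False)  # inverted sort
    .drop_duplicates(subset="pk", keep="last")  # dedupe keeps the last row
)
assert sorted(kept["version"].tolist()) == [1, 5]  # smallest version kept for pk=1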
876
1784
 
1785
+ def _get_event_time_fields(self) -> List[Field]:
1786
+ """Get all fields marked as event_time.
877
1787
 
878
- class SchemaList(List[Schema]):
879
- @staticmethod
880
- def of(items: List[Schema]) -> SchemaList:
881
- typed_items = SchemaList()
882
- for item in items:
883
- if item is not None and not isinstance(item, Schema):
884
- item = Schema(item)
885
- typed_items.append(item)
886
- return typed_items
1788
+ Returns:
1789
+ List of event_time fields, or empty list if none
1790
+ """
1791
+ return [field for field in self.fields if field.is_event_time]
887
1792
 
888
- def __getitem__(self, item):
889
- val = super().__getitem__(item)
890
- if val is not None and not isinstance(val, Schema):
891
- self[item] = val = Schema(val)
892
- return val
1793
+ def _create_sort_keys_from_event_time_fields(
1794
+ self, event_time_fields: List[Field]
1795
+ ) -> List:
1796
+ """Create sort keys from event_time fields with default DESCENDING merge_order.
1797
+
1798
+ Args:
1799
+ event_time_fields: List of event_time fields
1800
+
1801
+ Returns:
1802
+ List of SortKey objects with ASCENDING sort order (inverted from DESCENDING merge_order)
1803
+ """
1804
+ from deltacat.storage.model.sort_key import SortKey
1805
+
1806
+ sort_keys = []
1807
+ for field in event_time_fields:
1808
+ sort_key = SortKey.of(
1809
+ key=[field.arrow.name],
1810
+ sort_order=SortOrder.ASCENDING, # Inverted: DESCENDING merge_order → ASCENDING sort
1811
+ null_order=NullOrder.AT_END,
1812
+ )
1813
+ sort_keys.append(sort_key)
1814
+ return sort_keys
1815
+
1816
+ def _create_field_name_mapping(self) -> Dict[str, Field]:
1817
+ """Create a mapping from field names to Field objects."""
1818
+ field_name_to_field = {}
1819
+ for field in self.field_ids_to_fields.values():
1820
+ field_name_to_field[field.arrow.name] = field
1821
+ return field_name_to_field
1822
+
1823
+ def _process_existing_table_column(
1824
+ self,
1825
+ column_name: str,
1826
+ column_data: pa.Array,
1827
+ field_name_to_field: Dict[str, Field],
1828
+ schema_evolution_mode: Optional[SchemaEvolutionMode],
1829
+ default_schema_consistency_type: Optional[SchemaConsistencyType],
1830
+ ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
1831
+ """Process a column that exists in the table.
1832
+
1833
+ Returns:
1834
+ Tuple of (processed_column_data, schema_field, field_update, new_field)
1835
+ """
1836
+ if column_name in field_name_to_field:
1837
+ # Field exists in schema - validate/coerce according to consistency type
1838
+ field = field_name_to_field[column_name]
1839
+
1840
+ if field.consistency_type == SchemaConsistencyType.VALIDATE:
1841
+ field.validate(column_data.type)
1842
+ return column_data, field.arrow, None, None
1843
+ elif field.consistency_type == SchemaConsistencyType.COERCE:
1844
+ coerced_data = field.coerce(column_data)
1845
+ return coerced_data, field.arrow, None, None
1846
+ else:
1847
+ # NONE or no consistency type - use type promotion
1848
+ return self._handle_type_promotion(column_name, column_data, field)
1849
+ else:
1850
+ # Field not in schema - handle based on evolution mode
1851
+ return self._handle_new_field(
1852
+ column_name,
1853
+ column_data,
1854
+ schema_evolution_mode,
1855
+ default_schema_consistency_type,
1856
+ )
1857
+
1858
+ def _handle_type_promotion(
1859
+ self, column_name: str, column_data: pa.Array, field: Field
1860
+ ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
1861
+ """Handle type promotion for a field with NONE consistency type."""
1862
+ promoted_data, type_was_promoted = field.promote_type_if_needed(column_data)
1863
+
1864
+ if type_was_promoted:
1865
+ # Cast default values to match the promoted type
1866
+ promoted_past_default = (
1867
+ field._cast_default_to_promoted_type(
1868
+ field.past_default, promoted_data.type
1869
+ )
1870
+ if field.past_default is not None
1871
+ else None
1872
+ )
1873
+
1874
+ promoted_future_default = (
1875
+ field._cast_default_to_promoted_type(
1876
+ field.future_default, promoted_data.type
1877
+ )
1878
+ if field.future_default is not None
1879
+ else None
1880
+ )
1881
+
1882
+ # Create updated field with same properties but new type and cast defaults
1883
+ promoted_field = pa.field(
1884
+ field.arrow.name,
1885
+ promoted_data.type,
1886
+ nullable=field.arrow.nullable,
1887
+ metadata=field.arrow.metadata,
1888
+ )
1889
+
1890
+ updated_field = Field.of(
1891
+ promoted_field,
1892
+ field_id=field.id,
1893
+ is_merge_key=field.is_merge_key,
1894
+ merge_order=field.merge_order,
1895
+ is_event_time=field.is_event_time,
1896
+ doc=field.doc,
1897
+ past_default=promoted_past_default,
1898
+ future_default=promoted_future_default,
1899
+ consistency_type=field.consistency_type,
1900
+ path=field.path,
1901
+ native_object=field.native_object,
1902
+ )
1903
+
1904
+ return promoted_data, promoted_field, updated_field, None
1905
+ else:
1906
+ return promoted_data, field.arrow, None, None
1907
+
1908
+ def _handle_new_field(
1909
+ self,
1910
+ column_name: str,
1911
+ column_data: pa.Array,
1912
+ schema_evolution_mode: Optional[SchemaEvolutionMode],
1913
+ default_schema_consistency_type: Optional[SchemaConsistencyType],
1914
+ ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
1915
+ """Handle a field that's not in the schema."""
1916
+ if schema_evolution_mode == SchemaEvolutionMode.AUTO:
1917
+ # Create new field with default consistency type
1918
+ next_field_id = self.max_field_id + 1
1919
+ new_field = Field.of(
1920
+ pa.field(column_name, column_data.type, nullable=True),
1921
+ field_id=next_field_id,
1922
+ consistency_type=default_schema_consistency_type
1923
+ or SchemaConsistencyType.NONE,
1924
+ )
1925
+ return column_data, new_field.arrow, None, new_field
1926
+ else:
1927
+ # MANUAL mode or disabled - raise error
1928
+ raise SchemaValidationError(
1929
+ f"Field '{column_name}' is not present in the schema and schema evolution mode is '{schema_evolution_mode}'"
1930
+ )
1931
+
1932
+ def _add_missing_schema_fields(
1933
+ self,
1934
+ table: pa.Table,
1935
+ table_column_names: set,
1936
+ new_columns: List[pa.Array],
1937
+ new_schema_fields: List[pa.Field],
1938
+ ) -> None:
1939
+ """Add columns for fields that exist in schema but not in table."""
1940
+ for field in self.field_ids_to_fields.values():
1941
+ if field.arrow.name not in table_column_names:
1942
+ # Use future_default if available, otherwise check if nullable
1943
+ if field.future_default is not None:
1944
+ # Create column with future_default value
1945
+ default_array = pa.array(
1946
+ [field.future_default] * get_table_length(table),
1947
+ type=field.arrow.type,
1948
+ )
1949
+ new_columns.append(default_array)
1950
+ elif field.arrow.nullable:
1951
+ # Backfill with nulls if field is nullable
1952
+ null_column = pa.nulls(
1953
+ get_table_length(table), type=field.arrow.type
1954
+ )
1955
+ new_columns.append(null_column)
1956
+ else:
1957
+ # Field is not nullable and no future_default - error
1958
+ raise SchemaValidationError(
1959
+ f"Field '{field.arrow.name}' is required but not present and no future_default is set"
1960
+ )
1961
+ new_schema_fields.append(field.arrow)
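# Illustrative sketch (editor's addition): the backfill rule above, as a
# standalone helper. A schema field missing from the table is filled with its
# future_default when present, with nulls when it is nullable, and rejected
# otherwise. Field names and values are hypothetical.
import pyarrow as pa

table = pa.table({"id": [1, 2, 3]})

def backfill(table: pa.Table, field: pa.Field, future_default=None) -> pa.Array:
    if future_default is not None:
        return pa.array([future_default] * table.num_rows, type=field.type)
    if field.nullable:
        return pa.nulls(table.num_rows, type=field.type)
    raise ValueError(f"required field {field.name!r} has no default")

print(backfill(table, pa.field("region", pa.string()), future_default="us-east-1"))
print(backfill(table, pa.field("note", pa.string())))  # nullable -> all nulls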
1962
+
1963
+ def _apply_schema_updates(
1964
+ self, field_updates: Dict[str, Field], new_fields: Dict[str, Field]
1965
+ ) -> Schema:
1966
+ """Apply collected schema updates and return the updated schema."""
1967
+ if not field_updates and not new_fields:
1968
+ return self
1969
+
1970
+ # Initialize schema update with allow_incompatible_changes=True for type promotion
1971
+ schema_update = self.update(allow_incompatible_changes=True)
1972
+
1973
+ # Apply field updates
1974
+ for field_name, updated_field in field_updates.items():
1975
+ schema_update = schema_update._update_field(field_name, updated_field)
1976
+
1977
+ # Apply new fields
1978
+ for field_name, new_field in new_fields.items():
1979
+ schema_update = schema_update.add_field(new_field)
1980
+
1981
+ # Apply all updates
1982
+ return schema_update.apply()
1983
+
1984
+ def _process_existing_columns_for_coercion(
1985
+ self, pa_table: pa.Table, field_name_to_field: Dict[str, Field]
1986
+ ) -> Tuple[List[pa.Array], List[pa.Field]]:
1987
+ """Process columns that exist in the table for coercion.
1988
+
1989
+ Args:
1990
+ pa_table: PyArrow table to process
1991
+ field_name_to_field: Mapping from field names to Field objects
1992
+
1993
+ Returns:
1994
+ Tuple of (processed columns, corresponding fields)
1995
+ """
1996
+ new_columns = []
1997
+ new_schema_fields = []
1998
+
1999
+ for column_name in pa_table.column_names:
2000
+ column_data = pa_table.column(column_name)
2001
+
2002
+ if column_name in field_name_to_field:
2003
+ # Field exists in target schema - use promote_type_if_needed for coercion
2004
+ field = field_name_to_field[column_name]
2005
+ promoted_data, _ = field.promote_type_if_needed(column_data)
2006
+ new_columns.append(promoted_data)
2007
+ new_schema_fields.append(field.arrow)
2008
+ else:
2009
+ # Field not in target schema - preserve as-is
2010
+ new_columns.append(column_data)
2011
+ new_schema_fields.append(pa.field(column_name, column_data.type))
2012
+
2013
+ return new_columns, new_schema_fields
2014
+
2015
+ def _add_missing_fields_for_coercion(
2016
+ self,
2017
+ pa_table: pa.Table,
2018
+ field_name_to_field: Dict[str, Field],
2019
+ existing_columns: List[pa.Array],
2020
+ existing_fields: List[pa.Field],
2021
+ ) -> Tuple[List[pa.Array], List[pa.Field]]:
2022
+ """Add columns for fields that exist in schema but not in table.
2023
+
2024
+ Args:
2025
+ pa_table: Original PyArrow table
2026
+ field_name_to_field: Mapping from field names to Field objects
2027
+ existing_columns: Columns already processed
2028
+ existing_fields: Fields already processed
2029
+
2030
+ Returns:
2031
+ Tuple of (all columns including added ones, all corresponding fields)
2032
+ """
2033
+ all_columns = existing_columns.copy()
2034
+ all_fields = existing_fields.copy()
2035
+
2036
+ # Add any missing fields from target schema with null values or past_default values
2037
+ target_field_names = {
2038
+ field.arrow.name for field in self.field_ids_to_fields.values()
2039
+ }
2040
+ table_field_names = set(pa_table.column_names)
2041
+
2042
+ for field_name in target_field_names - table_field_names:
2043
+ field = field_name_to_field[field_name]
2044
+
2045
+ # Check if field has past_default value and use it instead of nulls
2046
+ if field.past_default is not None:
2047
+ # Create array filled with past_default value
2048
+ default_column = pa.array(
2049
+ [field.past_default] * get_table_length(pa_table),
2050
+ type=field.arrow.type,
2051
+ )
2052
+ all_columns.append(default_column)
2053
+ else:
2054
+ # Use null values as before
2055
+ null_column = pa.nulls(
2056
+ get_table_length(pa_table), type=field.arrow.type
2057
+ )
2058
+ all_columns.append(null_column)
2059
+
2060
+ all_fields.append(field.arrow)
2061
+
2062
+ return all_columns, all_fields
2063
+
2064
+ def _coerce_table_columns(
2065
+ self, pa_table: pa.Table
2066
+ ) -> Tuple[List[pa.Array], List[pa.Field]]:
2067
+ """Process table columns using field coercion and add missing fields.
2068
+
2069
+ Args:
2070
+ pa_table: PyArrow table to process
2071
+
2072
+ Returns:
2073
+ Tuple of (list of coerced columns, list of corresponding fields)
2074
+ """
2075
+ # Create mapping from field names to Field objects
2076
+ field_name_to_field = self._create_field_name_mapping()
2077
+
2078
+ # Process existing columns in the table
2079
+ (
2080
+ processed_columns,
2081
+ processed_fields,
2082
+ ) = self._process_existing_columns_for_coercion(pa_table, field_name_to_field)
2083
+
2084
+ # Add any missing fields from target schema
2085
+ all_columns, all_fields = self._add_missing_fields_for_coercion(
2086
+ pa_table, field_name_to_field, processed_columns, processed_fields
2087
+ )
2088
+
2089
+ return all_columns, all_fields
2090
+
2091
+ def _reorder_columns_to_schema(
2092
+ self, columns: List[pa.Array], fields: List[pa.Field], original_table: pa.Table
2093
+ ) -> Tuple[List[pa.Array], List[pa.Field]]:
2094
+ """Reorder columns to match schema order, preserving extra fields.
2095
+
2096
+ Args:
2097
+ columns: List of processed columns
2098
+ fields: List of corresponding field schemas
2099
+ original_table: Original table for field name ordering
2100
+
2101
+ Returns:
2102
+ Tuple of (reordered columns, reordered fields)
2103
+ """
2104
+ # Reorder columns to match schema order
2105
+ reordered_columns = []
2106
+ reordered_fields = []
2107
+ schema_field_names = [
2108
+ field.arrow.name for field in self.field_ids_to_fields.values()
2109
+ ]
2110
+
2111
+ # Add schema fields in schema order
2112
+ for field_name in schema_field_names:
2113
+ for i, field in enumerate(fields):
2114
+ if field.name == field_name:
2115
+ reordered_columns.append(columns[i])
2116
+ reordered_fields.append(field)
2117
+ break
2118
+
2119
+ # Add any extra fields that aren't in schema (preserve original order)
2120
+ target_field_names = set(schema_field_names)
2121
+ table_field_names = set(original_table.column_names)
2122
+ extra_field_names = table_field_names - target_field_names
2123
+
2124
+ for field_name in original_table.column_names:
2125
+ if field_name in extra_field_names:
2126
+ for i, field in enumerate(fields):
2127
+ if field.name == field_name:
2128
+ reordered_columns.append(columns[i])
2129
+ reordered_fields.append(field)
2130
+ break
2131
+
2132
+ return reordered_columns, reordered_fields
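# Illustrative sketch (editor's addition): the column-ordering rule above,
# with plain PyArrow. Schema fields come first in schema order; extra,
# non-schema columns are appended afterwards in their original order.
# Column names are hypothetical.
import pyarrow as pa

table = pa.table({"extra": ["x"], "name": ["a"], "id": [1]})
schema_order = ["id", "name"]  # assumed schema field order

ordered = [c for c in schema_order if c in table.column_names]
ordered += [c for c in table.column_names if c not in schema_order]
print(table.select(ordered).column_names)  # ['id', 'name', 'extra']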
2133
+
2134
+ @staticmethod
2135
+ def _del_subschema(
2136
+ name: SchemaName,
2137
+ subschemas: Dict[SchemaName, Schema],
2138
+ ) -> Dict[SchemaName, Schema]:
2139
+ deleted_subschema = subschemas.pop(name, None)
2140
+ if deleted_subschema is None:
2141
+ raise ValueError(f"Subschema `{name}` does not exist.")
2142
+ return subschemas
2143
+
2144
+ @staticmethod
2145
+ def _add_subschema(
2146
+ name: SchemaName,
2147
+ schema: SingleSchema,
2148
+ subschemas: Dict[SchemaName, Schema],
2149
+ ) -> Dict[SchemaName, Schema]:
2150
+ Schema._validate_schema_name(name)
2151
+ if name == BASE_SCHEMA_NAME:
2152
+ raise ValueError(
2153
+ f"Cannot add subschema with reserved name: {BASE_SCHEMA_NAME}"
2154
+ )
2155
+ if name in subschemas:
2156
+ raise ValueError(f"Subschema `{name}` already exists.")
2157
+ for key, val in subschemas.items():
2158
+ subschemas[key] = val.arrow
2159
+ subschemas[name] = schema
2160
+ return subschemas
2161
+
2162
+
2163
+ class SchemaList(List[Schema]):
2164
+ @staticmethod
2165
+ def of(items: List[Schema]) -> SchemaList:
2166
+ typed_items = SchemaList()
2167
+ for item in items:
2168
+ if item is not None and not isinstance(item, Schema):
2169
+ item = Schema(item)
2170
+ typed_items.append(item)
2171
+ return typed_items
2172
+
2173
+ def __getitem__(self, item):
2174
+ val = super().__getitem__(item)
2175
+ if val is not None and not isinstance(val, Schema):
2176
+ self[item] = val = Schema(val)
2177
+ return val
2178
+
2179
+ def __iter__(self):
2180
+ for i in range(len(self)):
2181
+ yield self[i] # This triggers __getitem__ conversion
2182
+
2183
+
2184
+ class SchemaUpdate(dict):
2185
+ """
2186
+ Provides safe schema evolution capabilities for DeltaCAT schemas.
2187
+
2188
+ SchemaUpdate allows users to:
2189
+ 1. Add new fields to a schema
2190
+ 2. Remove existing fields from a schema
2191
+ 3. Update existing fields with compatible changes
2192
+ 4. Validate schema compatibility to prevent breaking existing dataset consumers
2193
+
2194
+ The class enforces backward compatibility by default to ensure that table
2195
+ consumer jobs written using PyArrow, Pandas, Polars, Ray Data, Daft, and other
2196
+ dataset types continue to work after schema changes.
2197
+
2198
+ Example:
2199
+ Using Schema.update():
2200
+ >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
2201
+ >>> new_field = Field.of(pa.field("name", pa.string()))
2202
+ >>> updated_schema = (schema.update()
2203
+ ... .add_field(new_field)
2204
+ ... .apply())
2205
+
2206
+ Using SchemaUpdate.of():
2207
+ >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
2208
+ >>> update = SchemaUpdate.of(schema)
2209
+ >>> new_field = Field.of(pa.field("name", pa.string()))
2210
+ >>> updated_schema = update.add_field(new_field).apply()
2211
+ """
2212
+
2213
+ @staticmethod
2214
+ def of(
2215
+ base_schema: Schema, allow_incompatible_changes: bool = False
2216
+ ) -> SchemaUpdate:
2217
+ """
2218
+ Create a SchemaUpdate for the given base schema.
2219
+
2220
+ Args:
2221
+ base_schema: The original schema to update
2222
+ allow_incompatible_changes: If True, allows changes that may break
2223
+ backward compatibility. If False (default), raises SchemaCompatibilityError
2224
+ for incompatible changes.
2225
+
2226
+ Returns:
2227
+ A new SchemaUpdate instance
2228
+ """
2229
+ return SchemaUpdate(
2230
+ {
2231
+ "baseSchema": base_schema,
2232
+ "allowIncompatibleChanges": allow_incompatible_changes,
2233
+ "operations": SchemaUpdateOperations.of([]),
2234
+ }
2235
+ )
2236
+
2237
+ @property
2238
+ def base_schema(self) -> Schema:
2239
+ """Get the base schema being updated."""
2240
+ return self["baseSchema"]
2241
+
2242
+ @base_schema.setter
2243
+ def base_schema(self, value: Schema) -> None:
2244
+ """Set the base schema being updated."""
2245
+ self["baseSchema"] = value
2246
+
2247
+ @property
2248
+ def allow_incompatible_changes(self) -> bool:
2249
+ """Get whether incompatible changes are allowed."""
2250
+ return self["allowIncompatibleChanges"]
2251
+
2252
+ @allow_incompatible_changes.setter
2253
+ def allow_incompatible_changes(self, value: bool) -> None:
2254
+ """Set whether incompatible changes are allowed."""
2255
+ self["allowIncompatibleChanges"] = value
2256
+
2257
+ @property
2258
+ def operations(self) -> SchemaUpdateOperations:
2259
+ """Get the list of pending operations."""
2260
+ return self["operations"]
2261
+
2262
+ @operations.setter
2263
+ def operations(self, value: SchemaUpdateOperations) -> None:
2264
+ """Set the list of pending operations."""
2265
+ self["operations"] = value
2266
+
2267
+ def add_field(
2268
+ self,
2269
+ new_field: Field,
2270
+ ) -> SchemaUpdate:
2271
+ """
2272
+ Add a new field to the schema.
2273
+
2274
+ Args:
2275
+ field_locator: Location identifier for the new field (name, nested path, or ID)
2276
+ new_field: The Field object to add
2277
+
2278
+ Returns:
2279
+ Self for method chaining
2280
+
2281
+ Raises:
2282
+ SchemaCompatibilityError: If field already exists or addition would break compatibility
2283
+ """
2284
+ self.operations.append(SchemaUpdateOperation.add_field(new_field))
2285
+ return self
2286
+
2287
+ def remove_field(self, field_locator: FieldLocator) -> SchemaUpdate:
2288
+ """
2289
+ Remove an existing field from the schema.
2290
+
2291
+ Args:
2292
+ field_locator: Location identifier for the field to remove
2293
+
2294
+ Returns:
2295
+ Self for method chaining
2296
+
2297
+ Raises:
2298
+ SchemaCompatibilityError: If field doesn't exist or removal would break compatibility
2299
+ """
2300
+ self.operations.append(SchemaUpdateOperation.remove_field(field_locator))
2301
+ return self
2302
+
2303
+ def rename_field(
2304
+ self,
2305
+ field_locator: FieldLocator,
2306
+ new_name: str,
2307
+ ) -> SchemaUpdate:
2308
+ """
2309
+ Rename an existing field while keeping all other properties the same.
2310
+
2311
+ Args:
2312
+ field_locator: Location identifier for the field to rename
2313
+ new_name: The new name for the field
2314
+
2315
+ Returns:
2316
+ Self for method chaining
2317
+
2318
+ Raises:
2319
+ SchemaCompatibilityError: If field doesn't exist or rename would break compatibility
2320
+ """
2321
+ # Get the existing field
2322
+ existing_field = self._get_existing_field(field_locator)
2323
+
2324
+ # Create a deep copy of the field
2325
+ updated_field = copy.deepcopy(existing_field)
2326
+
2327
+ # Update only the arrow field name
2328
+ updated_field["arrow"] = pa.field(
2329
+ new_name,
2330
+ existing_field.arrow.type,
2331
+ nullable=existing_field.arrow.nullable,
2332
+ metadata=existing_field.arrow.metadata,
2333
+ )
2334
+
2335
+ return self._update_field(field_locator, updated_field)
2336
+
2337
+ def update_field_type(
2338
+ self, field_locator: FieldLocator, new_type: pa.DataType
2339
+ ) -> SchemaUpdate:
2340
+ """
2341
+ Update the PyArrow data type of an existing field while keeping all other properties the same.
2342
+
2343
+ Args:
2344
+ field_locator: Location identifier for the field to update
2345
+ new_type: The new PyArrow data type for the field
2346
+
2347
+ Returns:
2348
+ Self for method chaining
2349
+
2350
+ Raises:
2351
+ SchemaCompatibilityError: If field doesn't exist or type change would break compatibility
2352
+ """
2353
+ # Get the existing field
2354
+ existing_field = self._get_existing_field(field_locator)
2355
+
2356
+ # Create a deep copy of the field
2357
+ updated_field = copy.deepcopy(existing_field)
2358
+
2359
+ # Update only the arrow field type
2360
+ updated_field["arrow"] = pa.field(
2361
+ existing_field.arrow.name,
2362
+ new_type,
2363
+ nullable=existing_field.arrow.nullable,
2364
+ metadata=existing_field.arrow.metadata,
2365
+ )
2366
+
2367
+ return self._update_field(field_locator, updated_field)
2368
+
2369
+ def update_field_doc(
2370
+ self,
2371
+ field_locator: FieldLocator,
2372
+ new_doc: Optional[str],
2373
+ ) -> SchemaUpdate:
2374
+ """
2375
+ Update the documentation of an existing field while keeping all other properties the same.
2376
+
2377
+ Args:
2378
+ field_locator: Location identifier for the field to update
2379
+ new_doc: The new documentation string for the field
2380
+
2381
+ Returns:
2382
+ Self for method chaining
2383
+
2384
+ Raises:
2385
+ SchemaCompatibilityError: If field doesn't exist
2386
+ """
2387
+ # Get the existing field
2388
+ existing_field = self._get_existing_field(field_locator)
2389
+
2390
+ # Create a deep copy of the field
2391
+ updated_field = copy.deepcopy(existing_field)
2392
+
2393
+ # Update the arrow field metadata to set/remove doc
2394
+ new_metadata = copy.deepcopy(existing_field.arrow.metadata)
2395
+ new_metadata.pop(FIELD_DOC_KEY_NAME, None)
2396
+ if new_doc is not None:
2397
+ new_metadata[FIELD_DOC_KEY_NAME] = new_doc
2398
+
2399
+ updated_field["arrow"] = pa.field(
2400
+ existing_field.arrow.name,
2401
+ existing_field.arrow.type,
2402
+ nullable=existing_field.arrow.nullable,
2403
+ metadata=new_metadata if new_metadata else None,
2404
+ )
2405
+
2406
+ return self._update_field(field_locator, updated_field)
2407
+
2408
+ def update_field_nullability(
2409
+ self, field_locator: FieldLocator, nullable: bool
2410
+ ) -> SchemaUpdate:
2411
+ """
2412
+ Update the nullability of an existing field while keeping all other properties the same.
2413
+
2414
+ Args:
2415
+ field_locator: Location identifier for the field to update
2416
+ nullable: Whether the field should allow null values
2417
+
2418
+ Returns:
2419
+ Self for method chaining
2420
+
2421
+ Raises:
2422
+ SchemaCompatibilityError: If field doesn't exist or nullability change would break compatibility
2423
+ """
2424
+ # Get the existing field
2425
+ existing_field = self._get_existing_field(field_locator)
2426
+
2427
+ # Create a deep copy of the field
2428
+ updated_field = copy.deepcopy(existing_field)
2429
+
2430
+ # Update only the arrow field nullability
2431
+ updated_field["arrow"] = pa.field(
2432
+ existing_field.arrow.name,
2433
+ existing_field.arrow.type,
2434
+ nullable=nullable,
2435
+ metadata=existing_field.arrow.metadata,
2436
+ )
2437
+
2438
+ return self._update_field(field_locator, updated_field)
2439
+
2440
+ def update_field_consistency_type(
2441
+ self,
2442
+ field_locator: FieldLocator,
2443
+ consistency_type: Optional[SchemaConsistencyType],
2444
+ ) -> SchemaUpdate:
2445
+ """
2446
+ Update the schema consistency type of an existing field while keeping all other properties the same.
2447
+
2448
+ Args:
2449
+ field_locator: Location identifier for the field to update
2450
+ consistency_type: The new schema consistency type for the field
2451
+
2452
+ Returns:
2453
+ Self for method chaining
2454
+
2455
+ Raises:
2456
+ SchemaCompatibilityError: If field doesn't exist
2457
+ """
2458
+ # Get the existing field
2459
+ existing_field = self._get_existing_field(field_locator)
2460
+
2461
+ # Create a deep copy of the field
2462
+ updated_field = copy.deepcopy(existing_field)
2463
+
2464
+ # Update the arrow field metadata to set/remove consistency type
2465
+ new_metadata = copy.deepcopy(existing_field.arrow.metadata)
2466
+ new_metadata.pop(FIELD_CONSISTENCY_TYPE_KEY_NAME, None)
2467
+
2468
+ if consistency_type is not None:
2469
+ new_metadata[FIELD_CONSISTENCY_TYPE_KEY_NAME] = consistency_type.value
2470
+
2471
+ updated_field["arrow"] = pa.field(
2472
+ existing_field.arrow.name,
2473
+ existing_field.arrow.type,
2474
+ nullable=existing_field.arrow.nullable,
2475
+ metadata=new_metadata if new_metadata else None,
2476
+ )
2477
+
2478
+ return self._update_field(field_locator, updated_field)
2479
+
2480
+ def update_field_future_default(
2481
+ self, field_locator: FieldLocator, future_default: Optional[Any]
2482
+ ) -> SchemaUpdate:
2483
+ """
2484
+ Update the future default value of an existing field while keeping all other properties the same.
2485
+ The future default is validated to ensure it's compatible with the field's data type.
2486
+
2487
+ Args:
2488
+ field_locator: Location identifier for the field to update
2489
+ future_default: The new future default value for the field
2490
+
2491
+ Returns:
2492
+ Self for method chaining
2493
+
2494
+ Raises:
2495
+ SchemaCompatibilityError: If field doesn't exist
2496
+ ValueError: If future_default is not compatible with the field's data type
2497
+ """
2498
+ # Get the existing field
2499
+ existing_field = self._get_existing_field(field_locator)
2500
+
2501
+ # Validate that the future_default is compatible with the field's type
2502
+ if future_default is not None:
2503
+ self._validate_default_value(existing_field.arrow.type, future_default)
2504
+
2505
+ # Create a deep copy of the field
2506
+ updated_field = copy.deepcopy(existing_field)
2507
+
2508
+ # Update the arrow field metadata to set/remove future default
2509
+ new_metadata = copy.deepcopy(existing_field.arrow.metadata)
2510
+ new_metadata.pop(FIELD_FUTURE_DEFAULT_KEY_NAME, None)
2511
+
2512
+ if future_default is not None:
2513
+ new_metadata[FIELD_FUTURE_DEFAULT_KEY_NAME] = _encode_metadata_value(
2514
+ future_default
2515
+ )
2516
+
2517
+ updated_field["arrow"] = pa.field(
2518
+ existing_field.arrow.name,
2519
+ existing_field.arrow.type,
2520
+ nullable=existing_field.arrow.nullable,
2521
+ metadata=new_metadata if new_metadata else None,
2522
+ )
2523
+
2524
+ return self._update_field(field_locator, updated_field)
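# Illustrative sketch (editor's addition): chaining the update_field_* helpers
# above in the doctest style already used in the SchemaUpdate docstring. The
# module path and the "id" field are assumptions for illustration.
import pyarrow as pa
from deltacat.storage.model.schema import Field, Schema

schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
updated = (
    schema.update()
    .update_field_doc("id", "primary identifier")
    .update_field_future_default("id", 0)
    .apply()
)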
2525
+
2526
+ def _update_field(
2527
+ self, field_locator: FieldLocator, updated_field: Field
2528
+ ) -> SchemaUpdate:
2529
+ """
2530
+ Update an existing field with compatible changes. This is the protected method
2531
+ that handles the general case of field updates.
2532
+
2533
+ Args:
2534
+ field_locator: Location identifier for the field to update
2535
+ updated_field: The new Field object to replace the existing field
2536
+
2537
+ Returns:
2538
+ Self for method chaining
2539
+
2540
+ Raises:
2541
+ SchemaCompatibilityError: If field doesn't exist or update would break compatibility
2542
+ """
2543
+ self.operations.append(
2544
+ SchemaUpdateOperation.update_field(field_locator, updated_field)
2545
+ )
2546
+ return self
2547
+
2548
+ def _get_existing_field(self, field_locator: FieldLocator) -> Field:
2549
+ """
2550
+ Helper method to retrieve an existing field, accounting for pending operations.
2551
+
2552
+ Args:
2553
+ field_locator: Location identifier for the field to retrieve
2554
+
2555
+ Returns:
2556
+ The existing Field object (with any pending updates applied)
2557
+
2558
+ Raises:
2559
+ SchemaCompatibilityError: If field doesn't exist
2560
+ """
2561
+ field_name = self._get_field_name(field_locator)
2562
+ # Search for the field in the base schema
2563
+ base_field = None
2564
+ for field in self.base_schema.fields:
2565
+ field_field_name = field.path[0] if field.path else f"field_{field.id}"
2566
+ if field_field_name == field_name:
2567
+ base_field = field
2568
+ break
2569
+
2570
+ if base_field is None:
2571
+ # Field not found
2572
+ raise SchemaCompatibilityError(
2573
+ f"Field '{field_name}' does not exist in schema", field_locator
2574
+ )
2575
+
2576
+ # Apply any pending operations that affect this field to get the current state
2577
+ current_field = copy.deepcopy(base_field)
2578
+
2579
+ for operation in self.operations:
2580
+ if operation.field_locator_matches(field_locator):
2581
+ # Apply this operation to get the cumulative state
2582
+ current_field = operation.field
2583
+
2584
+ return current_field
2585
+
2586
+ def _validate_default_value(
2587
+ self, arrow_type: pa.DataType, default_value: Any
2588
+ ) -> None:
2589
+ """
2590
+ Helper method to validate that a default value is compatible with a PyArrow data type.
2591
+
2592
+ Args:
2593
+ arrow_type: The PyArrow data type to validate against
2594
+ default_value: The default value to validate
2595
+
2596
+ Raises:
2597
+ ValueError: If the default value is not compatible with the data type
2598
+ """
2599
+ try:
2600
+ # Try to create a PyArrow array with the default value to validate compatibility
2601
+ pa.array([default_value], type=arrow_type)
2602
+ except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError, ValueError) as e:
2603
+ raise ValueError(
2604
+ f"Default value {default_value} is not compatible with type {arrow_type}: {e}"
2605
+ )
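# Illustrative sketch (editor's addition): the validation above lets PyArrow
# itself decide whether a default value fits a type, by building a one-element
# array that either succeeds or raises. Standalone version of that probe:
import pyarrow as pa

def is_valid_default(value, arrow_type: pa.DataType) -> bool:
    try:
        pa.array([value], type=arrow_type)
        return True
    except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError, ValueError):
        return False

assert is_valid_default(0, pa.int64())
assert not is_valid_default("zero", pa.int64())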
2606
+
2607
+ def apply(self) -> Schema:
2608
+ """
2609
+ Apply all pending operations and return the updated schema.
2610
+
2611
+ Returns:
2612
+ New Schema object with all updates applied
2613
+
2614
+ Raises:
2615
+ SchemaCompatibilityError: If any operation would break backward compatibility
2616
+ and allow_incompatible_changes is False
2617
+ """
2618
+ # Start with a copy of the base schema
2619
+ updated_fields = list(self.base_schema.fields)
2620
+ field_name_to_index = {
2621
+ field.path[0] if field.path else f"field_{field.id}": i
2622
+ for i, field in enumerate(updated_fields)
2623
+ }
2624
+
2625
+ # Track next available field ID for new fields during schema evolution
2626
+ next_available_field_id = self.base_schema.max_field_id + 1
2627
+ if next_available_field_id >= MAX_FIELD_ID_EXCLUSIVE:
2628
+ # Just raise an error instead of wrapping to 0, since this
2629
+ # breaks our guarantee of unique field IDs across schema
2630
+ # evolution history (e.g., we may overflow on a schema with IDs
2631
+ # 0-1MM or 2, 10, etc. already assigned).
2632
+ raise SchemaCompatibilityError(
2633
+ f"Schema Field ID overflow: {next_available_field_id} >= {MAX_FIELD_ID_EXCLUSIVE}",
2634
+ )
2635
+
2636
+ # Validate no conflicting operations before applying
2637
+ self._validate_no_conflicting_operations()
2638
+
2639
+ # Apply operations in order
2640
+ for operation in self.operations:
2641
+ if operation.operation == "add":
2642
+ next_available_field_id = self._apply_add_field(
2643
+ updated_fields,
2644
+ field_name_to_index,
2645
+ operation.field,
2646
+ next_available_field_id,
2647
+ )
2648
+ elif operation.operation == "remove":
2649
+ self._apply_remove_field(
2650
+ updated_fields,
2651
+ field_name_to_index,
2652
+ operation.field_locator,
2653
+ )
2654
+ elif operation.operation == "update":
2655
+ self._apply_update_field(
2656
+ updated_fields,
2657
+ field_name_to_index,
2658
+ operation.field_locator,
2659
+ operation.field,
2660
+ )
2661
+
2662
+ # Create new schema from updated fields with incremented schema ID
2663
+ new_schema = Schema.of(updated_fields, schema_id=self.base_schema.id + 1)
2664
+
2665
+ # Ensure max_field_id never decreases, even when fields are removed
2666
+ # This prevents field ID reuse across schema evolution history
2667
+ if new_schema.max_field_id < self.base_schema.max_field_id:
2668
+ new_schema["maxFieldId"] = self.base_schema.max_field_id
2669
+
2670
+ return new_schema
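# Illustrative sketch (editor's addition): an end-to-end run of the update
# flow implemented by apply() above, in the doctest style of the SchemaUpdate
# docstring. The module path and field names are assumptions.
import pyarrow as pa
from deltacat.storage.model.schema import Field, Schema

base = Schema.of([Field.of(pa.field("id", pa.int64()))])
evolved = (
    base.update()
    .add_field(Field.of(pa.field("name", pa.string())))  # nullable, so compatible
    .apply()
)
print([f.arrow.name for f in evolved.fields])  # ['id', 'name']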
2671
+
2672
+ def _validate_no_conflicting_operations(self) -> None:
2673
+ """Validate that operations don't conflict with each other."""
2674
+ field_operations = {} # field_name -> set of operations
2675
+
2676
+ for operation in self.operations:
2677
+ field_name = None
2678
+
2679
+ if operation.operation == "add" and operation.field:
2680
+ field_name = operation.field.arrow.name
2681
+ elif (
2682
+ operation.operation in ("remove", "update") and operation.field_locator
2683
+ ):
2684
+ # Extract field name from locator
2685
+ if isinstance(operation.field_locator, str):
2686
+ field_name = operation.field_locator
2687
+ elif hasattr(operation.field_locator, "name"):
2688
+ field_name = operation.field_locator.name
2689
+ elif (
2690
+ isinstance(operation.field_locator, list)
2691
+ and operation.field_locator
2692
+ ):
2693
+ field_name = operation.field_locator[0]
2694
+
2695
+ if field_name:
2696
+ if field_name not in field_operations:
2697
+ field_operations[field_name] = set()
2698
+ field_operations[field_name].add(operation.operation)
2699
+
2700
+ # Check for conflicting operations on same field
2701
+ for field_name, operations in field_operations.items():
2702
+ if len(operations) > 1:
2703
+ unique_ops = set(operations)
2704
+ # Allow multiple update operations on same field (they are cumulative)
2705
+ if unique_ops == {"update"}:
2706
+ continue # Multiple updates on same field are allowed
2707
+ # Any other combination is conflicting
2708
+ message_suffix = f"Cannot perform {', '.join(sorted(unique_ops))} operations on the same field"
2709
+
2710
+ raise ValueError(
2711
+ f"Conflicting operations detected on field '{field_name}': {sorted(operations)}. "
2712
+ f"{message_suffix}."
2713
+ )
2714
+
2715
+ def _apply_add_field(
2716
+ self,
2717
+ fields: List[Field],
2718
+ field_name_to_index: Dict[str, int],
2719
+ new_field: Field,
2720
+ next_available_field_id: int,
2721
+ ) -> int:
2722
+ """Apply add field operation with compatibility validation.
2723
+
2724
+ Args:
2725
+ fields: List of existing fields to append to
2726
+ field_name_to_index: Mapping of field names to indices
2727
+ new_field: The field to add (user-specified field_id will be ignored)
2728
+ next_available_field_id: The next available field ID to assign
2729
+
2730
+ Returns:
2731
+ The next available field ID for subsequent operations
2732
+ """
2733
+ field_name = new_field.arrow.name
2734
+
2735
+ # Check if field already exists
2736
+ if field_name in field_name_to_index:
2737
+ raise SchemaCompatibilityError(
2738
+ f"Field '{field_name}' already exists in schema",
2739
+ )
2740
+
2741
+ # Validate compatibility for new field
2742
+ if not self.allow_incompatible_changes:
2743
+ self._validate_add_field_compatibility(new_field)
2744
+
2745
+ # For add operations, ignore user-specified field ID and auto-assign
2746
+ auto_assigned_field_id = next_available_field_id
2747
+
2748
+ # Create a copy of the field with auto-assigned field ID and correct path
2749
+ field_with_auto_id = Field.of(
2750
+ new_field.arrow,
2751
+ field_id=auto_assigned_field_id,
2752
+ is_merge_key=new_field.is_merge_key,
2753
+ merge_order=new_field.merge_order,
2754
+ is_event_time=new_field.is_event_time,
2755
+ doc=new_field.doc,
2756
+ past_default=new_field.past_default,
2757
+ future_default=new_field.future_default,
2758
+ consistency_type=new_field.consistency_type,
2759
+ path=[field_name],
2760
+ native_object=new_field.native_object,
2761
+ )
2762
+
2763
+ # Add the field
2764
+ fields.append(field_with_auto_id)
2765
+ field_name_to_index[field_name] = len(fields) - 1
2766
+
2767
+ # Return next available field ID
2768
+ return next_available_field_id + 1
2769
+
2770
+ def _apply_remove_field(
2771
+ self,
2772
+ fields: List[Field],
2773
+ field_name_to_index: Dict[str, int],
2774
+ field_locator: FieldLocator,
2775
+ ) -> None:
2776
+ """Apply remove field operation with compatibility validation."""
2777
+ field_name = self._get_field_name(field_locator)
2778
+
2779
+ # Check if field exists
2780
+ if field_name not in field_name_to_index:
2781
+ raise SchemaCompatibilityError(
2782
+ f"Field '{field_name}' does not exist in schema", field_locator
2783
+ )
2784
+
2785
+ # Validate compatibility for field removal
2786
+ if not self.allow_incompatible_changes:
2787
+ field_index = field_name_to_index[field_name]
2788
+ self._validate_remove_field_compatibility(
2789
+ fields[field_index], field_locator
2790
+ )
2791
+
2792
+ # Remove the field
2793
+ field_index = field_name_to_index[field_name]
2794
+ fields.pop(field_index)
2795
+
2796
+ # Update indices
2797
+ del field_name_to_index[field_name]
2798
+ for name, index in field_name_to_index.items():
2799
+ if index > field_index:
2800
+ field_name_to_index[name] = index - 1
2801
+
2802
+ def _apply_update_field(
2803
+ self,
2804
+ fields: List[Field],
2805
+ field_name_to_index: Dict[str, int],
2806
+ field_locator: FieldLocator,
2807
+ updated_field: Field,
2808
+ ) -> None:
2809
+ """Apply update field operation with compatibility validation."""
2810
+ field_name = self._get_field_name(field_locator)
2811
+
2812
+ # Check if field exists
2813
+ if field_name not in field_name_to_index:
2814
+ raise SchemaCompatibilityError(
2815
+ f"Field '{field_name}' does not exist in schema", field_locator
2816
+ )
2817
+
2818
+ field_index = field_name_to_index[field_name]
2819
+ old_field = fields[field_index]
2820
+
2821
+ # Validate compatibility for field update
2822
+ if not self.allow_incompatible_changes:
2823
+ self._validate_update_field_compatibility(
2824
+ old_field, updated_field, field_locator
2825
+ )
2826
+
2827
+ # Get the new field name from the updated field
2828
+ new_field_name = updated_field.arrow.name
2829
+
2830
+ # Create a copy of the updated field with the correct path
2831
+ field_with_path = Field.of(
2832
+ updated_field.arrow,
2833
+ field_id=updated_field.id,
2834
+ is_merge_key=updated_field.is_merge_key,
2835
+ merge_order=updated_field.merge_order,
2836
+ is_event_time=updated_field.is_event_time,
2837
+ doc=updated_field.doc,
2838
+ past_default=updated_field.past_default,
2839
+ future_default=updated_field.future_default,
2840
+ consistency_type=updated_field.consistency_type,
2841
+ path=[new_field_name],
2842
+ native_object=updated_field.native_object,
2843
+ )
2844
+
2845
+ # Update the field
2846
+ fields[field_index] = field_with_path
2847
+
2848
+ # If field name changed (rename), update the mapping
2849
+ if field_name != new_field_name:
2850
+ del field_name_to_index[field_name]
2851
+ field_name_to_index[new_field_name] = field_index
2852
+
2853
+ def _get_field_name(self, field_locator: FieldLocator) -> str:
2854
+ """Extract field name from various field locator types."""
2855
+ if isinstance(field_locator, str):
2856
+ return field_locator
2857
+ elif isinstance(field_locator, list):
2858
+ return field_locator[0] if field_locator else ""
2859
+ elif isinstance(field_locator, int):
2860
+ # For field ID, try to find the corresponding field
2861
+ try:
2862
+ field = self.base_schema.field(field_locator)
2863
+ return field.path[0] if field.path else f"field_{field_locator}"
2864
+ except Exception:
2865
+ return f"field_{field_locator}"
2866
+ else:
2867
+ raise ValueError(f"Invalid field locator type: {type(field_locator)}")
2868
+
2869
+ @staticmethod
2870
+ def _field_locators_match(locator1: FieldLocator, locator2: FieldLocator) -> bool:
2871
+ """Check if two field locators refer to the same field."""
2872
+ # For simplicity, convert both to string names and compare
2873
+ # This works because we primarily use field names in our operations
2874
+ if isinstance(locator1, str) and isinstance(locator2, str):
2875
+ return locator1 == locator2
2876
+ elif isinstance(locator1, list) and isinstance(locator2, list):
2877
+ return locator1 == locator2
2878
+ elif isinstance(locator1, int) and isinstance(locator2, int):
2879
+ return locator1 == locator2
2880
+ else:
2881
+ # Convert to strings and compare (this is a simplified approach)
2882
+ str1 = (
2883
+ locator1
2884
+ if isinstance(locator1, str)
2885
+ else (
2886
+ locator1[0]
2887
+ if isinstance(locator1, list) and locator1
2888
+ else str(locator1)
2889
+ )
2890
+ )
2891
+ str2 = (
2892
+ locator2
2893
+ if isinstance(locator2, str)
2894
+ else (
2895
+ locator2[0]
2896
+ if isinstance(locator2, list) and locator2
2897
+ else str(locator2)
2898
+ )
2899
+ )
2900
+ return str1 == str2
2901
+
2902
+ def _validate_add_field_compatibility(self, new_field: Field) -> None:
2903
+ """Validate that adding a new field won't break compatibility."""
2904
+ field_name = new_field.arrow.name
2905
+ arrow_field = new_field.arrow
2906
+
2907
+ # Check if field is nullable or has default values
2908
+ is_nullable = arrow_field.nullable
2909
+ has_past_default = new_field.past_default is not None
2910
+ has_future_default = new_field.future_default is not None
2911
+
2912
+ if not (is_nullable or has_past_default or has_future_default):
2913
+ raise SchemaCompatibilityError(
2914
+ f"Adding non-nullable field '{field_name}' without "
2915
+ f"default values would break compatibility with existing data",
2916
+ )
2917
+
2918
+    def _validate_remove_field_compatibility(
+        self, field: Field, field_locator: FieldLocator
+    ) -> None:
+        """Validate that removing a field won't break compatibility."""
+        field_name = self._get_field_name(field_locator)
+
+        # Check for protected field types that should never be removed
+        if field.is_merge_key:
+            raise SchemaCompatibilityError(
+                f"Cannot remove merge key field '{field_name}'. "
+                f"Merge keys are critical for data integrity and cannot be removed.",
+                field_locator,
+            )
+
+        if field.merge_order is not None:
+            raise SchemaCompatibilityError(
+                f"Cannot remove merge order field '{field_name}'. "
+                f"Fields with merge_order are critical for data ordering and cannot be removed.",
+                field_locator,
+            )
+
+        if field.is_event_time:
+            raise SchemaCompatibilityError(
+                f"Cannot remove event time field '{field_name}'. "
+                f"Event time fields are critical for temporal operations and cannot be removed.",
+                field_locator,
+            )
+
+        # Removing fields generally breaks compatibility for consumers expecting them
+        raise SchemaCompatibilityError(
+            f"Removing field '{field_name}' would break compatibility with existing consumers. "
+            f"Set allow_incompatible_changes=True to force removal.",
+            field_locator,
+        )
+
+    def _validate_update_field_compatibility(
+        self, old_field: Field, new_field: Field, field_locator: FieldLocator
+    ) -> None:
+        """Validate that updating a field won't break compatibility."""
+        old_arrow = old_field.arrow
+        new_arrow = new_field.arrow
+        field_name = self._get_field_name(field_locator)
+
+        # Protect critical field attributes that should never be changed
+        if old_field.is_merge_key != new_field.is_merge_key:
+            raise SchemaCompatibilityError(
+                f"Cannot change merge key status for field '{field_name}'. "
+                f"Merge key designation is critical for data integrity and cannot be modified.",
+                field_locator,
+            )
+
+        if old_field.merge_order != new_field.merge_order:
+            raise SchemaCompatibilityError(
+                f"Cannot change merge order for field '{field_name}'. "
+                f"Merge order is critical for data consistency and cannot be modified.",
+                field_locator,
+            )
+
+        if old_field.is_event_time != new_field.is_event_time:
+            raise SchemaCompatibilityError(
+                f"Cannot change event time status for field '{field_name}'. "
+                f"Event time designation is critical for temporal operations and cannot be modified.",
+                field_locator,
+            )
+
+        # Validate schema consistency type evolution rules
+        self._validate_consistency_type_evolution(old_field, new_field, field_locator)
+
+        # Protect past_default immutability
+        if old_field.past_default != new_field.past_default:
+            raise SchemaCompatibilityError(
+                f"Cannot change past_default for field '{field_name}'. "
+                f"The past_default value is immutable once set to maintain data consistency.",
+                field_locator,
+            )
+
+        # Check for duplicate field IDs (if field ID is being changed)
+        if old_field.id != new_field.id and new_field.id is not None:
+            existing_field_ids = {
+                f.id
+                for f in self.base_schema.fields
+                if f.id is not None and f != old_field
+            }
+            if new_field.id in existing_field_ids:
+                raise SchemaCompatibilityError(
+                    f"Cannot update field '{field_name}' to use duplicate field ID {new_field.id}. "
+                    f"Field IDs must be unique across all fields in the schema.",
+                    field_locator,
+                )
+
+        # Check data type compatibility
+        if not self._is_type_compatible(old_arrow.type, new_arrow.type):
+            raise SchemaCompatibilityError(
+                f"Cannot change field '{field_name}' from {old_arrow.type} to {new_arrow.type}. "
+                f"This change would break compatibility with PyArrow, Pandas, Polars, Ray Data, and Daft.",
+                field_locator,
+            )
+
+        # Check nullability - making a field non-nullable is incompatible
+        if old_arrow.nullable and not new_arrow.nullable:
+            # Only allow if we have past/future defaults to fill null values
+            has_past_default = new_field.past_default is not None
+            has_future_default = new_field.future_default is not None
+
+            if not (has_past_default and has_future_default):
+                raise SchemaCompatibilityError(
+                    f"Cannot make nullable field '{field_name}' non-nullable without "
+                    f"providing both past_default and future_default values",
+                    field_locator,
+                )
+
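One easy-to-miss rule in the update path above: relaxing nullability (non-nullable to nullable) passes the nullability check untouched, but tightening it requires both a past_default (to backfill existing nulls) and a future_default. A minimal standalone predicate capturing just that check (names are illustrative; this ignores the other guards such as past_default immutability):

def nullability_change_ok(old_nullable: bool, new_nullable: bool,
                          past_default, future_default) -> bool:
    """True if the nullability change passes the nullability check above."""
    if not old_nullable or new_nullable:
        return True  # staying non-nullable, or relaxing to nullable, is allowed
    # nullable -> non-nullable: both defaults must be provided
    return past_default is not None and future_default is not None

assert nullability_change_ok(False, True, None, None)   # relaxing: OK
assert not nullability_change_ok(True, False, 0, None)  # tightening with only past_default: rejected
assert nullability_change_ok(True, False, 0, 0)         # tightening with both defaults: OK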
+    def _validate_consistency_type_evolution(
+        self, old_field: Field, new_field: Field, field_locator: FieldLocator
+    ) -> None:
+        """
+        Validate schema consistency type evolution rules.
+
+        Allowed transitions:
+        - COERCE -> VALIDATE
+        - VALIDATE -> COERCE
+        - COERCE -> NONE
+        - VALIDATE -> NONE
+
+        Forbidden transitions:
+        - NONE -> COERCE
+        - NONE -> VALIDATE
+        """
+        old_type = old_field.consistency_type
+        new_type = new_field.consistency_type
+        field_name = self._get_field_name(field_locator)
+
+        # If types are the same, no validation needed
+        if old_type == new_type:
+            return
+
+        # Handle Python None values (treat as no consistency type set)
+        if old_type is None and new_type is None:
+            return
+
+        # Allow transitions from any type to NONE or unset (relaxing constraints)
+        if new_type == SchemaConsistencyType.NONE or new_type is None:
+            return
+
+        # Allow transitions between COERCE and VALIDATE (bidirectional)
+        if old_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ) and new_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ):
+            return
+
+        # Allow transitions from an unset type (Python None) to COERCE or VALIDATE
+        # (adding constraints to a field that never declared one)
+        if old_type is None and new_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ):
+            return
+
+        # Forbid transitions from an explicit NONE to COERCE or VALIDATE
+        # (tightening constraints)
+        if old_type == SchemaConsistencyType.NONE and new_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ):
+            raise SchemaCompatibilityError(
+                f"Cannot change consistency type for field '{field_name}' from {old_type.value} to {new_type.value}. "
+                f"Transitioning from NONE to {new_type.value} would tighten validation constraints "
+                f"and potentially break existing data processing.",
+                field_locator,
+            )
+
+        # If we get here, it's an unexpected combination
+        raise SchemaCompatibilityError(
+            f"Invalid consistency type transition for field '{field_name}' from "
+            f"{old_type.value if old_type else 'None'} to {new_type.value if new_type else 'None'}.",
+            field_locator,
+        )
+
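The transition rules read as a small matrix: any type may relax to NONE (or stay unset), COERCE and VALIDATE are interchangeable, an unset type may adopt either, but an explicit NONE may not be tightened. A compact standalone encoding of that matrix (illustrative; plain strings stand in for SchemaConsistencyType members, and Python None represents "unset"):

def transition_allowed(old, new) -> bool:
    """True if the consistency-type transition passes the validation above."""
    if old == new:
        return True
    if new in ("NONE", None):
        return True   # relaxing constraints is always allowed
    if old in ("COERCE", "VALIDATE") and new in ("COERCE", "VALIDATE"):
        return True   # COERCE <-> VALIDATE is bidirectional
    if old is None and new in ("COERCE", "VALIDATE"):
        return True   # adding constraints to an unset field is allowed
    return False      # NONE -> COERCE / NONE -> VALIDATE is rejected

assert transition_allowed("COERCE", "VALIDATE")
assert transition_allowed("VALIDATE", "NONE")
assert transition_allowed(None, "COERCE")
assert not transition_allowed("NONE", "VALIDATE")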
+    def _is_type_compatible(self, old_type: pa.DataType, new_type: pa.DataType) -> bool:
+        """
+        Check if changing from old_type to new_type is backward compatible.
+
+        Compatible changes include:
+        - Same type
+        - Widening numeric types within the same signedness (int32 -> int64, float32 -> float64)
+        - Promoting integer types to floating-point types
+        - Adding fields to struct types (existing fields must remain compatible)
+        - List and map types whose value/key/item types are themselves compatible
+        """
+        # Same type is always compatible
+        if old_type.equals(new_type):
+            return True
+
+        # Numeric type widening
+        if pa.types.is_integer(old_type) and pa.types.is_integer(new_type):
+            # Check bit width and signedness using string representation
+            old_signed = "int" in str(old_type) and "uint" not in str(old_type)
+            new_signed = "int" in str(new_type) and "uint" not in str(new_type)
+            return new_type.bit_width >= old_type.bit_width and old_signed == new_signed
+
+        if pa.types.is_floating(old_type) and pa.types.is_floating(new_type):
+            return new_type.bit_width >= old_type.bit_width
+
+        # Integer to float promotion
+        if pa.types.is_integer(old_type) and pa.types.is_floating(new_type):
+            return True
+
+        # String/binary type compatibility
+        if pa.types.is_string(old_type) and pa.types.is_string(new_type):
+            return True
+        if pa.types.is_binary(old_type) and pa.types.is_binary(new_type):
+            return True
+
+        # Struct type compatibility (new fields can be added)
+        if pa.types.is_struct(old_type) and pa.types.is_struct(new_type):
+            old_names = {field.name for field in old_type}
+            new_names = {field.name for field in new_type}
+
+            # All old fields must exist in new type
+            if not old_names.issubset(new_names):
+                return False
+
+            # Check compatibility of common fields
+            for old_field in old_type:
+                new_field = new_type.field(old_field.name)
+                if not self._is_type_compatible(old_field.type, new_field.type):
+                    return False
+
+            return True
+
+        # List type compatibility
+        if pa.types.is_list(old_type) and pa.types.is_list(new_type):
+            return self._is_type_compatible(old_type.value_type, new_type.value_type)
+
+        # Map type compatibility
+        if pa.types.is_map(old_type) and pa.types.is_map(new_type):
+            return self._is_type_compatible(
+                old_type.key_type, new_type.key_type
+            ) and self._is_type_compatible(old_type.item_type, new_type.item_type)
+
+        # Default: types are incompatible
+        return False
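The struct branch is the least obvious rule above: a struct may gain fields, and its existing fields may widen recursively, but it may not lose fields. The sketch below re-implements a subset of those rules (equality, integer widening, int-to-float promotion, struct growth) as a standalone function so the expected outcomes can be checked with plain PyArrow types; `widen_ok` is an illustrative stand-in, not the package method.

import pyarrow as pa

def widen_ok(old: pa.DataType, new: pa.DataType) -> bool:
    """Standalone subset of the compatibility rules above."""
    if old.equals(new):
        return True
    if pa.types.is_integer(old) and pa.types.is_integer(new):
        same_sign = pa.types.is_signed_integer(old) == pa.types.is_signed_integer(new)
        return same_sign and new.bit_width >= old.bit_width
    if pa.types.is_integer(old) and pa.types.is_floating(new):
        return True
    if pa.types.is_struct(old) and pa.types.is_struct(new):
        new_names = {f.name for f in new}
        # every old field must survive and remain (recursively) compatible
        return all(
            f.name in new_names and widen_ok(f.type, new.field(f.name).type)
            for f in old
        )
    return False

old = pa.struct([("id", pa.int32()), ("name", pa.string())])
grown = pa.struct([("id", pa.int64()), ("name", pa.string()), ("email", pa.string())])
shrunk = pa.struct([("id", pa.int32())])

assert widen_ok(old, grown)                   # added field + widened id: compatible
assert not widen_ok(old, shrunk)              # dropped field: incompatible
assert not widen_ok(pa.int64(), pa.int32())   # narrowing: incompatible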