deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1925 @@
1
+ """
2
+ Tests for SchemaUpdate functionality.
3
+
4
+ Note: These tests are in a separate file from test_schema.py to avoid test contamination issues.
5
+ Some tests in test_schema.py appear to modify global state that affects SchemaUpdate tests
6
+ when run together. Running these tests in isolation ensures they pass consistently.
7
+
8
+ To run both test suites together successfully, run SchemaUpdate tests first:
9
+ pytest test_schema_update.py test_schema.py
10
+ """
11
+
12
+ import pytest
13
+ import pyarrow as pa
14
+
15
+ from deltacat.storage.model.schema import (
16
+ Schema,
17
+ Field,
18
+ SchemaUpdate,
19
+ MAX_FIELD_ID_EXCLUSIVE,
20
+ )
21
+ from deltacat.storage.model.types import SchemaConsistencyType, SortOrder
22
+ from deltacat.storage.model.schema import MergeOrder
23
+ from deltacat.exceptions import SchemaCompatibilityError
24
+
25
+
26
@pytest.fixture(scope="function")
def base_schema():
    """Minimal three-column schema used by most SchemaUpdate tests.

    Columns: "id" (int64 merge key, non-nullable), "name" (nullable string),
    "age" (nullable int32), with explicit field ids 1-3.
    """
    fields = [
        Field.of(
            pa.field("id", pa.int64(), nullable=False),
            field_id=1,
            is_merge_key=True,
        ),
        Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
        Field.of(pa.field("age", pa.int32(), nullable=True), field_id=3),
    ]
    return Schema.of(fields)
40
+
41
+
42
@pytest.fixture(scope="function")
def complex_schema():
    """Schema with a nested struct column for advanced SchemaUpdate tests."""
    # Nested struct column exercises non-primitive arrow types.
    metadata_type = pa.struct(
        [
            pa.field("created_at", pa.timestamp("us")),
            pa.field("tags", pa.list_(pa.string())),
        ]
    )
    return Schema.of(
        [
            Field.of(
                pa.field("user_id", pa.int64(), nullable=False),
                field_id=1,
                is_merge_key=True,
            ),
            Field.of(pa.field("email", pa.string(), nullable=False), field_id=2),
            Field.of(pa.field("score", pa.float32(), nullable=True), field_id=3),
            Field.of(
                pa.field("metadata", metadata_type, nullable=True),
                field_id=4,
            ),
        ]
    )
69
+
70
+
71
@pytest.fixture(scope="function")
def protected_fields_schema():
    """Schema whose fields carry protection-related attributes.

    Covers a merge key, an event-time field, a field with a merge order,
    and a field with a past default plus COERCE consistency.
    """
    merge_key = Field.of(
        pa.field("id", pa.int64(), nullable=False),
        field_id=1,
        is_merge_key=True,
    )
    # Use int64 for event time.
    event_time = Field.of(
        pa.field("timestamp", pa.int64(), nullable=False),
        field_id=2,
        is_event_time=True,
    )
    ordered = Field.of(
        pa.field("priority", pa.int32(), nullable=True),
        field_id=3,
        merge_order=MergeOrder.of(SortOrder.ASCENDING),
    )
    defaulted = Field.of(
        pa.field("data", pa.string(), nullable=True),
        field_id=4,
        past_default="default",
        consistency_type=SchemaConsistencyType.COERCE,
    )
    return Schema.of([merge_key, event_time, ordered, defaulted])
99
+
100
+
101
+ class TestSchemaUpdate:
102
+ """Comprehensive tests for SchemaUpdate class."""
103
+
104
def test_init(self, base_schema):
    """SchemaUpdate.of starts with no operations and honors the flag."""
    schema_update = SchemaUpdate.of(base_schema)
    assert schema_update.base_schema == base_schema
    assert not schema_update.allow_incompatible_changes
    assert len(schema_update.operations) == 0

    # The permissive flag is carried through verbatim.
    permissive = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
    assert permissive.allow_incompatible_changes
115
+
116
def test_add_field_success(self, base_schema):
    """A new nullable field can be appended without compatibility errors."""
    email_field = Field.of(pa.field("email", pa.string(), nullable=True), field_id=4)

    result_schema = SchemaUpdate.of(base_schema).add_field(email_field).apply()

    assert len(result_schema.fields) == 4
    # The added field keeps its name, type, nullability, and id.
    added = result_schema.field("email")
    assert added.arrow.name == "email"
    assert added.arrow.type == pa.string()
    assert added.arrow.nullable is True
    assert added.id == 4
    # Original fields preserved
    assert result_schema.field("id") == base_schema.field("id")
133
+
134
def test_add_field_with_past_default(self, base_schema):
    """A non-nullable field may be added when it supplies a past_default."""
    status_field = Field.of(
        pa.field("status", pa.string(), nullable=False),
        field_id=4,
        past_default="active",
    )

    result_schema = SchemaUpdate.of(base_schema).add_field(status_field).apply()

    assert len(result_schema.fields) == 4
    # The added field keeps its declared properties, including the default.
    added = result_schema.field("status")
    assert added.arrow.name == "status"
    assert added.arrow.type == pa.string()
    assert added.arrow.nullable is False
    assert added.id == 4
    assert added.past_default == "active"
153
+
154
def test_add_field_with_future_default(self, base_schema):
    """A non-nullable field may be added when it supplies a future_default."""
    priority_field = Field.of(
        pa.field("priority", pa.int32(), nullable=False),
        field_id=4,
        future_default=1,
    )

    result_schema = SchemaUpdate.of(base_schema).add_field(priority_field).apply()

    assert len(result_schema.fields) == 4
    # The added field keeps its declared properties, including the default.
    added = result_schema.field("priority")
    assert added.arrow.name == "priority"
    assert added.arrow.type == pa.int32()
    assert added.arrow.nullable is False
    assert added.id == 4
    assert added.future_default == 1
173
+
174
def test_add_field_non_nullable_without_defaults_fails(self, base_schema):
    """Adding a non-nullable field with no defaults is rejected."""
    required = Field.of(
        pa.field("required_field", pa.string(), nullable=False), field_id=4
    )

    with pytest.raises(SchemaCompatibilityError) as exc_info:
        SchemaUpdate.of(base_schema).add_field(required).apply()

    message = str(exc_info.value)
    assert "non-nullable field" in message
    assert "without default values" in message
186
+
187
def test_add_field_non_nullable_allowed_with_flag(self, base_schema):
    """allow_incompatible_changes=True permits a non-nullable add."""
    required = Field.of(
        pa.field("required_field", pa.string(), nullable=False), field_id=4
    )

    permissive = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
    result_schema = permissive.add_field(required).apply()

    assert len(result_schema.fields) == 4
    # The added field keeps its declared properties.
    added = result_schema.field("required_field")
    assert added.arrow.name == "required_field"
    assert added.arrow.type == pa.string()
    assert added.arrow.nullable is False
    assert added.id == 4
203
+
204
def test_add_existing_field_fails(self, base_schema):
    """Adding a field whose name already exists in the schema is rejected."""
    duplicate = Field.of(pa.field("name", pa.string()), field_id=5)

    with pytest.raises(SchemaCompatibilityError) as exc_info:
        SchemaUpdate.of(base_schema).add_field(duplicate).apply()

    assert "already exists" in str(exc_info.value)
213
+
214
def test_add_field_id_overflow_raises_error(self):
    """Auto-assigning a field id past MAX_FIELD_ID_EXCLUSIVE raises.

    The base schema holds a field at MAX_FIELD_ID_EXCLUSIVE - 1; adding a
    new field auto-assigns the next id, which overflows and the update is
    rejected with SchemaCompatibilityError.
    """
    base = Schema.of(
        [
            Field.of(
                pa.field("id_max_minus_one", pa.int64(), nullable=True),
                field_id=MAX_FIELD_ID_EXCLUSIVE - 1,
            ),
        ]
    )

    # The new nullable field is compatibility-wise fine; its id is
    # auto-assigned (no explicit field_id given) and should overflow.
    overflow_field = Field.of(pa.field("overflow", pa.int64(), nullable=True))

    with pytest.raises(SchemaCompatibilityError):
        SchemaUpdate.of(base).add_field(overflow_field).apply()
237
+
238
def test_remove_field_fails_by_default(self, base_schema):
    """Field removal is an incompatible change and is rejected by default."""
    with pytest.raises(SchemaCompatibilityError) as exc_info:
        SchemaUpdate.of(base_schema).remove_field("age").apply()

    message = str(exc_info.value)
    assert "would break compatibility" in message
    assert "allow_incompatible_changes=True" in message
    assert exc_info.value.field_locator == "age"
247
+
248
def test_remove_field_succeeds_with_flag(self, base_schema):
    """allow_incompatible_changes=True permits removing a field."""
    permissive = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
    result_schema = permissive.remove_field("age").apply()

    assert len(result_schema.fields) == 2
    remaining = [f.path[0] for f in result_schema.fields if f.path]
    assert "age" not in remaining
    assert "id" in remaining
    assert "name" in remaining
258
+
259
def test_remove_nonexistent_field_fails(self, base_schema):
    """Removing an unknown field name is rejected."""
    permissive = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
    with pytest.raises(SchemaCompatibilityError) as exc_info:
        permissive.remove_field("nonexistent").apply()

    assert "does not exist" in str(exc_info.value)
    assert exc_info.value.field_locator == "nonexistent"
267
+
268
def test_update_field_compatible_type_widening(self, base_schema):
    """Widening int32 -> int64 is accepted as a compatible type update."""
    result_schema = (
        SchemaUpdate.of(base_schema).update_field_type("age", pa.int64()).apply()
    )

    widened = result_schema.field("age")
    assert widened.arrow.type == pa.int64()
    assert widened.arrow.name == "age"
    assert widened.id == 3
277
+
278
def test_update_field_compatible_nullability_change(self, base_schema):
    """Making a nullable field non-nullable without defaults is rejected.

    NOTE(review): despite the "compatible" name, this exercises the same
    failing path as test_update_field_incompatible_nullability_fails below.
    """
    with pytest.raises(SchemaCompatibilityError) as exc_info:
        SchemaUpdate.of(base_schema).update_field_nullability("name", False).apply()

    message = str(exc_info.value)
    assert "non-nullable without" in message
    assert "past_default and future_default" in message
287
+
288
+ def test_update_field_incompatible_nullability_fails(self, base_schema):
289
+ """Test making nullable field non-nullable fails without defaults."""
290
+ update = SchemaUpdate.of(base_schema)
291
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
292
+ update.update_field_nullability("name", False).apply()
293
+
294
+ assert "non-nullable without" in str(exc_info.value)
295
+ assert "past_default and future_default" in str(exc_info.value)
296
+
297
+ def test_update_field_incompatible_type_fails(self, base_schema):
298
+ """Test updating field with incompatible type change fails."""
299
+ # int32 -> string is incompatible
300
+ update = SchemaUpdate.of(base_schema)
301
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
302
+ update.update_field_type("age", pa.string()).apply()
303
+
304
+ assert "would break compatibility" in str(exc_info.value)
305
+ assert "PyArrow, Pandas, Polars, Ray Data, and Daft" in str(exc_info.value)
306
+
307
+ def test_update_field_incompatible_allowed_with_flag(self, base_schema):
308
+ """Test incompatible field update succeeds with allow_incompatible_changes=True."""
309
+ update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
310
+ result_schema = update.update_field_type("age", pa.string()).apply()
311
+
312
+ updated_age_field = result_schema.field("age")
313
+ assert updated_age_field.arrow.type == pa.string()
314
+ assert updated_age_field.arrow.name == "age"
315
+ assert updated_age_field.id == 3
316
+
317
+ def test_update_nonexistent_field_fails(self, base_schema):
318
+ """Test updating a field that doesn't exist fails."""
319
+ update = SchemaUpdate.of(base_schema)
320
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
321
+ update.update_field_type("nonexistent", pa.string()).apply()
322
+
323
+ assert "does not exist" in str(exc_info.value)
324
+
325
+ def test_method_chaining(self, base_schema):
326
+ """Test that SchemaUpdate methods support fluent chaining."""
327
+ new_field1 = Field.of(pa.field("email", pa.string(), nullable=True), field_id=4)
328
+ new_field2 = Field.of(
329
+ pa.field("score", pa.float64(), nullable=True), field_id=5
330
+ )
331
+
332
+ result_schema = (
333
+ SchemaUpdate.of(base_schema)
334
+ .add_field(new_field1)
335
+ .add_field(new_field2)
336
+ .update_field_type("age", pa.int64())
337
+ .apply()
338
+ )
339
+
340
+ assert len(result_schema.fields) == 5
341
+
342
+ # Verify email field
343
+ email_field = result_schema.field("email")
344
+ assert email_field.arrow.name == "email"
345
+ assert email_field.arrow.type == pa.string()
346
+ assert email_field.id == 4
347
+
348
+ # Verify score field
349
+ score_field = result_schema.field("score")
350
+ assert score_field.arrow.name == "score"
351
+ assert score_field.arrow.type == pa.float64()
352
+ assert score_field.id == 5
353
+
354
+ # Verify updated age field
355
+ age_field = result_schema.field("age")
356
+ assert age_field.arrow.type == pa.int64()
357
+ assert age_field.arrow.name == "age"
358
+ assert age_field.id == 3
359
+
360
+ def test_complex_struct_field_operations(self, complex_schema):
361
+ """Test operations on schemas with complex struct fields."""
362
+ # Add a new nested struct field
363
+ new_struct_field = Field.of(
364
+ pa.field(
365
+ "preferences",
366
+ pa.struct(
367
+ [
368
+ pa.field("theme", pa.string()),
369
+ pa.field("notifications", pa.bool_()),
370
+ ]
371
+ ),
372
+ nullable=True,
373
+ ),
374
+ field_id=5,
375
+ )
376
+
377
+ update = SchemaUpdate.of(complex_schema)
378
+ result_schema = update.add_field(new_struct_field).apply()
379
+
380
+ assert len(result_schema.fields) == 5
381
+ # Verify the struct field was added correctly
382
+ prefs_field = result_schema.field("preferences")
383
+ assert prefs_field.arrow.name == "preferences"
384
+ assert prefs_field.id == 5
385
+ assert pa.types.is_struct(prefs_field.arrow.type)
386
+
387
+ def test_field_locator_types(self, base_schema):
388
+ """Test different types of field locators (string, list, int)."""
389
+ new_field = Field.of(pa.field("test", pa.string(), nullable=True), field_id=4)
390
+
391
+ # Test string locator
392
+ update1 = SchemaUpdate.of(base_schema)
393
+ result1 = update1.add_field(new_field).apply()
394
+ assert len(result1.fields) == 4
395
+
396
+ # Test list locator (nested field path)
397
+ update2 = SchemaUpdate.of(base_schema)
398
+ result2 = update2.add_field(new_field).apply()
399
+ assert len(result2.fields) == 4
400
+
401
+ # Test int locator for updates (using existing field ID)
402
+ update3 = SchemaUpdate.of(base_schema)
403
+ result3 = update3.update_field_type(3, pa.int64()).apply() # Update by field ID
404
+ assert result3.field("age").arrow.type == pa.int64()
405
+
406
+ def test_type_compatibility_validation(self):
407
+ """Test the _is_type_compatible method with various type combinations."""
408
+ base_schema_simple = Schema.of(
409
+ [Field.of(pa.field("test", pa.int32()), field_id=1)]
410
+ )
411
+ update = SchemaUpdate.of(base_schema_simple)
412
+
413
+ # Test numeric widening (compatible)
414
+ assert update._is_type_compatible(pa.int32(), pa.int64())
415
+ assert update._is_type_compatible(pa.float32(), pa.float64())
416
+ assert update._is_type_compatible(pa.int32(), pa.float64())
417
+
418
+ # Test incompatible changes
419
+ assert not update._is_type_compatible(pa.int64(), pa.int32()) # narrowing
420
+ assert not update._is_type_compatible(
421
+ pa.string(), pa.int32()
422
+ ) # different types
423
+ assert not update._is_type_compatible(
424
+ pa.float64(), pa.string()
425
+ ) # different types
426
+
427
+ # Test string/binary compatibility
428
+ assert update._is_type_compatible(pa.string(), pa.string())
429
+ assert update._is_type_compatible(pa.binary(), pa.binary())
430
+
431
+ # Test struct compatibility
432
+ old_struct = pa.struct([pa.field("a", pa.int32())])
433
+ new_struct_compatible = pa.struct(
434
+ [pa.field("a", pa.int32()), pa.field("b", pa.string())]
435
+ )
436
+ new_struct_incompatible = pa.struct(
437
+ [pa.field("b", pa.string())]
438
+ ) # missing field "a"
439
+
440
+ assert update._is_type_compatible(old_struct, new_struct_compatible)
441
+ assert not update._is_type_compatible(old_struct, new_struct_incompatible)
442
+
443
+ # Test list compatibility
444
+ assert update._is_type_compatible(pa.list_(pa.int32()), pa.list_(pa.int64()))
445
+ assert not update._is_type_compatible(
446
+ pa.list_(pa.int64()), pa.list_(pa.int32())
447
+ )
448
+
449
+ def test_error_field_locator_attribute(self, base_schema):
450
+ """Test that SchemaCompatibilityError includes field_locator."""
451
+ new_field = Field.of(
452
+ pa.field("required", pa.string(), nullable=False), field_id=4
453
+ )
454
+
455
+ update = SchemaUpdate.of(base_schema)
456
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
457
+ update.add_field(new_field).apply()
458
+
459
+ assert "Adding non-nullable field" in str(exc_info.value)
460
+
461
+ def test_operations_applied_in_order(self, base_schema):
462
+ """Test that operations are applied in the order they were added."""
463
+ original_field_id = base_schema.field_id("name")
464
+ result_schema = (
465
+ base_schema.update()
466
+ .update_field_doc("name", "first name")
467
+ .update_field_doc("name", "last name")
468
+ .update_field_doc("name", "middle name")
469
+ .update_field_doc("name", "full name")
470
+ .apply()
471
+ )
472
+ # Verify that the result reflects the last rename operation
473
+ actual_field = result_schema.field(original_field_id)
474
+ assert actual_field.arrow.name == "name"
475
+ assert actual_field.doc == "full name"
476
+ assert actual_field.arrow.type == pa.string()
477
+ assert actual_field.id == original_field_id
478
+
479
+ def test_duplicate_field_id_validation_add_field(self, base_schema):
480
+ """Test that adding a field with duplicate field ID succeeds because IDs are auto-assigned.
481
+
482
+ Note: This test behavior changed after implementing automatic field ID assignment.
483
+ User-specified field IDs are now ignored to prevent conflicts.
484
+ """
485
+ # Try to add a field with ID 2, which already exists for 'name' field
486
+ # This should succeed because the user-specified ID will be ignored
487
+ duplicate_id_field = Field.of(
488
+ pa.field("new_field", pa.string(), nullable=True), field_id=2
489
+ )
490
+
491
+ update = SchemaUpdate.of(base_schema)
492
+ result_schema = update.add_field(duplicate_id_field).apply()
493
+
494
+ # Field should be added with auto-assigned ID (4), not the conflicting ID (2)
495
+ new_field = result_schema.field("new_field")
496
+ assert new_field.id == 4 # Auto-assigned, not 2
497
+ assert new_field.arrow.name == "new_field"
498
+
499
+ # Original field with ID 2 should be unchanged
500
+ assert result_schema.field("name").id == 2
501
+
502
+ def test_duplicate_field_id_validation_update_field(self, base_schema):
503
+ """Test that updating a field to use duplicate field ID fails."""
504
+ # Try to update 'age' field to use ID 1, which already exists for 'id' field
505
+ updated_field = Field.of(pa.field("age", pa.int32(), nullable=True), field_id=1)
506
+
507
+ update = SchemaUpdate.of(base_schema)
508
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
509
+ update._update_field("age", updated_field).apply()
510
+
511
+ assert "duplicate field ID 1" in str(exc_info.value)
512
+ assert exc_info.value.field_locator == "age"
513
+
514
+ def test_cannot_remove_all_fields(self, base_schema):
515
+ """Test that removing all fields fails."""
516
+
517
+ update = SchemaUpdate.of(base_schema, True)
518
+ with pytest.raises(ValueError) as exc_info:
519
+ update.remove_field("name").remove_field("age").remove_field("id").apply()
520
+
521
+ assert "Schema must contain at least one field." in str(exc_info.value)
522
+
523
+ def test_cannot_remove_merge_key_field(self, protected_fields_schema):
524
+ """Test that removing merge key fields is forbidden."""
525
+ update = SchemaUpdate.of(protected_fields_schema)
526
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
527
+ update.remove_field("id").apply()
528
+
529
+ assert "Cannot remove merge key field" in str(exc_info.value)
530
+ assert "critical for data integrity" in str(exc_info.value)
531
+
532
+ def test_cannot_remove_event_time_field(self, protected_fields_schema):
533
+ """Test that removing event time fields is forbidden."""
534
+ update = SchemaUpdate.of(protected_fields_schema)
535
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
536
+ update.remove_field("timestamp").apply()
537
+
538
+ assert "Cannot remove event time field" in str(exc_info.value)
539
+ assert "critical for temporal operations" in str(exc_info.value)
540
+
541
+ def test_cannot_remove_merge_order_field(self, protected_fields_schema):
542
+ """Test that removing merge order fields is forbidden."""
543
+ update = SchemaUpdate.of(protected_fields_schema)
544
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
545
+ update.remove_field("priority").apply()
546
+
547
+ assert "Cannot remove merge order field" in str(exc_info.value)
548
+ assert "critical for data ordering" in str(exc_info.value)
549
+
550
+ def test_cannot_change_merge_key_status(self, protected_fields_schema):
551
+ """Test that changing merge key status is forbidden."""
552
+ # Try to make merge key field not a merge key
553
+ updated_field = Field.of(
554
+ pa.field("id", pa.int64(), nullable=False), field_id=1, is_merge_key=False
555
+ )
556
+
557
+ update = SchemaUpdate.of(protected_fields_schema)
558
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
559
+ update._update_field("id", updated_field).apply()
560
+
561
+ assert "Cannot change merge key status" in str(exc_info.value)
562
+ assert "critical for data integrity" in str(exc_info.value)
563
+
564
+ def test_cannot_change_event_time_status(self, protected_fields_schema):
565
+ """Test that changing event time status is forbidden."""
566
+ # Try to make event time field not an event time field
567
+ updated_field = Field.of(
568
+ pa.field("timestamp", pa.timestamp("us"), nullable=False),
569
+ field_id=2,
570
+ is_event_time=False,
571
+ )
572
+
573
+ update = SchemaUpdate.of(protected_fields_schema)
574
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
575
+ update._update_field("timestamp", updated_field).apply()
576
+
577
+ assert "Cannot change event time status" in str(exc_info.value)
578
+ assert "critical for temporal operations" in str(exc_info.value)
579
+
580
+ def test_cannot_change_merge_order(self, protected_fields_schema):
581
+ """Test that changing merge order is forbidden."""
582
+ # Try to change merge order from ASCENDING to DESCENDING
583
+ updated_field = Field.of(
584
+ pa.field("priority", pa.int32(), nullable=True),
585
+ field_id=3,
586
+ merge_order=MergeOrder.of(SortOrder.DESCENDING),
587
+ )
588
+
589
+ update = SchemaUpdate.of(protected_fields_schema)
590
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
591
+ update._update_field("priority", updated_field).apply()
592
+
593
+ assert "Cannot change merge order" in str(exc_info.value)
594
+ assert "critical for data consistency" in str(exc_info.value)
595
+
596
+ def test_cannot_change_past_default(self, protected_fields_schema):
597
+ """Test that changing past_default is forbidden."""
598
+ # Try to change past_default from "default" to "new_default"
599
+ updated_field = Field.of(
600
+ pa.field("data", pa.string(), nullable=True),
601
+ field_id=4,
602
+ past_default="new_default",
603
+ consistency_type=SchemaConsistencyType.COERCE,
604
+ )
605
+
606
+ update = SchemaUpdate.of(protected_fields_schema)
607
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
608
+ update._update_field("data", updated_field).apply()
609
+
610
+ assert "Cannot change past_default" in str(exc_info.value)
611
+ assert "immutable once set" in str(exc_info.value)
612
+
613
+ def test_consistency_type_evolution_coerce_to_validate(self, base_schema):
614
+ """Test allowed transition from COERCE to VALIDATE."""
615
+ # First add a field with COERCE consistency type
616
+ coerce_field = Field.of(
617
+ pa.field("test", pa.string(), nullable=True),
618
+ field_id=4,
619
+ consistency_type=SchemaConsistencyType.COERCE,
620
+ )
621
+ schema_with_coerce = (
622
+ SchemaUpdate.of(base_schema).add_field(coerce_field).apply()
623
+ )
624
+
625
+ # Now update it to VALIDATE - this should be allowed
626
+ update = SchemaUpdate.of(schema_with_coerce)
627
+ result_schema = update.update_field_consistency_type(
628
+ "test", SchemaConsistencyType.VALIDATE
629
+ ).apply()
630
+
631
+ updated_field = result_schema.field("test")
632
+ assert updated_field.consistency_type == SchemaConsistencyType.VALIDATE
633
+
634
+ def test_consistency_type_evolution_validate_to_coerce(self, base_schema):
635
+ """Test allowed transition from VALIDATE to COERCE."""
636
+ # First add a field with VALIDATE consistency type
637
+ validate_field = Field.of(
638
+ pa.field("test", pa.string(), nullable=True),
639
+ field_id=4,
640
+ consistency_type=SchemaConsistencyType.VALIDATE,
641
+ )
642
+ schema_with_validate = (
643
+ SchemaUpdate.of(base_schema).add_field(validate_field).apply()
644
+ )
645
+
646
+ # Now update it to COERCE - this should be allowed
647
+ update = SchemaUpdate.of(schema_with_validate)
648
+ result_schema = update.update_field_consistency_type(
649
+ "test", SchemaConsistencyType.COERCE
650
+ ).apply()
651
+
652
+ updated_field = result_schema.field("test")
653
+ assert updated_field.consistency_type == SchemaConsistencyType.COERCE
654
+
655
+ def test_consistency_type_evolution_to_none_allowed(self, base_schema):
656
+ """Test allowed transition from COERCE/VALIDATE to NONE."""
657
+ # First add a field with COERCE consistency type
658
+ coerce_field = Field.of(
659
+ pa.field("test", pa.string(), nullable=True),
660
+ field_id=4,
661
+ consistency_type=SchemaConsistencyType.COERCE,
662
+ )
663
+ schema_with_coerce = (
664
+ SchemaUpdate.of(base_schema).add_field(coerce_field).apply()
665
+ )
666
+
667
+ # Now update it to NONE - this should be allowed (relaxing constraints)
668
+ update = SchemaUpdate.of(schema_with_coerce)
669
+ result_schema = update.update_field_consistency_type(
670
+ "test", SchemaConsistencyType.NONE
671
+ ).apply()
672
+
673
+ updated_field = result_schema.field("test")
674
+ assert updated_field.consistency_type == SchemaConsistencyType.NONE
675
+
676
+ def test_consistency_type_evolution_none_to_coerce_forbidden(self, base_schema):
677
+ """Test forbidden transition from NONE to COERCE."""
678
+ # First add a field with NONE consistency type
679
+ none_field = Field.of(
680
+ pa.field("test", pa.string(), nullable=True),
681
+ field_id=4,
682
+ consistency_type=SchemaConsistencyType.NONE,
683
+ )
684
+ schema_with_none = SchemaUpdate.of(base_schema).add_field(none_field).apply()
685
+
686
+ # Now try to update it to COERCE - this should fail
687
+ update = SchemaUpdate.of(schema_with_none)
688
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
689
+ update.update_field_consistency_type(
690
+ "test", SchemaConsistencyType.COERCE
691
+ ).apply()
692
+
693
+ assert "Cannot change consistency type" in str(exc_info.value)
694
+ assert "from none to coerce" in str(exc_info.value)
695
+ assert "tighten validation constraints" in str(exc_info.value)
696
+
697
+ def test_consistency_type_evolution_none_to_validate_forbidden(self, base_schema):
698
+ """Test forbidden transition from NONE to VALIDATE."""
699
+ # First add a field with NONE consistency type
700
+ none_field = Field.of(
701
+ pa.field("test", pa.string(), nullable=True),
702
+ field_id=4,
703
+ consistency_type=SchemaConsistencyType.NONE,
704
+ )
705
+ schema_with_none = SchemaUpdate.of(base_schema).add_field(none_field).apply()
706
+
707
+ # Now try to update it to VALIDATE - this should fail
708
+ update = SchemaUpdate.of(schema_with_none)
709
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
710
+ update.update_field_consistency_type(
711
+ "test", SchemaConsistencyType.VALIDATE
712
+ ).apply()
713
+
714
+ assert "Cannot change consistency type" in str(exc_info.value)
715
+ assert "from none to validate" in str(exc_info.value)
716
+ assert "tighten validation constraints" in str(exc_info.value)
717
+
718
+ def test_protected_fields_allowed_with_incompatible_flag(
719
+ self, protected_fields_schema
720
+ ):
721
+ """Test that protected field changes are allowed with allow_incompatible_changes=True."""
722
+ # Should be able to remove merge key field with the flag
723
+ update = SchemaUpdate.of(
724
+ protected_fields_schema, allow_incompatible_changes=True
725
+ )
726
+ result_schema = update.remove_field("id").apply()
727
+
728
+ assert len(result_schema.fields) == 3
729
+ field_names = [f.path[0] for f in result_schema.fields if f.path]
730
+ assert "id" not in field_names
731
+
732
+ def test_duplicate_field_id_still_forbidden_with_flag(self, base_schema):
733
+ """Test that duplicate field IDs are prevented through auto-assignment even with allow_incompatible_changes=True.
734
+
735
+ Note: This test behavior changed after implementing automatic field ID assignment.
736
+ Duplicate field ID conflicts can no longer occur because IDs are auto-assigned.
737
+ """
738
+
739
+ test_schema = Schema.of(
740
+ [
741
+ Field.of(
742
+ pa.field("foo", pa.int64(), nullable=False),
743
+ field_id=1,
744
+ is_merge_key=True,
745
+ ),
746
+ Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
747
+ Field.of(pa.field("age", pa.int32(), nullable=True), field_id=3),
748
+ ]
749
+ )
750
+ # User specifies conflicting field ID, but it will be ignored and auto-assigned
751
+ duplicate_id_field = Field.of(
752
+ pa.field("new_field", pa.string(), nullable=True), field_id=1
753
+ )
754
+
755
+ update = SchemaUpdate.of(test_schema, allow_incompatible_changes=True)
756
+ result_schema = update.add_field(duplicate_id_field).apply()
757
+
758
+ # Field should be added with auto-assigned ID (4), not the conflicting ID (1)
759
+ new_field = result_schema.field("new_field")
760
+ assert new_field.id == 4 # Auto-assigned, not 1
761
+ assert new_field.arrow.name == "new_field"
762
+
763
+ # Original field with ID 1 should be unchanged
764
+ assert result_schema.field("foo").id == 1
765
+
766
+ # Verify no duplicate field IDs in final schema
767
+ field_ids = [field.id for field in result_schema.fields]
768
+ assert len(field_ids) == len(
769
+ set(field_ids)
770
+ ), f"Duplicate field IDs found: {field_ids}"
771
+
772
+ def test_rename_field_success(self, base_schema):
773
+ """Test successfully renaming a field."""
774
+ update = SchemaUpdate.of(base_schema)
775
+ result_schema = update.rename_field("name", "full_name").apply()
776
+
777
+ # Original field should be gone
778
+ field_names = [f.path[0] for f in result_schema.fields if f.path]
779
+ assert "name" not in field_names
780
+ assert "full_name" in field_names
781
+
782
+ # Renamed field should have same properties except name
783
+ original_field = base_schema.field("name")
784
+ renamed_field = result_schema.field("full_name")
785
+
786
+ assert renamed_field.arrow.name == "full_name"
787
+ assert renamed_field.arrow.type == original_field.arrow.type
788
+ assert renamed_field.arrow.nullable == original_field.arrow.nullable
789
+ assert renamed_field.id == original_field.id
790
+ assert renamed_field.doc == original_field.doc
791
+ assert renamed_field.consistency_type == original_field.consistency_type
792
+
793
+ def test_rename_field_nonexistent_field_fails(self, base_schema):
794
+ """Test renaming a field that doesn't exist fails."""
795
+ update = SchemaUpdate.of(base_schema)
796
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
797
+ update.rename_field("nonexistent", "new_name").apply()
798
+
799
+ assert "does not exist" in str(exc_info.value)
800
+ assert exc_info.value.field_locator == "nonexistent"
801
+
802
+ def test_update_field_type_success(self, base_schema):
803
+ """Test successfully updating field type."""
804
+ update = SchemaUpdate.of(base_schema)
805
+ result_schema = update.update_field_type("age", pa.int64()).apply()
806
+
807
+ # Field should have new type but same other properties
808
+ original_field = base_schema.field("age")
809
+ updated_field = result_schema.field("age")
810
+
811
+ assert updated_field.arrow.type == pa.int64()
812
+ assert updated_field.arrow.name == original_field.arrow.name
813
+ assert updated_field.arrow.nullable == original_field.arrow.nullable
814
+ assert updated_field.id == original_field.id
815
+ assert updated_field.doc == original_field.doc
816
+ assert updated_field.consistency_type == original_field.consistency_type
817
+
818
+ def test_update_field_type_incompatible_fails(self, base_schema):
819
+ """Test updating field type with incompatible type fails."""
820
+ update = SchemaUpdate.of(base_schema)
821
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
822
+ update.update_field_type("age", pa.string()).apply()
823
+
824
+ assert "would break compatibility" in str(exc_info.value)
825
+
826
+ def test_update_field_type_incompatible_allowed_with_flag(self, base_schema):
827
+ """Test incompatible type update succeeds with allow_incompatible_changes=True."""
828
+ update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
829
+ result_schema = update.update_field_type("age", pa.string()).apply()
830
+
831
+ updated_field = result_schema.field("age")
832
+ assert updated_field.arrow.type == pa.string()
833
+
834
+ def test_update_field_type_nonexistent_field_fails(self, base_schema):
835
+ """Test updating type of a field that doesn't exist fails."""
836
+ update = SchemaUpdate.of(base_schema)
837
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
838
+ update.update_field_type("nonexistent", pa.string()).apply()
839
+
840
+ assert "does not exist" in str(exc_info.value)
841
+ assert exc_info.value.field_locator == "nonexistent"
842
+
843
+ def test_update_field_doc_success(self, base_schema):
844
+ """Test successfully updating field documentation."""
845
+ update = SchemaUpdate.of(base_schema)
846
+ result_schema = update.update_field_doc(
847
+ "name", "Full name of the person"
848
+ ).apply()
849
+
850
+ # Field should have new doc but same other properties
851
+ original_field = base_schema.field("name")
852
+ updated_field = result_schema.field("name")
853
+
854
+ assert updated_field.doc == "Full name of the person"
855
+ assert updated_field.arrow.name == original_field.arrow.name
856
+ assert updated_field.arrow.type == original_field.arrow.type
857
+ assert updated_field.arrow.nullable == original_field.arrow.nullable
858
+ assert updated_field.id == original_field.id
859
+ assert updated_field.consistency_type == original_field.consistency_type
860
+
861
+ def test_update_field_doc_to_none(self, base_schema):
862
+ """Test updating field documentation to None."""
863
+ # First set some doc
864
+ schema_with_doc = (
865
+ SchemaUpdate.of(base_schema)
866
+ .update_field_doc("name", "Original doc")
867
+ .apply()
868
+ )
869
+
870
+ # Then update to None
871
+ update = SchemaUpdate.of(schema_with_doc)
872
+ result_schema = update.update_field_doc("name", None).apply()
873
+
874
+ updated_field = result_schema.field("name")
875
+ assert updated_field.doc is None
876
+
877
+ def test_update_field_doc_nonexistent_field_fails(self, base_schema):
878
+ """Test updating doc of a field that doesn't exist fails."""
879
+ update = SchemaUpdate.of(base_schema)
880
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
881
+ update.update_field_doc("nonexistent", "Some doc").apply()
882
+
883
+ assert "does not exist" in str(exc_info.value)
884
+ assert exc_info.value.field_locator == "nonexistent"
885
+
886
+ def test_update_field_nullability_success(self, base_schema):
887
+ """Test successfully updating field nullability."""
888
+ # Make nullable field non-nullable (should succeed with allow_incompatible_changes)
889
+ update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
890
+ result_schema = update.update_field_nullability("name", False).apply()
891
+
892
+ # Field should have new nullability but same other properties
893
+ original_field = base_schema.field("name")
894
+ updated_field = result_schema.field("name")
895
+
896
+ assert updated_field.arrow.nullable is False
897
+ assert updated_field.arrow.name == original_field.arrow.name
898
+ assert updated_field.arrow.type == original_field.arrow.type
899
+ assert updated_field.id == original_field.id
900
+ assert updated_field.doc == original_field.doc
901
+ assert updated_field.consistency_type == original_field.consistency_type
902
+
903
+ def test_update_field_nullability_incompatible_fails(self, base_schema):
904
+ """Test making nullable field non-nullable fails without flag."""
905
+ update = SchemaUpdate.of(base_schema)
906
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
907
+ update.update_field_nullability("name", False).apply()
908
+
909
+ assert "non-nullable without" in str(exc_info.value)
910
+
911
+ def test_update_field_nullability_nonexistent_field_fails(self, base_schema):
912
+ """Test updating nullability of a field that doesn't exist fails."""
913
+ update = SchemaUpdate.of(base_schema)
914
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
915
+ update.update_field_nullability("nonexistent", True).apply()
916
+
917
+ assert "does not exist" in str(exc_info.value)
918
+ assert exc_info.value.field_locator == "nonexistent"
919
+
920
+ def test_update_field_consistency_type_success(self, base_schema):
921
+ """Test successfully updating field consistency type."""
922
+ update = SchemaUpdate.of(base_schema)
923
+ result_schema = update.update_field_consistency_type(
924
+ "name", SchemaConsistencyType.VALIDATE
925
+ ).apply()
926
+
927
+ # Field should have new consistency type but same other properties
928
+ original_field = base_schema.field("name")
929
+ updated_field = result_schema.field("name")
930
+
931
+ assert updated_field.consistency_type == SchemaConsistencyType.VALIDATE
932
+ assert updated_field.arrow.name == original_field.arrow.name
933
+ assert updated_field.arrow.type == original_field.arrow.type
934
+ assert updated_field.arrow.nullable == original_field.arrow.nullable
935
+ assert updated_field.id == original_field.id
936
+ assert updated_field.doc == original_field.doc
937
+
938
+ def test_update_field_consistency_type_to_none(self, base_schema):
939
+ """Test updating field consistency type to None."""
940
+ # First set some consistency type
941
+ schema_with_coerce = (
942
+ SchemaUpdate.of(base_schema)
943
+ .update_field_consistency_type("name", SchemaConsistencyType.COERCE)
944
+ .apply()
945
+ )
946
+
947
+ # Then update to None
948
+ update = SchemaUpdate.of(schema_with_coerce)
949
+ result_schema = update.update_field_consistency_type("name", None).apply()
950
+
951
+ updated_field = result_schema.field("name")
952
+ assert updated_field.consistency_type is None
953
+
954
+ def test_update_field_consistency_type_nonexistent_field_fails(self, base_schema):
955
+ """Test updating consistency type of a field that doesn't exist fails."""
956
+ update = SchemaUpdate.of(base_schema)
957
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
958
+ update.update_field_consistency_type(
959
+ "nonexistent", SchemaConsistencyType.VALIDATE
960
+ ).apply()
961
+
962
+ assert "does not exist" in str(exc_info.value)
963
+ assert exc_info.value.field_locator == "nonexistent"
964
+
965
+ def test_update_field_future_default_success(self, base_schema):
966
+ """Test successfully updating field future default."""
967
+ update = SchemaUpdate.of(base_schema)
968
+ result_schema = update.update_field_future_default("name", "Unknown").apply()
969
+
970
+ # Field should have new future default but same other properties
971
+ original_field = base_schema.field("name")
972
+ updated_field = result_schema.field("name")
973
+
974
+ assert updated_field.future_default == "Unknown"
975
+ assert updated_field.arrow.name == original_field.arrow.name
976
+ assert updated_field.arrow.type == original_field.arrow.type
977
+ assert updated_field.arrow.nullable == original_field.arrow.nullable
978
+ assert updated_field.id == original_field.id
979
+ assert updated_field.doc == original_field.doc
980
+ assert updated_field.consistency_type == original_field.consistency_type
981
+ assert updated_field.past_default == original_field.past_default
982
+
983
+ def test_update_field_future_default_to_none(self, base_schema):
984
+ """Test updating field future default to None."""
985
+ # First set some future default
986
+ schema_with_default = (
987
+ SchemaUpdate.of(base_schema)
988
+ .update_field_future_default("name", "Default Name")
989
+ .apply()
990
+ )
991
+
992
+ # Then update to None
993
+ update = SchemaUpdate.of(schema_with_default)
994
+ result_schema = update.update_field_future_default("name", None).apply()
995
+
996
+ updated_field = result_schema.field("name")
997
+ assert updated_field.future_default is None
998
+
999
+ def test_update_field_future_default_invalid_type_fails(self, base_schema):
1000
+ """Test updating field future default with incompatible type fails."""
1001
+ update = SchemaUpdate.of(base_schema)
1002
+ with pytest.raises(ValueError) as exc_info:
1003
+ update.update_field_future_default(
1004
+ "name", 123
1005
+ ).apply() # int for string field
1006
+
1007
+ assert "not compatible with type" in str(exc_info.value)
1008
+
1009
+ def test_update_field_future_default_nonexistent_field_fails(self, base_schema):
1010
+ """Test updating future default of a field that doesn't exist fails."""
1011
+ update = SchemaUpdate.of(base_schema)
1012
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
1013
+ update.update_field_future_default("nonexistent", "value").apply()
1014
+
1015
+ assert "does not exist" in str(exc_info.value)
1016
+ assert exc_info.value.field_locator == "nonexistent"
1017
+
1018
+ def test_method_chaining_with_metadata_preservation(self, base_schema):
1019
+ """Test that chaining operations on the same field preserves metadata correctly."""
1020
+ result_schema = (
1021
+ SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
1022
+ .update_field_type("age", pa.int64())
1023
+ .update_field_consistency_type("age", SchemaConsistencyType.VALIDATE)
1024
+ .update_field_future_default("age", 0)
1025
+ .apply()
1026
+ )
1027
+
1028
+ age_field = result_schema.field("age")
1029
+ assert age_field.arrow.type == pa.int64()
1030
+ assert age_field.consistency_type == SchemaConsistencyType.VALIDATE
1031
+ assert age_field.future_default == 0
1032
+
1033
    def test_individual_methods_work_correctly(self, base_schema):
        """Test that each method works correctly on its own."""
        # Each result below is built from the same unmodified base_schema fixture,
        # so the four checks are independent of one another.

        # Test doc update
        result1 = (
            SchemaUpdate.of(base_schema).update_field_doc("name", "Full name").apply()
        )
        assert result1.field("name").doc == "Full name"

        # Test nullability update (tightening nullability is an incompatible change)
        result2 = (
            SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
            .update_field_nullability("name", False)
            .apply()
        )
        assert result2.field("name").arrow.nullable is False

        # Test rename
        result3 = SchemaUpdate.of(base_schema).rename_field("name", "full_name").apply()
        assert result3.field("full_name").arrow.name == "full_name"

        # Test multiple independent operations
        result4 = (
            SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
            .update_field_type("age", pa.int64())
            .update_field_consistency_type("name", SchemaConsistencyType.VALIDATE)
            .apply()
        )

        assert result4.field("age").arrow.type == pa.int64()
        assert result4.field("name").consistency_type == SchemaConsistencyType.VALIDATE
1063
+
1064
+ def test_method_chaining_different_fields(self, base_schema):
1065
+ """Test chaining operations on different fields."""
1066
+ result_schema = (
1067
+ SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
1068
+ .update_field_type("age", pa.int64())
1069
+ .update_field_doc("name", "Updated name")
1070
+ .update_field_consistency_type("id", SchemaConsistencyType.VALIDATE)
1071
+ .apply()
1072
+ )
1073
+
1074
+ age_field = result_schema.field("age")
1075
+ assert age_field.arrow.type == pa.int64()
1076
+
1077
+ name_field = result_schema.field("name")
1078
+ assert name_field.doc == "Updated name"
1079
+
1080
+ id_field = result_schema.field("id")
1081
+ assert id_field.consistency_type == SchemaConsistencyType.VALIDATE
1082
+
1083
    def test_user_friendly_methods_vs_protected_update_field(self, base_schema):
        """Test that user-friendly methods produce same results as protected _update_field."""
        # Using update_field_type should be equivalent to using _update_field with manually constructed field
        update1 = SchemaUpdate.of(base_schema)
        result1 = update1.update_field_type("age", pa.int64()).apply()

        # Manually construct the updated field, copying every Field.of() attribute
        # from the original so only the arrow type differs.
        original_field = base_schema.field("age")
        new_arrow_field = pa.field(
            original_field.arrow.name,
            pa.int64(),
            nullable=original_field.arrow.nullable,
            metadata=original_field.arrow.metadata,
        )
        updated_field = Field.of(
            new_arrow_field,
            field_id=original_field.id,
            is_merge_key=original_field.is_merge_key,
            merge_order=original_field.merge_order,
            is_event_time=original_field.is_event_time,
            doc=original_field.doc,
            past_default=original_field.past_default,
            future_default=original_field.future_default,
            consistency_type=original_field.consistency_type,
            native_object=original_field.native_object,
        )

        update2 = SchemaUpdate.of(base_schema)
        result2 = update2._update_field("age", updated_field).apply()

        # Results should be equivalent: the convenience method is a thin wrapper
        # over the protected primitive.
        field1 = result1.field("age")
        field2 = result2.field("age")

        assert field1.arrow.type == field2.arrow.type
        assert field1.arrow.name == field2.arrow.name
        assert field1.arrow.nullable == field2.arrow.nullable
        assert field1.id == field2.id
        assert field1.doc == field2.doc
        assert field1.consistency_type == field2.consistency_type
1123
+
1124
    def test_schema_update_convenience_method(self, base_schema):
        """Test Schema.update() convenience method."""
        # Start with a schema that has a field we can modify; "score" is added
        # via update() so it receives the next auto-assigned field ID (4).
        base_schema = (
            base_schema.update()
            .add_field(Field.of(pa.field("score", pa.int32(), nullable=True)))
            .apply()
        )

        # Test update_field_type (compatible type widening)
        result1 = base_schema.update().update_field_type("score", pa.int64()).apply()
        score_field = result1.field("score")
        assert score_field.arrow.type == pa.int64()
        assert score_field.arrow.name == "score"
        assert score_field.id == 4  # Auto-assigned, not 10

        # Test update_field_doc
        result2 = base_schema.update().update_field_doc("score", "User score").apply()
        score_field = result2.field("score")
        assert score_field.doc == "User score"
        assert score_field.arrow.type == pa.int32()  # Type unchanged

        # Test update_field_consistency_type
        result3 = (
            base_schema.update()
            .update_field_consistency_type("score", SchemaConsistencyType.VALIDATE)
            .apply()
        )
        score_field = result3.field("score")
        assert score_field.consistency_type == SchemaConsistencyType.VALIDATE
        assert score_field.arrow.type == pa.int32()  # Type unchanged

        # Test update_field_future_default
        result4 = base_schema.update().update_field_future_default("score", 100).apply()
        score_field = result4.field("score")
        assert score_field.future_default == 100
        assert score_field.arrow.type == pa.int32()  # Type unchanged

        # Test rename_field
        result5 = base_schema.update().rename_field("score", "user_score").apply()
        field_names = [f.path[0] for f in result5.fields if f.path]
        assert "score" not in field_names
        assert "user_score" in field_names
        renamed_field = result5.field("user_score")
        assert renamed_field.arrow.name == "user_score"
        assert renamed_field.arrow.type == pa.int32()
        assert renamed_field.id == 4  # Retains original auto-assigned field ID

        # Test method chaining
        result6 = (
            base_schema.update()
            .update_field_type("score", pa.int64())
            .update_field_doc("score", "User score in points")
            .update_field_consistency_type("score", SchemaConsistencyType.COERCE)
            .update_field_future_default("score", 0)
            .apply()
        )

        final_score_field = result6.field("score")
        assert final_score_field.arrow.type == pa.int64()
        assert final_score_field.doc == "User score in points"
        assert final_score_field.consistency_type == SchemaConsistencyType.COERCE
        assert final_score_field.future_default == 0
1187
+
1188
    def test_add_multiple_fields_unique_field_ids(self, base_schema):
        """Test adding multiple fields in one SchemaUpdate gets unique, incremental field IDs."""
        # Create multiple new fields with different types to add simultaneously
        # Note: Field IDs specified here will be ignored and auto-assigned
        new_field1 = Field.of(
            pa.field("email", pa.string(), nullable=True),
            field_id=999,  # Will be ignored, auto-assigned to 4
        )
        new_field2 = Field.of(
            pa.field("score", pa.float64(), nullable=True),
            field_id=888,  # Will be ignored, auto-assigned to 5
        )
        new_field3 = Field.of(
            pa.field("active", pa.bool_(), nullable=False),
            field_id=777,  # Will be ignored, auto-assigned to 6
            past_default=True,
        )
        new_field4 = Field.of(
            pa.field("created_at", pa.timestamp("us"), nullable=True),
            field_id=666,  # Will be ignored, auto-assigned to 7
        )

        # Add all fields in a single SchemaUpdate operation
        update = SchemaUpdate.of(base_schema)
        result_schema = (
            update.add_field(new_field1)
            .add_field(new_field2)
            .add_field(new_field3)
            .add_field(new_field4)
            .apply()
        )

        # Verify all fields were added successfully
        assert len(result_schema.fields) == 7  # 3 original + 4 new

        # Verify each field has the expected unique field ID, assigned in the
        # order the add_field() calls were chained.
        email_field = result_schema.field("email")
        assert email_field.id == 4
        assert email_field.arrow.name == "email"
        assert email_field.arrow.type == pa.string()

        score_field = result_schema.field("score")
        assert score_field.id == 5
        assert score_field.arrow.name == "score"
        assert score_field.arrow.type == pa.float64()

        active_field = result_schema.field("active")
        assert active_field.id == 6
        assert active_field.arrow.name == "active"
        assert active_field.arrow.type == pa.bool_()
        assert active_field.past_default is True

        created_at_field = result_schema.field("created_at")
        assert created_at_field.id == 7
        assert created_at_field.arrow.name == "created_at"
        assert pa.types.is_timestamp(created_at_field.arrow.type)

        # Verify original fields are preserved
        assert result_schema.field("id").id == 1
        assert result_schema.field("name").id == 2
        assert result_schema.field("age").id == 3

        # Verify no duplicate field IDs exist
        field_ids = [field.id for field in result_schema.fields]
        assert len(field_ids) == len(
            set(field_ids)
        ), f"Duplicate field IDs found: {field_ids}"

        # Verify field IDs are sequential starting from max_field_id + 1
        expected_ids = [1, 2, 3, 4, 5, 6, 7]
        assert sorted(field_ids) == expected_ids
1259
+
1260
+ def test_conflicting_operations_add_then_remove_same_field(self, base_schema):
1261
+ """Test conflicting operations: adding a field then removing the same field should raise ValueError."""
1262
+ new_field = Field.of(pa.field("temp", pa.string(), nullable=True), field_id=4)
1263
+
1264
+ # Add field then remove the same field - should raise ValueError for conflicting operations
1265
+ update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
1266
+ with pytest.raises(ValueError) as exc_info:
1267
+ (
1268
+ update.add_field(new_field)
1269
+ .remove_field("temp") # Conflicts with add operation
1270
+ .apply()
1271
+ )
1272
+
1273
+ assert "Conflicting operations detected on field 'temp'" in str(exc_info.value)
1274
+
1275
+ def test_conflicting_operations_remove_then_add_same_field(self, base_schema):
1276
+ """Test conflicting operations: removing a field then adding it back should raise ValueError."""
1277
+ # Remove existing field then add it back - should raise ValueError for conflicting operations
1278
+ replacement_field = Field.of(
1279
+ pa.field("age", pa.int32(), nullable=True), field_id=3
1280
+ )
1281
+
1282
+ update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
1283
+ with pytest.raises(ValueError) as exc_info:
1284
+ (
1285
+ update.remove_field("age") # Remove existing field
1286
+ .add_field(replacement_field) # Conflicts with remove operation
1287
+ .apply()
1288
+ )
1289
+
1290
+ assert "Conflicting operations detected on field 'age'" in str(exc_info.value)
1291
+
1292
+ def test_conflicting_operations_update_then_remove_same_field(self, base_schema):
1293
+ """Test conflicting operations: updating a field then removing it should raise ValueError."""
1294
+ update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
1295
+ with pytest.raises(ValueError) as exc_info:
1296
+ (
1297
+ update.update_field_type("age", pa.int64()) # Update field type
1298
+ .remove_field("age") # Conflicts with update operation
1299
+ .apply()
1300
+ )
1301
+
1302
+ assert "Conflicting operations detected on field 'age'" in str(exc_info.value)
1303
+
1304
    def test_conflicting_operations_remove_then_update_same_field_fails(
        self, base_schema
    ):
        """Test conflicting operations: removing a field then trying to update it fails during method chaining.

        Note: This fails with AttributeError during update_field_type() call because _get_existing_field
        returns None for the removed field. This happens before our conflict validation in apply().
        The conflict detection catches most cases, but this specific order triggers the old behavior.
        """
        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)

        # Remove field first, then try to update it - fails during update_field_type() call,
        # i.e. eagerly at chain-build time rather than at apply().
        with pytest.raises(AttributeError) as exc_info:
            (
                update.remove_field("age").update_field_type(  # Remove field
                    "age", pa.int64()
                )  # Fails here due to _get_existing_field returning None
            )

        # The AttributeError comes from dereferencing `.arrow` on None.
        assert "NoneType" in str(exc_info.value)
        assert "arrow" in str(exc_info.value)
1325
+
1326
+ def test_multiple_updates_same_field_allowed(self, base_schema):
1327
+ """Test multiple updates to the same field are allowed and applied cumulatively."""
1328
+ update = SchemaUpdate.of(base_schema)
1329
+ result_schema = (
1330
+ update.update_field_type("age", pa.int64()) # First update
1331
+ .update_field_doc("age", "Age in years") # Second update - should work
1332
+ .update_field_consistency_type(
1333
+ "age", SchemaConsistencyType.VALIDATE
1334
+ ) # Third update
1335
+ .apply()
1336
+ )
1337
+
1338
+ # All updates should be applied cumulatively
1339
+ updated_field = result_schema.field("age")
1340
+ assert updated_field.arrow.type == pa.int64()
1341
+ assert updated_field.doc == "Age in years"
1342
+ assert updated_field.consistency_type == SchemaConsistencyType.VALIDATE
1343
+ assert updated_field.id == 3 # ID should remain same
1344
+
1345
    def test_multiple_updates_only_are_allowed_explicitly(self, base_schema):
        """Test that ONLY multiple update operations on same field are allowed - this demonstrates the refined logic."""
        update = SchemaUpdate.of(base_schema)

        # Multiple update operations should work (contrast with the conflict tests
        # above, where mixing add/remove with other ops on one field raises).
        result_schema = (
            update.update_field_type("age", pa.int64())
            .update_field_doc("age", "Updated age field")
            .update_field_consistency_type("age", SchemaConsistencyType.COERCE)
            .update_field_future_default("age", 25)
            .apply()
        )

        # All updates should be applied cumulatively
        age_field = result_schema.field("age")
        assert age_field.arrow.type == pa.int64()
        assert age_field.doc == "Updated age field"
        assert age_field.consistency_type == SchemaConsistencyType.COERCE
        assert age_field.future_default == 25
        assert age_field.id == 3  # Original ID preserved
1365
+
1366
    def test_non_conflicting_operations_succeed(self, base_schema):
        """Test that non-conflicting operations on different fields succeed."""
        new_field = Field.of(
            pa.field("email", pa.string(), nullable=True), field_id=999
        )

        update = SchemaUpdate.of(base_schema)
        result_schema = (
            update.add_field(new_field)  # Add new field "email"
            .update_field_type("age", pa.int64())  # Update different field "age"
            .update_field_doc("name", "Full name")  # Update different field "name"
            .apply()
        )

        # All operations should succeed since they target different fields
        assert len(result_schema.fields) == 4  # 3 original + 1 new

        # Verify new field was added with auto-assigned ID
        email_field = result_schema.field("email")
        assert email_field.id == 4  # Auto-assigned, not 999
        assert email_field.arrow.name == "email"

        # Verify updates were applied
        age_field = result_schema.field("age")
        assert age_field.arrow.type == pa.int64()
        assert age_field.id == 3  # Original ID preserved

        name_field = result_schema.field("name")
        assert name_field.doc == "Full name"
        assert name_field.id == 2  # Original ID preserved
1396
+
1397
+ def test_add_duplicate_field_name_fails(self, base_schema):
1398
+ """Test adding a field with a name that already exists should fail."""
1399
+ # Try to add a field with same name as existing field
1400
+ duplicate_field = Field.of(
1401
+ pa.field("name", pa.int32(), nullable=True), field_id=4
1402
+ )
1403
+
1404
+ update = SchemaUpdate.of(base_schema)
1405
+ with pytest.raises(SchemaCompatibilityError) as exc_info:
1406
+ update.add_field(duplicate_field).apply()
1407
+
1408
+ assert "already exists" in str(exc_info.value)
1409
+
1410
    def test_add_field_ignores_user_specified_field_id(self, base_schema):
        """Test that add_field operations ignore user-specified field IDs and auto-assign sequentially.

        This ensures field ID uniqueness and prevents users from accidentally creating
        conflicts by specifying existing field IDs.
        """
        # Try to add fields with conflicting field IDs (should be ignored)
        new_field1 = Field.of(
            pa.field("email", pa.string(), nullable=True),
            field_id=1,  # Intentionally conflicts with existing "id" field
        )
        new_field2 = Field.of(
            pa.field("score", pa.float64(), nullable=True),
            field_id=2,  # Intentionally conflicts with existing "name" field
        )
        new_field3 = Field.of(
            pa.field("active", pa.bool_(), nullable=True),
            field_id=999,  # High number that should be ignored
        )

        update = SchemaUpdate.of(base_schema)
        result_schema = (
            update.add_field(new_field1)
            .add_field(new_field2)
            .add_field(new_field3)
            .apply()
        )

        # New fields should get auto-assigned field IDs starting from max_field_id + 1
        assert len(result_schema.fields) == 6  # 3 original + 3 new

        # Verify original fields keep their IDs
        assert result_schema.field("id").id == 1
        assert result_schema.field("name").id == 2
        assert result_schema.field("age").id == 3

        # Verify new fields get sequential auto-assigned IDs (ignoring user input),
        # assigned in the order the add_field() calls were chained.
        email_field = result_schema.field("email")
        score_field = result_schema.field("score")
        active_field = result_schema.field("active")

        assert email_field.id == 4  # Not 1 (user-specified)
        assert score_field.id == 5  # Not 2 (user-specified)
        assert active_field.id == 6  # Not 999 (user-specified)

        # Verify no duplicate field IDs
        field_ids = [field.id for field in result_schema.fields]
        assert len(field_ids) == len(
            set(field_ids)
        ), f"Duplicate field IDs found: {field_ids}"
1460
+
1461
+ def test_update_field_preserves_original_field_id(self, base_schema):
1462
+ """Test that update operations preserve the original field's ID regardless of user input.
1463
+
1464
+ This ensures field ID stability during updates - the field ID should never change
1465
+ when updating an existing field's properties.
1466
+ """
1467
+ # Create a field update with a different field ID (should be ignored)
1468
+ update = SchemaUpdate.of(base_schema)
1469
+
1470
+ # Update field type - the field ID in this context should be ignored
1471
+ result_schema = update.update_field_type("age", pa.int64()).apply()
1472
+
1473
+ # Field ID should remain the same as original (3), not change
1474
+ updated_field = result_schema.field("age")
1475
+ original_field = base_schema.field("age")
1476
+
1477
+ assert updated_field.id == original_field.id # Should be 3
1478
+ assert updated_field.arrow.type == pa.int64() # Type should be updated
1479
+ assert updated_field.arrow.name == "age" # Name should stay same
1480
+
1481
+ # All other fields should keep their original IDs too
1482
+ assert result_schema.field("id").id == 1
1483
+ assert result_schema.field("name").id == 2
1484
+
1485
    def test_mixed_add_update_field_id_management(self, base_schema):
        """Test field ID management with mixed add and update operations.

        Updates should preserve existing field IDs, while adds should get new sequential IDs.
        """
        # Add a field with conflicting ID, then update an existing field
        new_field = Field.of(
            pa.field("email", pa.string(), nullable=True),
            field_id=2,  # Same as "name" field - should be ignored
        )

        update = SchemaUpdate.of(base_schema)
        result_schema = (
            update.add_field(new_field)  # Should get field_id=4, not 2
            .update_field_type("age", pa.int64())  # Should keep field_id=3
            .update_field_doc("name", "Full name")  # Should keep field_id=2
            .apply()
        )

        # Verify field ID assignments
        assert result_schema.field("id").id == 1  # Original
        assert result_schema.field("name").id == 2  # Original, updated doc
        assert result_schema.field("age").id == 3  # Original, updated type
        assert result_schema.field("email").id == 4  # New field, auto-assigned

        # Verify updates were applied
        assert result_schema.field("age").arrow.type == pa.int64()
        assert result_schema.field("name").doc == "Full name"
        assert result_schema.field("email").arrow.name == "email"

        # Verify no duplicates
        field_ids = [field.id for field in result_schema.fields]
        assert len(field_ids) == len(set(field_ids))
1518
+
1519
+ def test_field_id_auto_assignment_with_gaps(self):
1520
+ """Test that field ID auto-assignment handles gaps in existing field IDs correctly.
1521
+
1522
+ If the schema has field IDs [1, 3, 7], new fields should start from 8.
1523
+ """
1524
+ # Create a schema with gaps in field IDs
1525
+ schema_with_gaps = Schema.of(
1526
+ [
1527
+ Field.of(pa.field("id", pa.int64()), field_id=1),
1528
+ Field.of(pa.field("name", pa.string()), field_id=3), # Gap at 2
1529
+ Field.of(pa.field("score", pa.float32()), field_id=7), # Gap at 4,5,6
1530
+ ]
1531
+ )
1532
+
1533
+ new_field = Field.of(
1534
+ pa.field("email", pa.string(), nullable=True),
1535
+ field_id=999, # Should be ignored, auto-assigned to 8
1536
+ )
1537
+
1538
+ update = SchemaUpdate.of(schema_with_gaps)
1539
+ result_schema = update.add_field(new_field).apply()
1540
+
1541
+ # New field should get max_field_id + 1 = 7 + 1 = 8
1542
+ email_field = result_schema.field("email")
1543
+ assert email_field.id == 8 # Not 999 or any of the existing gaps
1544
+
1545
+ # Original fields should be unchanged
1546
+ assert result_schema.field("id").id == 1
1547
+ assert result_schema.field("name").id == 3
1548
+ assert result_schema.field("score").id == 7
1549
+
1550
    def test_field_id_never_reused_after_max_field_removal(self):
        """Test that field IDs are never reused, even when max field ID is removed and same-named field added back.

        This ensures field ID uniqueness over schema evolution history - a field with the same name
        but added after removal gets a new field ID, making it clear it's a different field.
        """
        # Create schema with fields having IDs 1, 2, 3
        base_schema = Schema.of(
            [
                Field.of(pa.field("id", pa.int64()), field_id=1),
                Field.of(pa.field("name", pa.string()), field_id=2),
                Field.of(
                    pa.field("score", pa.float32()), field_id=3
                ),  # This has max field ID
            ]
        )

        # Step 1: Remove the field with the max field ID (score, ID=3)
        update1 = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
        schema_after_remove = update1.remove_field("score").apply()

        # Verify the field is removed
        assert len(schema_after_remove.fields) == 2
        field_names = [f.path[0] for f in schema_after_remove.fields if f.path]
        assert "score" not in field_names
        assert "id" in field_names
        assert "name" in field_names

        # Max field ID should still be 3 (based on original schema) - removal
        # does not lower the high-water mark, which is what prevents ID reuse.
        assert schema_after_remove.max_field_id == 3

        # Step 2: Add a field back with the same name ("score") but different type
        new_score_field = Field.of(
            pa.field(
                "score", pa.int32(), nullable=True
            ),  # Different type than original
            field_id=999,  # Will be ignored, should get ID 4 (not reuse 3)
        )

        update2 = SchemaUpdate.of(schema_after_remove)
        schema_after_add = update2.add_field(new_score_field).apply()

        # Verify field is added back
        assert len(schema_after_add.fields) == 3
        restored_score_field = schema_after_add.field("score")

        # New field should get ID 4 (max_field_id + 1), NOT reuse ID 3
        assert (
            restored_score_field.id == 4
        )  # Should be 4, not 3 (the removed field's ID)
        assert restored_score_field.arrow.name == "score"
        assert (
            restored_score_field.arrow.type == pa.int32()
        )  # Different type than original

        # Original fields should keep their IDs
        assert schema_after_add.field("id").id == 1
        assert schema_after_add.field("name").id == 2

        # Verify no duplicate field IDs in final schema
        field_ids = [field.id for field in schema_after_add.fields]
        assert len(field_ids) == len(
            set(field_ids)
        ), f"Duplicate field IDs found: {field_ids}"
        assert sorted(field_ids) == [1, 2, 4]  # Field ID 3 is permanently "retired"
1615
+
1616
    def test_field_id_never_reused_multiple_removes_adds(self):
        """Test field ID non-reuse with multiple remove/add cycles.

        This tests that field IDs continue incrementing even through multiple
        remove and add operations, ensuring each field gets a truly unique ID.
        """
        # Start with schema having IDs 1, 2, 3
        base_schema = Schema.of(
            [
                Field.of(pa.field("id", pa.int64()), field_id=1),
                Field.of(pa.field("name", pa.string()), field_id=2),
                Field.of(pa.field("score", pa.float32()), field_id=3),
            ]
        )

        # Remove field with ID 3, add new field -> should get ID 4
        # (user-specified field_id=999 is ignored, as in the other add tests)
        step1_schema = (
            SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
            .remove_field("score")
            .add_field(Field.of(pa.field("email", pa.string()), field_id=999))
            .apply()
        )
        assert step1_schema.field("email").id == 4

        # Remove field with ID 2, add new field -> should get ID 5
        step2_schema = (
            SchemaUpdate.of(step1_schema, allow_incompatible_changes=True)
            .remove_field("name")
            .add_field(Field.of(pa.field("phone", pa.string()), field_id=888))
            .apply()
        )
        assert step2_schema.field("phone").id == 5

        # Add back "name" field -> should get ID 6 (not reuse 2)
        step3_schema = (
            SchemaUpdate.of(step2_schema)
            .add_field(Field.of(pa.field("name", pa.string()), field_id=777))
            .apply()
        )
        restored_name_field = step3_schema.field("name")
        assert restored_name_field.id == 6  # Not 2 (the original name field's ID)

        # Final schema should have fields with IDs [1, 4, 5, 6]
        # IDs 2 and 3 are permanently "retired"
        field_ids = [field.id for field in step3_schema.fields]
        assert sorted(field_ids) == [1, 4, 5, 6]

        # Verify field names in final schema
        field_names = [f.path[0] for f in step3_schema.fields if f.path]
        assert sorted(field_names) == ["email", "id", "name", "phone"]
1666
+
1667
+ def test_schema_update_increments_id_by_one(self, base_schema):
1668
+ """Test that SchemaUpdate.apply() increments schema ID by exactly 1."""
1669
+ # Create a schema with a specific schema ID
1670
+ test_schema = Schema.of(
1671
+ [
1672
+ Field.of(
1673
+ pa.field("id", pa.int64(), nullable=False),
1674
+ field_id=1,
1675
+ is_merge_key=True,
1676
+ ),
1677
+ Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
1678
+ Field.of(pa.field("age", pa.int32(), nullable=True), field_id=3),
1679
+ ],
1680
+ schema_id=5, # Explicitly set schema ID to 5
1681
+ )
1682
+
1683
+ # Verify base schema has the expected ID
1684
+ assert test_schema.id == 5
1685
+
1686
+ # Apply a schema update (add a new field)
1687
+ new_field = Field.of(pa.field("email", pa.string(), nullable=True), field_id=4)
1688
+ updated_schema = SchemaUpdate.of(test_schema).add_field(new_field).apply()
1689
+
1690
+ # Verify the updated schema has ID = base_schema.id + 1
1691
+ assert updated_schema.id == 6 # 5 + 1
1692
+ assert len(updated_schema.fields) == 4
1693
+
1694
+ def test_schema_update_increments_id_from_zero(self):
1695
+ """Test that schema ID increments correctly when starting from 0."""
1696
+ # Create a schema with default schema ID (0)
1697
+ base_schema = Schema.of(
1698
+ [
1699
+ Field.of(pa.field("id", pa.int64()), field_id=1),
1700
+ Field.of(pa.field("name", pa.string()), field_id=2),
1701
+ ]
1702
+ )
1703
+
1704
+ # Verify base schema has default ID of 0
1705
+ assert base_schema.id == 0
1706
+
1707
+ # Apply a schema update
1708
+ updated_schema = (
1709
+ base_schema.update()
1710
+ .update_field_type("name", pa.string())
1711
+ .update_field_doc("name", "Full name")
1712
+ .apply()
1713
+ )
1714
+
1715
+ # Verify the updated schema has ID = 0 + 1 = 1
1716
+ assert updated_schema.id == 1
1717
+
1718
    def test_multiple_schema_updates_increment_sequentially(self):
        """Test that multiple schema updates increment ID sequentially."""
        # Start with schema ID 10; each subsequent apply() chains off the
        # previous result, so the IDs should climb 10 -> 11 -> 12 -> 13.
        base_schema = Schema.of(
            [
                Field.of(pa.field("id", pa.int64()), field_id=1),
                Field.of(pa.field("name", pa.string()), field_id=2),
            ],
            schema_id=10,
        )

        assert base_schema.id == 10

        # First update: should go from 10 to 11
        schema_v11 = (
            base_schema.update()
            .add_field(Field.of(pa.field("age", pa.int32(), nullable=True)))
            .apply()
        )
        assert schema_v11.id == 11

        # Second update: should go from 11 to 12
        schema_v12 = (
            schema_v11.update()
            .add_field(Field.of(pa.field("email", pa.string(), nullable=True)))
            .apply()
        )
        assert schema_v12.id == 12

        # Third update: should go from 12 to 13
        schema_v13 = (
            schema_v12.update()
            .update_field_consistency_type("name", SchemaConsistencyType.VALIDATE)
            .apply()
        )
        assert schema_v13.id == 13
1754
+
1755
    def test_schema_update_different_operation_types_increment_id(self):
        """Test that different types of schema operations all increment schema ID."""
        base_schema = Schema.of(
            [
                Field.of(pa.field("id", pa.int64()), field_id=1, is_merge_key=True),
                Field.of(pa.field("name", pa.string()), field_id=2),
                Field.of(pa.field("age", pa.int32()), field_id=3),
            ],
            schema_id=100,
        )

        # Every branch below starts from the same schema_id=100 base, so each
        # independent apply() should land on exactly 101.

        # Test add field operation
        add_result = (
            base_schema.update()
            .add_field(Field.of(pa.field("email", pa.string(), nullable=True)))
            .apply()
        )
        assert add_result.id == 101

        # Test update field operation
        update_result = (
            base_schema.update().update_field_type("age", pa.int64()).apply()
        )
        assert update_result.id == 101

        # Test rename field operation
        rename_result = base_schema.update().rename_field("name", "full_name").apply()
        assert rename_result.id == 101

        # Test remove field operation (with incompatible changes allowed)
        remove_result = (
            base_schema.update(allow_incompatible_changes=True)
            .remove_field("age")
            .apply()
        )
        assert remove_result.id == 101

        # Test update field documentation
        doc_result = (
            base_schema.update().update_field_doc("name", "Person's full name").apply()
        )
        assert doc_result.id == 101
1797
+
1798
+ def test_schema_update_chained_operations_increment_once(self):
1799
+ """Test that multiple chained operations in one update increment ID by 1, not per operation."""
1800
+ base_schema = Schema.of(
1801
+ [
1802
+ Field.of(pa.field("id", pa.int64()), field_id=1),
1803
+ Field.of(pa.field("name", pa.string()), field_id=2),
1804
+ Field.of(pa.field("age", pa.int32()), field_id=3),
1805
+ ],
1806
+ schema_id=50,
1807
+ )
1808
+
1809
+ # Chain multiple operations in a single SchemaUpdate
1810
+ chained_result = (
1811
+ base_schema.update()
1812
+ .add_field(Field.of(pa.field("email", pa.string(), nullable=True)))
1813
+ .add_field(Field.of(pa.field("phone", pa.string(), nullable=True)))
1814
+ .update_field_type("age", pa.int64())
1815
+ .update_field_doc("name", "Full name")
1816
+ .rename_field("id", "user_id")
1817
+ .apply()
1818
+ )
1819
+
1820
+ # Even with 5 operations, schema ID should only increment by 1
1821
+ assert chained_result.id == 51 # 50 + 1, not 50 + 5
1822
+
1823
+ def test_schema_subschema_operations_increment_id(self):
1824
+ """Test that subschema operations (add/delete/replace) also increment schema ID by 1."""
1825
+ # Create a base schema
1826
+ base_schema = Schema.of(
1827
+ [
1828
+ Field.of(pa.field("id", pa.int64()), field_id=1),
1829
+ Field.of(pa.field("name", pa.string()), field_id=2),
1830
+ ],
1831
+ schema_id=20,
1832
+ )
1833
+
1834
+ # Test add_subschema operation
1835
+ add_subschema_result = base_schema.add_subschema(
1836
+ "user_profile",
1837
+ [
1838
+ Field.of(pa.field("email", pa.string()), field_id=3),
1839
+ Field.of(pa.field("age", pa.int32()), field_id=4),
1840
+ ],
1841
+ )
1842
+ assert add_subschema_result.id == 21 # 20 + 1
1843
+
1844
+ # Test replace_subschema operation
1845
+ schema_with_subschema = base_schema.add_subschema(
1846
+ "test_subschema", [Field.of(pa.field("temp", pa.string()), field_id=5)]
1847
+ )
1848
+ replace_result = schema_with_subschema.replace_subschema(
1849
+ "test_subschema", [Field.of(pa.field("replaced", pa.int32()), field_id=6)]
1850
+ )
1851
+ assert replace_result.id == 22 # 21 + 1
1852
+
1853
+ # Test delete_subschema operation
1854
+ delete_result = schema_with_subschema.delete_subschema("test_subschema")
1855
+ assert delete_result.id == 22 # 21 + 1
1856
+
1857
+ def test_schema_id_increment_with_high_values(self):
1858
+ """Test that schema ID increment works correctly with high values."""
1859
+ # Test with a high schema ID to ensure no overflow issues
1860
+ high_id = 999999
1861
+ base_schema = Schema.of(
1862
+ [Field.of(pa.field("id", pa.int64()), field_id=1)],
1863
+ schema_id=high_id,
1864
+ )
1865
+
1866
+ updated_schema = (
1867
+ base_schema.update()
1868
+ .add_field(Field.of(pa.field("name", pa.string(), nullable=True)))
1869
+ .apply()
1870
+ )
1871
+
1872
+ assert updated_schema.id == high_id + 1
1873
+
1874
+ def test_schema_id_preserved_in_failed_updates(self):
1875
+ """Test that schema ID is not incremented when schema updates fail."""
1876
+ base_schema = Schema.of(
1877
+ [
1878
+ Field.of(pa.field("id", pa.int64()), field_id=1),
1879
+ Field.of(pa.field("name", pa.string()), field_id=2),
1880
+ ],
1881
+ schema_id=42,
1882
+ )
1883
+
1884
+ # Try an operation that should fail (adding non-nullable field without defaults)
1885
+ with pytest.raises(Exception): # Could be SchemaCompatibilityError or other
1886
+ base_schema.update().add_field(
1887
+ Field.of(pa.field("required_field", pa.string(), nullable=False))
1888
+ ).apply()
1889
+
1890
+ # Original schema should still have the same ID
1891
+ assert base_schema.id == 42
1892
+
1893
+ # A successful update should still increment correctly
1894
+ success_schema = (
1895
+ base_schema.update()
1896
+ .add_field(Field.of(pa.field("optional_field", pa.string(), nullable=True)))
1897
+ .apply()
1898
+ )
1899
+ assert success_schema.id == 43
1900
+
1901
+ def test_schema_id_increment_consistency_across_update_methods(self):
1902
+ """Test that schema ID increments consistently regardless of how SchemaUpdate is created."""
1903
+ base_schema = Schema.of(
1904
+ [Field.of(pa.field("id", pa.int64()), field_id=1)],
1905
+ schema_id=77,
1906
+ )
1907
+
1908
+ # Method 1: Using Schema.update()
1909
+ result1 = (
1910
+ base_schema.update()
1911
+ .add_field(Field.of(pa.field("field1", pa.string(), nullable=True)))
1912
+ .apply()
1913
+ )
1914
+ assert result1.id == 78
1915
+
1916
+ # Method 2: Using SchemaUpdate.of()
1917
+ result2 = (
1918
+ SchemaUpdate.of(base_schema)
1919
+ .add_field(Field.of(pa.field("field2", pa.string(), nullable=True)))
1920
+ .apply()
1921
+ )
1922
+ assert result2.id == 78
1923
+
1924
+ # Both methods should produce the same schema ID increment
1925
+ assert result1.id == result2.id