deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1972 @@
1
+ import shutil
2
+ import tempfile
3
+
4
+ import pytest
5
+ import pyarrow as pa
6
+ import pandas as pd
7
+ import polars as pl
8
+ import numpy as np
9
+ import ray.data as rd
10
+ import daft
11
+
12
+ import deltacat.catalog.main.impl as catalog
13
+ from deltacat.catalog import get_catalog_properties
14
+ from deltacat.storage.model.schema import (
15
+ Schema,
16
+ Field,
17
+ )
18
+ from deltacat.storage.model.types import SchemaConsistencyType
19
+ from deltacat.storage.model.sort_key import SortKey, SortScheme, SortOrder, NullOrder
20
+ from deltacat.storage.model.types import LifecycleState
21
+ from deltacat.exceptions import (
22
+ TableAlreadyExistsError,
23
+ TableNotFoundError,
24
+ TableValidationError,
25
+ SchemaValidationError,
26
+ )
27
+ from deltacat.types.tables import TableWriteMode, TableProperty, SchemaEvolutionMode
28
+ from deltacat.types.media import ContentType
29
+
30
+
31
@pytest.fixture(scope="class")
def catalog_setup():
    """Provision a throwaway catalog root for the test class, then clean it up."""
    workdir = tempfile.mkdtemp()
    props = get_catalog_properties(root=workdir)
    yield workdir, props
    # Teardown: remove the temporary catalog root created above.
    shutil.rmtree(workdir)
40
+
41
+
42
@pytest.fixture(scope="function")
def test_namespace(catalog_setup):
    """Hand each test the shared namespace, creating it on first use."""
    _, props = catalog_setup
    name = "test_table_namespace"
    # Lazily create the namespace so repeated tests reuse the existing one.
    if not catalog.namespace_exists(name, inner=props):
        catalog.create_namespace(
            namespace=name,
            properties={"description": "Test Table Namespace"},
            inner=props,
        )
    return name, props
56
+
57
+
58
@pytest.fixture
def sample_arrow_schema():
    """A small three-column PyArrow schema shared by the table tests."""
    fields = [
        pa.field("id", pa.int64()),
        pa.field("name", pa.string()),
        pa.field("value", pa.float64()),
    ]
    return pa.schema(fields)
68
+
69
+
70
@pytest.fixture
def sample_sort_keys():
    """A sort scheme with a single ascending key over the ``id`` column."""
    id_key = SortKey.of(
        key=["id"], sort_order=SortOrder.ASCENDING, null_order=NullOrder.AT_END
    )
    return SortScheme(keys=[id_key])
80
+
81
+
82
+ class TestCatalogTableOperations:
83
+ """Test catalog table operations including table creation, existence checks, etc."""
84
+
85
@classmethod
def setup_class(cls):
    """Create a class-scoped catalog root and a namespace for write tests."""
    cls.temp_dir = tempfile.mkdtemp()
    cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)
    # Namespace shared by every test method in this class.
    cls.test_namespace = "test_write_operations"
    catalog.create_namespace(
        namespace=cls.test_namespace,
        inner=cls.catalog_properties,
    )
96
+
97
@classmethod
def teardown_class(cls):
    """Remove the class-scoped catalog root."""
    shutil.rmtree(cls.temp_dir)
100
+
101
def test_create_table(self, test_namespace, sample_arrow_schema, sample_sort_keys):
    """Creating a table should persist its schema, sort keys, and properties."""
    ns, props = test_namespace
    table_name = "test_create_table"

    table_schema = Schema(arrow=sample_arrow_schema)
    tbl_props = {"owner": "test-user", "department": "engineering"}
    ns_props = {"description": "Test Namespace"}

    definition = catalog.create_table(
        table=table_name,
        namespace=ns,
        schema=table_schema,
        sort_keys=sample_sort_keys,
        table_description="Test table for unit tests",
        table_properties=tbl_props,
        namespace_properties=ns_props,
        inner=props,
    )

    # The new table must be discoverable through the catalog.
    assert catalog.table_exists(table_name, namespace=ns, inner=props)

    table = definition.table
    version = definition.table_version

    # Metadata round-trips through the returned definition.
    assert version.table_name == table_name
    assert version.namespace == ns
    assert version.description == "Test table for unit tests"
    assert version.state == LifecycleState.ACTIVE
    assert table.properties.get("owner") == "test-user"
    assert table.properties.get("department") == "engineering"
    assert version.schema.arrow.names == sample_arrow_schema.names
    assert len(version.sort_scheme.keys) == 1
    key_paths = [k[0][0] for k in version.sort_scheme.keys]
    assert "id" in key_paths
148
+
149
def test_create_table_already_exists(self, test_namespace):
    """Re-creating an existing table must raise TableAlreadyExistsError."""
    ns, props = test_namespace
    table_name = "test_table_exists"

    catalog.create_table(
        table=table_name,
        namespace=ns,
        table_description="First creation",
        inner=props,
    )
    assert catalog.table_exists(table_name, namespace=ns, inner=props)

    # A second creation attempt with default settings must fail loudly.
    expected = f"Table {ns}.{table_name} already exists"
    with pytest.raises(TableAlreadyExistsError, match=expected):
        catalog.create_table(
            table=table_name,
            namespace=ns,
            table_description="Second creation attempt",
            inner=props,
        )
179
+
180
def test_create_table_already_exists_no_fail(self, test_namespace):
    """Test creating a table that already exists with fail_if_exists=False.

    The second create_table call must be a no-op that returns the existing
    table definition rather than raising or overwriting its metadata.
    """
    namespace_name, catalog_properties = test_namespace
    table_name = "test_table_exists_no_fail"

    # Create the table with original description
    catalog.create_table(
        table=table_name,
        namespace=namespace_name,
        table_description="Original description",
        inner=catalog_properties,
    )

    # Fix: was `catalog=catalog_properties`; every other table_exists call in
    # this module passes the properties via the `inner` keyword, so the wrong
    # keyword either raised or silently fell back to an ambient catalog.
    assert catalog.table_exists(
        table_name,
        namespace=namespace_name,
        inner=catalog_properties,
    )

    # Create the same table with fail_if_exists=False
    table_definition = catalog.create_table(
        table=table_name,
        namespace=namespace_name,
        table_description="Updated description",
        fail_if_exists=False,
        inner=catalog_properties,
    )

    table = table_definition.table

    assert table.table_name == table_name
    assert table.namespace == namespace_name
    # Ensure description is unchanged
    assert table.description == "Original description"
214
+
215
def test_drop_table(self, test_namespace):
    """Dropping a table removes it from the catalog."""
    ns, props = test_namespace
    table_name = "test_drop_table"

    # Arrange: the table exists before the drop.
    catalog.create_table(
        table=table_name,
        namespace=ns,
        inner=props,
    )
    assert catalog.table_exists(table_name, namespace=ns, inner=props)

    # Act: drop it.
    catalog.drop_table(
        table=table_name,
        namespace=ns,
        inner=props,
    )

    # Assert: it is gone.
    assert not catalog.table_exists(table_name, namespace=ns, inner=props)
246
+
247
def test_drop_table_not_exists(self, test_namespace):
    """Dropping a missing table must raise TableNotFoundError."""
    ns, props = test_namespace
    table_name = "nonexistent_table"

    # Precondition: the table really is absent.
    assert not catalog.table_exists(table_name, namespace=ns, inner=props)

    with pytest.raises(TableNotFoundError, match=table_name):
        catalog.drop_table(
            table=table_name,
            namespace=ns,
            inner=props,
        )
265
+
266
def test_rename_namespace(self, test_namespace):
    """Renaming a namespace carries its tables over to the new name."""
    ns, props = test_namespace
    original_name = "test_original_table"
    new_name = "test_renamed_namespace"

    # Seed the namespace with one table so the move is observable.
    catalog.create_table(
        table=original_name,
        namespace=ns,
        table_description="Table to in namespace to be renamed",
        inner=props,
    )
    assert catalog.table_exists(original_name, namespace=ns, inner=props)

    # Rename the namespace itself.
    catalog.alter_namespace(
        namespace=ns,
        new_namespace=new_name,
        inner=props,
    )

    # Only the new namespace name should resolve afterwards.
    assert catalog.namespace_exists(new_name, inner=props)
    assert not catalog.namespace_exists(ns, inner=props)

    # The table must still be reachable under the new namespace.
    assert catalog.table_exists(original_name, namespace=new_name, inner=props)
303
+
304
def test_rename_table(self, test_namespace):
    """Renaming a table makes it visible only under the new name."""
    ns, props = test_namespace
    original_name = "test_original_table"
    new_name = "test_renamed_table"

    # Arrange: create the table under its original name.
    catalog.create_table(
        table=original_name,
        namespace=ns,
        table_description="Table to be renamed",
        inner=props,
    )
    assert catalog.table_exists(original_name, namespace=ns, inner=props)

    # Act: rename it.
    catalog.rename_table(
        table=original_name,
        new_name=new_name,
        namespace=ns,
        inner=props,
    )

    # Assert: only the new name resolves.
    assert catalog.table_exists(new_name, namespace=ns, inner=props)
    assert not catalog.table_exists(original_name, namespace=ns, inner=props)
343
+
344
def test_rename_table_not_exists(self, test_namespace):
    """Renaming a missing table must raise TableNotFoundError."""
    ns, props = test_namespace
    original_name = "nonexistent_table"
    new_name = "test_renamed_nonexistent"

    # Precondition: the source table is absent.
    assert not catalog.table_exists(original_name, namespace=ns, inner=props)

    with pytest.raises(TableNotFoundError, match=original_name):
        catalog.rename_table(
            table=original_name,
            new_name=new_name,
            namespace=ns,
            inner=props,
        )
364
+
365
def test_table_exists(self, test_namespace):
    """table_exists is True for created tables and False otherwise."""
    ns, props = test_namespace
    existing_table = "test_table_exists_check"
    non_existing_table = "nonexistent_table"

    catalog.create_table(
        table=existing_table,
        namespace=ns,
        inner=props,
    )

    # Positive and negative lookups.
    assert catalog.table_exists(existing_table, namespace=ns, inner=props)
    assert not catalog.table_exists(non_existing_table, namespace=ns, inner=props)
390
+
391
+ def test_create_table_with_default_namespace(self, catalog_setup):
392
+ _, catalog_properties = catalog_setup
393
+ table_name = "test_default_namespace_table"
394
+
395
+ # Create table with default namespace
396
+ table_definition = catalog.create_table(
397
+ table=table_name, inner=catalog_properties
398
+ )
399
+
400
+ table = table_definition.table
401
+ # Verify table was created in default namespace
402
+ default_ns = catalog.default_namespace()
403
+ assert table.namespace == default_ns
404
+ assert catalog.table_exists(
405
+ table_name,
406
+ namespace=default_ns,
407
+ inner=catalog_properties,
408
+ )
409
+
410
+ def test_create_table_with_missing_namespace(self, catalog_setup):
411
+ _, catalog_properties = catalog_setup
412
+ table_name = "test_namespace_not_found_table"
413
+ new_namespace = "nonexistent_namespace"
414
+
415
+ # Verify namespace doesn't exist yet
416
+ assert not catalog.namespace_exists(new_namespace, inner=catalog_properties)
417
+
418
+ # Try to create table with non-existent namespace
419
+ catalog.create_table(
420
+ table=table_name,
421
+ namespace=new_namespace,
422
+ inner=catalog_properties,
423
+ )
424
+
425
+ assert catalog.table_exists(
426
+ table_name,
427
+ namespace=new_namespace,
428
+ inner=catalog_properties,
429
+ )
430
+ assert catalog.namespace_exists(new_namespace, inner=catalog_properties)
431
+
432
+ def test_alter_table(self, test_namespace, sample_arrow_schema, sample_sort_keys):
433
+ namespace_name, catalog_properties = test_namespace
434
+ table_name = "test_alter_table"
435
+
436
+ # Create initial schema and properties
437
+ schema = Schema.of(schema=sample_arrow_schema)
438
+ initial_properties = {"owner": "original-user", "department": "engineering"}
439
+
440
+ # Create the table with initial properties
441
+ table = catalog.create_table(
442
+ table=table_name,
443
+ namespace=namespace_name,
444
+ schema=schema,
445
+ sort_keys=sample_sort_keys,
446
+ table_description="Initial description",
447
+ table_properties=initial_properties,
448
+ inner=catalog_properties,
449
+ )
450
+ old_schema = table.table_version.schema
451
+
452
+ # Verify table was created with initial properties
453
+ assert catalog.table_exists(
454
+ table_name,
455
+ namespace=namespace_name,
456
+ inner=catalog_properties,
457
+ )
458
+
459
+ # Create schema update operations to add a new field
460
+ new_field = Field.of(pa.field("count", pa.float64(), nullable=True))
461
+ schema_update = old_schema.update().add_field(new_field)
462
+
463
+ # Create updated properties
464
+ updated_properties = {
465
+ "owner": "new-user",
466
+ "department": "data-science",
467
+ "priority": "high",
468
+ }
469
+
470
+ # Alter the table with new properties and schema updates
471
+ catalog.alter_table(
472
+ table=table_name,
473
+ namespace=namespace_name,
474
+ schema_updates=schema_update,
475
+ table_description="Updated description",
476
+ table_properties=updated_properties,
477
+ inner=catalog_properties,
478
+ )
479
+
480
+ # Get the updated table definition
481
+ updated_table_def = catalog.get_table(
482
+ table_name,
483
+ namespace=namespace_name,
484
+ inner=catalog_properties,
485
+ )
486
+
487
+ updated_table = updated_table_def.table
488
+ updated_table_version = updated_table_def.table_version
489
+
490
+ # Verify table properties were updated
491
+ assert updated_table_version.description == "Updated description"
492
+ assert updated_table_version.state == LifecycleState.ACTIVE
493
+ assert updated_table.properties.get("owner") == "new-user"
494
+ assert updated_table.properties.get("department") == "data-science"
495
+ assert updated_table.properties.get("priority") == "high"
496
+
497
+ # Verify schema was updated with new field
498
+ updated_schema = updated_table_version.schema
499
+ assert updated_schema.field("count") is not None
500
+ assert updated_schema.field("count").arrow.type == pa.float64()
501
+ assert updated_schema.field("count").arrow.nullable is True
502
+ assert (
503
+ updated_schema.field("count").id == 3
504
+ ) # Next sequential ID after id(0), name(1), value(2)
505
+
506
+ # Verify schema ID was incremented (proving SchemaUpdate was used)
507
+ assert updated_schema.id == old_schema.id + 1
508
+
509
+ def test_alter_table_not_exists(self, test_namespace):
510
+ """Test altering a table that doesn't exist"""
511
+ namespace_name, catalog_properties = test_namespace
512
+ nonexistent_table = "nonexistent_alter_table"
513
+
514
+ # Verify table doesn't exist
515
+ assert not catalog.table_exists(
516
+ nonexistent_table,
517
+ namespace=namespace_name,
518
+ inner=catalog_properties,
519
+ )
520
+
521
+ # Try to alter the nonexistent table, should raise TableNotFoundError
522
+ with pytest.raises(TableNotFoundError, match=nonexistent_table):
523
+ catalog.alter_table(
524
+ table=nonexistent_table,
525
+ namespace=namespace_name,
526
+ table_description="Updated description",
527
+ inner=catalog_properties,
528
+ )
529
+
530
+ def test_alter_table_with_multiple_schema_operations(
531
+ self, test_namespace, sample_arrow_schema
532
+ ):
533
+ """Test altering a table with multiple schema update operations."""
534
+ namespace_name, catalog_properties = test_namespace
535
+ table_name = "test_alter_table_multiple_ops"
536
+
537
+ # Create initial schema
538
+ schema = Schema.of(schema=sample_arrow_schema)
539
+ print("schema.max_field_id", schema.max_field_id)
540
+
541
+ # Create the table
542
+ table = catalog.create_table(
543
+ table=table_name,
544
+ namespace=namespace_name,
545
+ schema=schema,
546
+ table_description="Initial description",
547
+ inner=catalog_properties,
548
+ )
549
+
550
+ original_schema = table.table_version.schema
551
+
552
+ # Create multiple schema update operations
553
+ new_field1 = Field.of(pa.field("count", pa.int64(), nullable=True))
554
+ new_field2 = Field.of(
555
+ pa.field("status", pa.string(), nullable=False),
556
+ past_default="active",
557
+ )
558
+
559
+ schema_update = (
560
+ original_schema.update().add_field(new_field1).add_field(new_field2)
561
+ )
562
+ print("original_schema.max_field_id", original_schema.max_field_id)
563
+ print(
564
+ "schema_update.base_schema.max_field_id",
565
+ schema_update.base_schema.max_field_id,
566
+ )
567
+
568
+ # Alter the table
569
+ catalog.alter_table(
570
+ table=table_name,
571
+ namespace=namespace_name,
572
+ schema_updates=schema_update,
573
+ table_description="Updated with multiple fields",
574
+ inner=catalog_properties,
575
+ )
576
+
577
+ # Get the updated table
578
+ updated_table_def = catalog.get_table(
579
+ table_name,
580
+ namespace=namespace_name,
581
+ inner=catalog_properties,
582
+ )
583
+
584
+ updated_schema = updated_table_def.table_version.schema
585
+
586
+ # Verify both fields were added
587
+ assert updated_schema.field("count") is not None
588
+ assert updated_schema.field("count").arrow.type == pa.int64()
589
+ assert (
590
+ updated_schema.field("count").id == 3
591
+ ) # Next sequential ID after id(0), name(1), value(2)
592
+
593
+ assert updated_schema.field("status") is not None
594
+ assert updated_schema.field("status").arrow.type == pa.string()
595
+ assert (
596
+ updated_schema.field("status").id == 4
597
+ ) # Next sequential ID after count(3)
598
+ assert updated_schema.field("status").past_default == "active"
599
+
600
+ # Verify schema ID was incremented
601
+ assert updated_schema.id == original_schema.id + 1
602
+
603
+ def test_alter_table_with_remove_operation(self, test_namespace):
604
+ """Test altering a table with field removal (requires allow_incompatible_changes)."""
605
+ namespace_name, catalog_properties = test_namespace
606
+ table_name = "test_alter_table_remove"
607
+
608
+ # Create schema with multiple fields
609
+ initial_fields = [
610
+ Field.of(
611
+ pa.field("id", pa.int64(), nullable=False),
612
+ is_merge_key=True,
613
+ field_id=1,
614
+ ),
615
+ Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
616
+ Field.of(pa.field("temp_field", pa.float64(), nullable=True), field_id=3),
617
+ ]
618
+ schema = Schema.of(initial_fields)
619
+
620
+ # Create the table
621
+ table = catalog.create_table(
622
+ table=table_name,
623
+ namespace=namespace_name,
624
+ schema=schema,
625
+ inner=catalog_properties,
626
+ )
627
+ original_schema = table.table_version.schema
628
+ temp_field = original_schema.field("temp_field")
629
+ assert temp_field is not None
630
+
631
+ schema_update = original_schema.update(True).remove_field("temp_field")
632
+
633
+ catalog.alter_table(
634
+ table=table_name,
635
+ namespace=namespace_name,
636
+ schema_updates=schema_update,
637
+ inner=catalog_properties,
638
+ )
639
+
640
+ # If successful, verify the field was removed
641
+ updated_table_def = catalog.get_table(
642
+ table_name,
643
+ namespace=namespace_name,
644
+ inner=catalog_properties,
645
+ )
646
+ updated_schema = updated_table_def.table_version.schema
647
+
648
+ # temp_field should be removed
649
+ with pytest.raises(KeyError):
650
+ updated_schema.field("temp_field")
651
+
652
+ # all other fields should be present
653
+ assert updated_schema.field("id") is not None
654
+ assert updated_schema.field("id").arrow.type == pa.int64()
655
+ assert updated_schema.field("id").id == 1
656
+ assert updated_schema.field("name") is not None
657
+ assert updated_schema.field("name").arrow.type == pa.string()
658
+ assert updated_schema.field("name").id == 2
659
+
660
+ def test_alter_table_with_update_operation(self, test_namespace):
661
+ """Test altering a table with field update operation."""
662
+ namespace_name, catalog_properties = test_namespace
663
+ table_name = "test_alter_table_update"
664
+
665
+ # Create schema with a field to update
666
+ initial_fields = [
667
+ Field.of(
668
+ pa.field("id", pa.int64(), nullable=False),
669
+ is_merge_key=True,
670
+ field_id=1,
671
+ ),
672
+ Field.of(pa.field("value", pa.int32(), nullable=True), field_id=2),
673
+ ]
674
+ schema = Schema.of(initial_fields)
675
+
676
+ # Create the table
677
+ table = catalog.create_table(
678
+ table=table_name,
679
+ namespace=namespace_name,
680
+ schema=schema,
681
+ inner=catalog_properties,
682
+ )
683
+
684
+ original_schema = table.table_version.schema
685
+
686
+ # Update the value field to int64 (compatible type widening)
687
+ schema_update = original_schema.update().update_field_type("value", pa.int64())
688
+
689
+ # Alter the table
690
+ catalog.alter_table(
691
+ table=table_name,
692
+ namespace=namespace_name,
693
+ schema_updates=schema_update,
694
+ inner=catalog_properties,
695
+ )
696
+
697
+ # Get the updated table
698
+ updated_table_def = catalog.get_table(
699
+ table_name,
700
+ namespace=namespace_name,
701
+ inner=catalog_properties,
702
+ )
703
+
704
+ updated_schema = updated_table_def.table_version.schema
705
+
706
+ # Verify field was updated
707
+ assert updated_schema.field("value").arrow.type == pa.int64()
708
+ assert updated_schema.field("value").id == 2
709
+
710
+ # Verify schema ID was incremented
711
+ assert updated_schema.id == original_schema.id + 1
712
+
713
+ def test_alter_table_with_schema_evolution_disabled(self, test_namespace):
714
+ """Test that alter_table raises TableValidationError when schema evolution is disabled."""
715
+ namespace_name, catalog_properties = test_namespace
716
+ table_name = "test_alter_table_schema_evolution_disabled"
717
+
718
+ # Create initial schema
719
+ initial_fields = [
720
+ Field.of(
721
+ pa.field("id", pa.int64(), nullable=False),
722
+ is_merge_key=True,
723
+ field_id=1,
724
+ ),
725
+ Field.of(pa.field("value", pa.int32(), nullable=True), field_id=2),
726
+ ]
727
+ schema = Schema.of(initial_fields)
728
+
729
+ # Create table with SCHEMA_EVOLUTION_MODE.DISABLED
730
+ table_properties = {
731
+ TableProperty.SCHEMA_EVOLUTION_MODE: SchemaEvolutionMode.DISABLED
732
+ }
733
+
734
+ table = catalog.create_table(
735
+ table=table_name,
736
+ namespace=namespace_name,
737
+ schema=schema,
738
+ table_properties=table_properties,
739
+ inner=catalog_properties,
740
+ )
741
+
742
+ original_schema = table.table_version.schema
743
+
744
+ # Try to add a new field - this should be blocked
745
+ new_field = Field.of(pa.field("description", pa.string(), nullable=True))
746
+ schema_update = original_schema.update().add_field(new_field)
747
+
748
+ # Alter table with schema updates should raise TableValidationError
749
+ with pytest.raises(
750
+ TableValidationError,
751
+ match="Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates.",
752
+ ):
753
+ catalog.alter_table(
754
+ table=table_name,
755
+ namespace=namespace_name,
756
+ schema_updates=schema_update,
757
+ inner=catalog_properties,
758
+ )
759
+
760
+ # Verify the schema wasn't changed
761
+ unchanged_table_def = catalog.get_table(
762
+ table_name,
763
+ namespace=namespace_name,
764
+ inner=catalog_properties,
765
+ )
766
+ unchanged_schema = unchanged_table_def.table_version.schema
767
+
768
+ # Schema should be unchanged
769
+ assert unchanged_schema.id == original_schema.id
770
+ assert len(unchanged_schema.fields) == len(original_schema.fields)
771
+
772
+ # Verify the new field was not added
773
+ field_names = [field.arrow.name for field in unchanged_schema.fields]
774
+ assert "description" not in field_names
775
+
776
+ # Test that alter_table works without schema_updates even when schema evolution is disabled
777
+ catalog.alter_table(
778
+ table=table_name,
779
+ namespace=namespace_name,
780
+ table_description="Updated description without schema changes",
781
+ inner=catalog_properties,
782
+ )
783
+
784
+ # Verify that table description was updated but schema remains unchanged
785
+ final_table_def = catalog.get_table(
786
+ table_name,
787
+ namespace=namespace_name,
788
+ inner=catalog_properties,
789
+ )
790
+ assert (
791
+ final_table_def.table_version.description
792
+ == "Updated description without schema changes"
793
+ )
794
+ assert final_table_def.table_version.schema.id == original_schema.id
795
+
796
+ def test_drop_with_purge_validation(self, test_namespace):
797
+ """Test that using purge flag raises ValidationError"""
798
+ namespace_name, catalog_properties = test_namespace
799
+ table_name = "test_drop_with_purge"
800
+
801
+ # Create the table
802
+ catalog.create_table(
803
+ table=table_name,
804
+ namespace=namespace_name,
805
+ inner=catalog_properties,
806
+ )
807
+
808
+ # Try to drop with purge=True, should raise ValidationError
809
+ with pytest.raises(
810
+ NotImplementedError, match="Purge flag is not currently supported"
811
+ ):
812
+ catalog.drop_table(
813
+ table=table_name,
814
+ namespace=namespace_name,
815
+ purge=True,
816
+ inner=catalog_properties,
817
+ )
818
+
819
+ def test_create_table_basic(self):
820
+ """Test basic table creation"""
821
+ table_name = "test_create_table_basic"
822
+ schema = Schema.of(
823
+ schema=pa.schema(
824
+ [
825
+ ("id", pa.int64()),
826
+ ("name", pa.string()),
827
+ ]
828
+ )
829
+ )
830
+
831
+ table_def = catalog.create_table(
832
+ table=table_name,
833
+ namespace=self.test_namespace,
834
+ schema=schema,
835
+ inner=self.catalog_properties,
836
+ )
837
+
838
+ assert table_def.table.table_name == table_name
839
+ assert table_def.table_version.schema.equivalent_to(schema)
840
+
841
+ # Verify table exists
842
+ assert catalog.table_exists(
843
+ table=table_name,
844
+ namespace=self.test_namespace,
845
+ inner=self.catalog_properties,
846
+ )
847
+
848
+ def test_create_table_already_exists_fail_if_exists_true(self):
849
+ """Test creating a table that already exists with fail_if_exists=True"""
850
+ table_name = "test_create_table_exists"
851
+ schema = Schema.of(schema=pa.schema([("id", pa.int64())]))
852
+
853
+ # Create table first
854
+ catalog.create_table(
855
+ table=table_name,
856
+ namespace=self.test_namespace,
857
+ schema=schema,
858
+ inner=self.catalog_properties,
859
+ )
860
+
861
+ # Try to create again with fail_if_exists=True (default)
862
+ with pytest.raises(TableAlreadyExistsError):
863
+ catalog.create_table(
864
+ table=table_name,
865
+ namespace=self.test_namespace,
866
+ schema=schema,
867
+ fail_if_exists=True,
868
+ inner=self.catalog_properties,
869
+ )
870
+
871
+ def test_create_table_already_exists_fail_if_exists_false(self):
872
+ """Test creating a table that already exists with fail_if_exists=False"""
873
+ table_name = "test_create_table_exists_ok"
874
+ schema = Schema.of(schema=pa.schema([("id", pa.int64())]))
875
+
876
+ # Create table first
877
+ table_def1 = catalog.create_table(
878
+ table=table_name,
879
+ namespace=self.test_namespace,
880
+ schema=schema,
881
+ inner=self.catalog_properties,
882
+ )
883
+
884
+ # Create again with fail_if_exists=False should return existing table
885
+ table_def2 = catalog.create_table(
886
+ table=table_name,
887
+ namespace=self.test_namespace,
888
+ schema=schema,
889
+ fail_if_exists=False,
890
+ inner=self.catalog_properties,
891
+ )
892
+
893
+ assert table_def1.table.table_name == table_def2.table.table_name
894
+
895
+
896
+ class TestWriteToTable:
897
+ """Test the write_to_table implementation with different modes and data types."""
898
+
899
+ @classmethod
900
+ def setup_class(cls):
901
+ cls.temp_dir = tempfile.mkdtemp()
902
+ cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)
903
+
904
+ # Create a test namespace
905
+ cls.test_namespace = "test_write_to_table"
906
+ catalog.create_namespace(
907
+ namespace=cls.test_namespace, inner=cls.catalog_properties
908
+ )
909
+
910
+ @classmethod
911
+ def teardown_class(cls):
912
+ shutil.rmtree(cls.temp_dir)
913
+
914
+ def _create_test_pandas_data(self):
915
+ """Create test pandas DataFrame"""
916
+ return pd.DataFrame(
917
+ {
918
+ "id": [1, 2, 3, 4, 5],
919
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
920
+ "age": [25, 30, 35, 40, 45],
921
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
922
+ }
923
+ )
924
+
925
+ def _create_test_pyarrow_data(self):
926
+ """Create test PyArrow Table"""
927
+ return pa.table(
928
+ {
929
+ "id": [1, 2, 3, 4, 5],
930
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
931
+ "age": [25, 30, 35, 40, 45],
932
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
933
+ }
934
+ )
935
+
936
+ def _create_test_polars_data(self):
937
+ """Create test Polars DataFrame"""
938
+ return pl.DataFrame(
939
+ {
940
+ "id": [1, 2, 3, 4, 5],
941
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
942
+ "age": [25, 30, 35, 40, 45],
943
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
944
+ }
945
+ )
946
+
947
+ def _create_second_batch_pandas_data(self):
948
+ """Create second batch of test data for append tests"""
949
+ return pd.DataFrame(
950
+ {
951
+ "id": [6, 7, 8],
952
+ "name": ["Frank", "Grace", "Henry"],
953
+ "age": [50, 55, 60],
954
+ "city": ["Boston", "Seattle", "Denver"],
955
+ }
956
+ )
957
+
958
+ def _create_test_ray_data(self):
959
+ """Create test Ray Dataset for schema inference testing."""
960
+ import ray
961
+
962
+ # Initialize Ray if not already initialized
963
+ # Note: Use distributed mode (not local_mode=True) to avoid Ray 2.46.0 internal bug
964
+ if not ray.is_initialized():
965
+ ray.init()
966
+
967
+ data = pa.table(
968
+ {
969
+ "id": [1, 2, 3, 4, 5],
970
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
971
+ "age": [25, 30, 35, 40, 45],
972
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
973
+ }
974
+ )
975
+ return rd.from_arrow(data)
976
+
977
+ def _create_test_daft_data(self):
978
+ """Create test Daft DataFrame for schema inference testing."""
979
+ data = {
980
+ "id": [1, 2, 3],
981
+ "name": ["Alice", "Bob", "Charlie"],
982
+ "age": [25, 30, 35],
983
+ "city": ["NYC", "LA", "Chicago"],
984
+ }
985
+ return daft.from_pydict(data)
986
+
987
+ def _create_test_numpy_1d_data(self):
988
+ """Create test 1D numpy array for schema inference testing."""
989
+ return np.array([1, 2, 3, 4, 5])
990
+
991
+ def _create_test_numpy_2d_data(self):
992
+ """Create test 2D numpy array for schema inference testing."""
993
+ return np.array([[1, 25], [2, 30], [3, 35]], dtype=np.int64)
994
+
995
+ def _create_table_with_merge_keys(self, table_name: str):
996
+ """Create a table with merge keys for testing MERGE mode"""
997
+ from deltacat.storage.model.schema import Schema, Field
998
+
999
+ # Create schema with merge keys
1000
+ schema = Schema.of(
1001
+ [
1002
+ Field.of(pa.field("id", pa.int64()), is_merge_key=True), # merge key
1003
+ Field.of(pa.field("name", pa.string())),
1004
+ Field.of(pa.field("age", pa.int32())),
1005
+ Field.of(pa.field("city", pa.string())),
1006
+ ]
1007
+ )
1008
+
1009
+ catalog.create_table(
1010
+ table=table_name,
1011
+ namespace=self.test_namespace,
1012
+ schema=schema,
1013
+ inner=self.catalog_properties,
1014
+ )
1015
+
1016
+ return schema
1017
+
1018
+ def _create_table_without_merge_keys(self, table_name: str):
1019
+ """Create a table without merge keys for testing APPEND mode"""
1020
+ # Use schema inference with no merge keys
1021
+ data = self._create_test_pandas_data()
1022
+ catalog.write_to_table(
1023
+ data=data,
1024
+ table=table_name,
1025
+ namespace=self.test_namespace,
1026
+ mode=TableWriteMode.CREATE,
1027
+ inner=self.catalog_properties,
1028
+ )
1029
+
1030
+ # Test TableWriteMode.AUTO
1031
+ def test_write_to_table_auto_create_new_table_pandas(self):
1032
+ """Test AUTO mode creating a new table with pandas data"""
1033
+ table_name = "test_auto_create_pandas"
1034
+ data = self._create_test_pandas_data()
1035
+
1036
+ # Table doesn't exist, AUTO should create it
1037
+ catalog.write_to_table(
1038
+ data=data,
1039
+ table=table_name,
1040
+ namespace=self.test_namespace,
1041
+ mode=TableWriteMode.AUTO,
1042
+ inner=self.catalog_properties,
1043
+ )
1044
+
1045
+ # Verify table was created
1046
+ assert catalog.table_exists(
1047
+ table=table_name,
1048
+ namespace=self.test_namespace,
1049
+ inner=self.catalog_properties,
1050
+ )
1051
+
1052
+ # Verify table has correct schema
1053
+ table_def = catalog.get_table(
1054
+ table=table_name,
1055
+ namespace=self.test_namespace,
1056
+ inner=self.catalog_properties,
1057
+ )
1058
+ assert table_def.table_version.schema is not None
1059
+
1060
+ def test_write_to_table_auto_create_new_table_pyarrow(self):
1061
+ """Test AUTO mode creating a new table with PyArrow data"""
1062
+ table_name = "test_auto_create_pyarrow"
1063
+ data = self._create_test_pyarrow_data()
1064
+
1065
+ catalog.write_to_table(
1066
+ data=data,
1067
+ table=table_name,
1068
+ namespace=self.test_namespace,
1069
+ mode=TableWriteMode.AUTO,
1070
+ inner=self.catalog_properties,
1071
+ )
1072
+
1073
+ assert catalog.table_exists(
1074
+ table=table_name,
1075
+ namespace=self.test_namespace,
1076
+ inner=self.catalog_properties,
1077
+ )
1078
+
1079
+ def test_write_to_table_auto_create_new_table_polars(self):
1080
+ """Test AUTO mode creating a new table with Polars data"""
1081
+ table_name = "test_auto_create_polars"
1082
+ data = self._create_test_polars_data()
1083
+
1084
+ catalog.write_to_table(
1085
+ data=data,
1086
+ table=table_name,
1087
+ namespace=self.test_namespace,
1088
+ mode=TableWriteMode.AUTO,
1089
+ inner=self.catalog_properties,
1090
+ )
1091
+
1092
+ assert catalog.table_exists(
1093
+ table=table_name,
1094
+ namespace=self.test_namespace,
1095
+ inner=self.catalog_properties,
1096
+ )
1097
+
1098
+ def test_write_to_table_auto_append_existing_table(self):
1099
+ """Test AUTO mode appending to existing table"""
1100
+ table_name = "test_auto_append"
1101
+ data1 = self._create_test_pandas_data()
1102
+ data2 = self._create_second_batch_pandas_data()
1103
+
1104
+ # First write creates table
1105
+ catalog.write_to_table(
1106
+ data=data1,
1107
+ table=table_name,
1108
+ namespace=self.test_namespace,
1109
+ mode=TableWriteMode.AUTO,
1110
+ inner=self.catalog_properties,
1111
+ )
1112
+
1113
+ # Second write should append
1114
+ catalog.write_to_table(
1115
+ data=data2,
1116
+ table=table_name,
1117
+ namespace=self.test_namespace,
1118
+ mode=TableWriteMode.AUTO,
1119
+ inner=self.catalog_properties,
1120
+ )
1121
+
1122
+ # Verify table still exists
1123
+ assert catalog.table_exists(
1124
+ table=table_name,
1125
+ namespace=self.test_namespace,
1126
+ inner=self.catalog_properties,
1127
+ )
1128
+
1129
+ # Test TableWriteMode.CREATE
1130
+ def test_write_to_table_create_new_table(self):
1131
+ """Test CREATE mode with new table"""
1132
+ table_name = "test_create_new"
1133
+ data = self._create_test_pandas_data()
1134
+
1135
+ catalog.write_to_table(
1136
+ data=data,
1137
+ table=table_name,
1138
+ namespace=self.test_namespace,
1139
+ mode=TableWriteMode.CREATE,
1140
+ inner=self.catalog_properties,
1141
+ )
1142
+
1143
+ assert catalog.table_exists(
1144
+ table=table_name,
1145
+ namespace=self.test_namespace,
1146
+ inner=self.catalog_properties,
1147
+ )
1148
+
1149
+ def test_write_to_table_create_existing_table_fails(self):
1150
+ """Test CREATE mode fails when table exists"""
1151
+ table_name = "test_create_fail"
1152
+ data = self._create_test_pandas_data()
1153
+
1154
+ # Create table first
1155
+ catalog.write_to_table(
1156
+ data=data,
1157
+ table=table_name,
1158
+ namespace=self.test_namespace,
1159
+ mode=TableWriteMode.CREATE,
1160
+ inner=self.catalog_properties,
1161
+ )
1162
+
1163
+ # Try to create again should fail
1164
+ with pytest.raises(
1165
+ TableAlreadyExistsError, match="already exists and mode is CREATE"
1166
+ ):
1167
+ catalog.write_to_table(
1168
+ data=data,
1169
+ table=table_name,
1170
+ namespace=self.test_namespace,
1171
+ mode=TableWriteMode.CREATE,
1172
+ inner=self.catalog_properties,
1173
+ )
1174
+
1175
+ # Test TableWriteMode.APPEND
1176
+ def test_write_to_table_append_existing_table(self):
1177
+ """Test APPEND mode with existing table"""
1178
+ table_name = "test_append_existing"
1179
+ data1 = self._create_test_pandas_data()
1180
+ data2 = self._create_second_batch_pandas_data()
1181
+
1182
+ # Create table first
1183
+ catalog.write_to_table(
1184
+ data=data1,
1185
+ table=table_name,
1186
+ namespace=self.test_namespace,
1187
+ mode=TableWriteMode.CREATE,
1188
+ inner=self.catalog_properties,
1189
+ )
1190
+
1191
+ # Append to existing table
1192
+ catalog.write_to_table(
1193
+ data=data2,
1194
+ table=table_name,
1195
+ namespace=self.test_namespace,
1196
+ mode=TableWriteMode.APPEND,
1197
+ inner=self.catalog_properties,
1198
+ )
1199
+
1200
+ def test_write_to_table_append_nonexistent_table_fails(self):
1201
+ """Test APPEND mode fails when table doesn't exist"""
1202
+ table_name = "test_append_fail"
1203
+ data = self._create_test_pandas_data()
1204
+
1205
+ with pytest.raises(
1206
+ TableNotFoundError,
1207
+ match="does not exist and write mode is append. Use CREATE or AUTO mode",
1208
+ ):
1209
+ catalog.write_to_table(
1210
+ data=data,
1211
+ table=table_name,
1212
+ namespace=self.test_namespace,
1213
+ mode=TableWriteMode.APPEND,
1214
+ inner=self.catalog_properties,
1215
+ )
1216
+
1217
+ def test_write_to_table_append_with_merge_keys_fails(self):
1218
+ """Test APPEND mode fails when table has merge keys"""
1219
+ table_name = "test_append_with_merge_keys"
1220
+
1221
+ # Create a table with merge keys
1222
+ self._create_table_with_merge_keys(table_name)
1223
+
1224
+ # Create test data that matches the schema
1225
+ data = pd.DataFrame(
1226
+ {
1227
+ "id": [1, 2, 3],
1228
+ "name": ["Alice", "Bob", "Charlie"],
1229
+ "age": [25, 30, 35],
1230
+ "city": ["NYC", "LA", "Chicago"],
1231
+ }
1232
+ )
1233
+
1234
+ # APPEND mode should fail since table has merge keys
1235
+ with pytest.raises(
1236
+ SchemaValidationError,
1237
+ match="APPEND mode cannot be used with tables that have merge keys",
1238
+ ):
1239
+ catalog.write_to_table(
1240
+ data=data,
1241
+ table=table_name,
1242
+ namespace=self.test_namespace,
1243
+ mode=TableWriteMode.APPEND,
1244
+ inner=self.catalog_properties,
1245
+ )
1246
+
1247
+ def test_write_to_table_append_without_merge_keys_succeeds(self):
1248
+ """Test APPEND mode works when table has no merge keys"""
1249
+ table_name = "test_append_no_merge_keys"
1250
+
1251
+ # Create a table without merge keys
1252
+ self._create_table_without_merge_keys(table_name)
1253
+
1254
+ # Add more data to the table
1255
+ data2 = self._create_second_batch_pandas_data()
1256
+
1257
+ # APPEND mode should work since table has no merge keys
1258
+ catalog.write_to_table(
1259
+ data=data2,
1260
+ table=table_name,
1261
+ namespace=self.test_namespace,
1262
+ mode=TableWriteMode.APPEND,
1263
+ inner=self.catalog_properties,
1264
+ )
1265
+
1266
+ # Table should still exist
1267
+ assert catalog.table_exists(
1268
+ table=table_name,
1269
+ namespace=self.test_namespace,
1270
+ inner=self.catalog_properties,
1271
+ )
1272
+
1273
+ # Test explicit schema specification
1274
+ def test_write_to_table_explicit_schema(self):
1275
+ """Test writing with explicit schema specification"""
1276
+ table_name = "test_explicit_schema"
1277
+ data = self._create_test_pandas_data()
1278
+
1279
+ # Define explicit schema with COERCE consistency types to preserve exact types
1280
+ explicit_schema = Schema.of(
1281
+ schema=[
1282
+ Field.of(
1283
+ pa.field("id", pa.int64()),
1284
+ consistency_type=SchemaConsistencyType.COERCE,
1285
+ ),
1286
+ Field.of(
1287
+ pa.field("name", pa.string()),
1288
+ consistency_type=SchemaConsistencyType.COERCE,
1289
+ ),
1290
+ Field.of(
1291
+ pa.field("age", pa.int32()),
1292
+ consistency_type=SchemaConsistencyType.COERCE,
1293
+ ), # Different from inferred schema
1294
+ Field.of(
1295
+ pa.field("city", pa.string()),
1296
+ consistency_type=SchemaConsistencyType.COERCE,
1297
+ ),
1298
+ ]
1299
+ )
1300
+
1301
+ catalog.write_to_table(
1302
+ data=data,
1303
+ table=table_name,
1304
+ namespace=self.test_namespace,
1305
+ mode=TableWriteMode.CREATE,
1306
+ schema=explicit_schema,
1307
+ inner=self.catalog_properties,
1308
+ )
1309
+
1310
+ # Verify schema was used
1311
+ table_def = catalog.get_table(
1312
+ table=table_name,
1313
+ namespace=self.test_namespace,
1314
+ inner=self.catalog_properties,
1315
+ )
1316
+ assert table_def.table_version.schema.equivalent_to(explicit_schema)
1317
+
1318
+ def test_write_to_table_explicit_schema_none(self):
1319
+ """Test writing with explicit schema=None to create schemaless table"""
1320
+ table_name = "test_explicit_schema_none"
1321
+ data = self._create_test_pandas_data()
1322
+
1323
+ catalog.write_to_table(
1324
+ data=data,
1325
+ table=table_name,
1326
+ namespace=self.test_namespace,
1327
+ mode=TableWriteMode.CREATE,
1328
+ schema=None, # Explicitly set schema=None
1329
+ inner=self.catalog_properties,
1330
+ )
1331
+
1332
+ # Verify table was created with schema=None (schemaless)
1333
+ table_def = catalog.get_table(
1334
+ table=table_name,
1335
+ namespace=self.test_namespace,
1336
+ inner=self.catalog_properties,
1337
+ )
1338
+
1339
+ # The table should exist but have a None/empty schema
1340
+ assert table_def is not None
1341
+ # Note: The exact behavior of schemaless tables may vary by storage implementation
1342
+ # We're mainly testing that the create_table call succeeded with schema=None
1343
+
1344
+ def test_schema_behavior_comparison(self):
1345
+ """Test that demonstrates the difference between no schema vs explicit schema=None"""
1346
+ data = self._create_test_pandas_data()
1347
+
1348
+ # Case 1: No schema argument - should infer schema
1349
+ table_name_inferred = "test_schema_inferred"
1350
+ catalog.write_to_table(
1351
+ data=data,
1352
+ table=table_name_inferred,
1353
+ namespace=self.test_namespace,
1354
+ mode=TableWriteMode.CREATE,
1355
+ # No schema argument provided - should infer from data
1356
+ inner=self.catalog_properties,
1357
+ )
1358
+
1359
+ # Case 2: Explicit schema=None - should create schemaless table
1360
+ table_name_schemaless = "test_schema_none"
1361
+ catalog.write_to_table(
1362
+ data=data,
1363
+ table=table_name_schemaless,
1364
+ namespace=self.test_namespace,
1365
+ mode=TableWriteMode.CREATE,
1366
+ schema=None, # Explicitly set schema=None
1367
+ inner=self.catalog_properties,
1368
+ )
1369
+
1370
+ # Verify both tables were created
1371
+ table_inferred = catalog.get_table(
1372
+ table=table_name_inferred,
1373
+ namespace=self.test_namespace,
1374
+ inner=self.catalog_properties,
1375
+ )
1376
+
1377
+ table_schemaless = catalog.get_table(
1378
+ table=table_name_schemaless,
1379
+ namespace=self.test_namespace,
1380
+ inner=self.catalog_properties,
1381
+ )
1382
+
1383
+ # Both tables should exist
1384
+ assert table_inferred is not None
1385
+ assert table_schemaless is not None
1386
+
1387
+ # The inferred schema table should have a schema with the expected columns
1388
+ inferred_schema = table_inferred.table_version.schema.arrow
1389
+ assert "id" in inferred_schema.names
1390
+ assert "name" in inferred_schema.names
1391
+ assert "age" in inferred_schema.names
1392
+ assert "city" in inferred_schema.names
1393
+
1394
+ # Test schema inference from different data types
1395
+ def test_schema_inference_pandas(self):
1396
+ """Test schema inference from pandas DataFrame"""
1397
+ table_name = "test_schema_inference_pandas"
1398
+ data = pd.DataFrame(
1399
+ {
1400
+ "int_col": [1, 2, 3],
1401
+ "float_col": [1.1, 2.2, 3.3],
1402
+ "str_col": ["a", "b", "c"],
1403
+ "bool_col": [True, False, True],
1404
+ }
1405
+ )
1406
+
1407
+ catalog.write_to_table(
1408
+ data=data,
1409
+ table=table_name,
1410
+ namespace=self.test_namespace,
1411
+ mode=TableWriteMode.CREATE,
1412
+ inner=self.catalog_properties,
1413
+ )
1414
+
1415
+ table_def = catalog.get_table(
1416
+ table=table_name,
1417
+ namespace=self.test_namespace,
1418
+ inner=self.catalog_properties,
1419
+ )
1420
+
1421
+ schema = table_def.table_version.schema.arrow
1422
+ assert "int_col" in schema.names
1423
+ assert "float_col" in schema.names
1424
+ assert "str_col" in schema.names
1425
+ assert "bool_col" in schema.names
1426
+
1427
+ def test_schema_inference_pyarrow(self):
1428
+ """Test schema inference from PyArrow Table"""
1429
+ table_name = "test_schema_inference_pyarrow"
1430
+ data = pa.table(
1431
+ {
1432
+ "int64_col": pa.array([1, 2, 3], type=pa.int64()),
1433
+ "string_col": pa.array(["x", "y", "z"], type=pa.string()),
1434
+ "double_col": pa.array([1.1, 2.2, 3.3], type=pa.float64()),
1435
+ }
1436
+ )
1437
+
1438
+ catalog.write_to_table(
1439
+ data=data,
1440
+ table=table_name,
1441
+ namespace=self.test_namespace,
1442
+ mode=TableWriteMode.CREATE,
1443
+ inner=self.catalog_properties,
1444
+ )
1445
+
1446
+ table_def = catalog.get_table(
1447
+ table=table_name,
1448
+ namespace=self.test_namespace,
1449
+ inner=self.catalog_properties,
1450
+ )
1451
+
1452
+ schema = table_def.table_version.schema.arrow
1453
+ assert schema.field("int64_col").type == pa.int64()
1454
+ assert schema.field("string_col").type == pa.string()
1455
+ assert schema.field("double_col").type == pa.float64()
1456
+
1457
+ def test_schema_inference_polars(self):
1458
+ """Test schema inference from Polars DataFrame"""
1459
+ table_name = "test_schema_inference_polars"
1460
+ data = pl.DataFrame(
1461
+ {
1462
+ "int_col": [1, 2, 3],
1463
+ "str_col": ["a", "b", "c"],
1464
+ "float_col": [1.1, 2.2, 3.3],
1465
+ }
1466
+ )
1467
+
1468
+ catalog.write_to_table(
1469
+ data=data,
1470
+ table=table_name,
1471
+ namespace=self.test_namespace,
1472
+ mode=TableWriteMode.CREATE,
1473
+ inner=self.catalog_properties,
1474
+ )
1475
+
1476
+ table_def = catalog.get_table(
1477
+ table=table_name,
1478
+ namespace=self.test_namespace,
1479
+ inner=self.catalog_properties,
1480
+ )
1481
+
1482
+ schema = table_def.table_version.schema.arrow
1483
+ assert "int_col" in schema.names
1484
+ assert "str_col" in schema.names
1485
+ assert "float_col" in schema.names
1486
+
1487
+ def test_schema_inference_ray_dataset(self):
1488
+ """Test schema inference from Ray Dataset"""
1489
+ table_name = "test_schema_inference_ray"
1490
+ ray_data = self._create_test_ray_data()
1491
+
1492
+ catalog.write_to_table(
1493
+ data=ray_data,
1494
+ table=table_name,
1495
+ namespace=self.test_namespace,
1496
+ mode=TableWriteMode.CREATE,
1497
+ inner=self.catalog_properties,
1498
+ )
1499
+
1500
+ table_def = catalog.get_table(
1501
+ table=table_name,
1502
+ namespace=self.test_namespace,
1503
+ inner=self.catalog_properties,
1504
+ )
1505
+
1506
+ schema = table_def.table_version.schema.arrow
1507
+ assert "id" in schema.names
1508
+ assert "name" in schema.names
1509
+ assert "age" in schema.names
1510
+ assert "city" in schema.names
1511
+
1512
+ def test_schema_inference_daft_dataframe(self):
1513
+ """Test schema inference from Daft DataFrame"""
1514
+ table_name = "test_schema_inference_daft"
1515
+ data = self._create_test_daft_data()
1516
+
1517
+ catalog.write_to_table(
1518
+ data=data,
1519
+ table=table_name,
1520
+ namespace=self.test_namespace,
1521
+ mode=TableWriteMode.CREATE,
1522
+ inner=self.catalog_properties,
1523
+ )
1524
+
1525
+ table_def = catalog.get_table(
1526
+ table=table_name,
1527
+ namespace=self.test_namespace,
1528
+ inner=self.catalog_properties,
1529
+ )
1530
+
1531
+ schema = table_def.table_version.schema.arrow
1532
+ assert "id" in schema.names
1533
+ assert "name" in schema.names
1534
+ assert "age" in schema.names
1535
+ assert "city" in schema.names
1536
+
1537
+ def test_schema_inference_numpy_1d(self):
1538
+ """Test schema inference from 1D numpy array"""
1539
+ table_name = "test_schema_inference_numpy_1d"
1540
+ data = self._create_test_numpy_1d_data()
1541
+
1542
+ catalog.write_to_table(
1543
+ data=data,
1544
+ table=table_name,
1545
+ namespace=self.test_namespace,
1546
+ mode=TableWriteMode.CREATE,
1547
+ inner=self.catalog_properties,
1548
+ )
1549
+
1550
+ table_def = catalog.get_table(
1551
+ table=table_name,
1552
+ namespace=self.test_namespace,
1553
+ inner=self.catalog_properties,
1554
+ )
1555
+
1556
+ schema = table_def.table_version.schema.arrow
1557
+ assert (
1558
+ "0" in schema.names
1559
+ ) # pandas converts 1D numpy array with column name "0"
1560
+ assert len(schema.names) == 1
1561
+
1562
+ def test_schema_inference_numpy_2d(self):
1563
+ """Test schema inference from 2D numpy array"""
1564
+ table_name = "test_schema_inference_numpy_2d"
1565
+ data = self._create_test_numpy_2d_data()
1566
+
1567
+ catalog.write_to_table(
1568
+ data=data,
1569
+ table=table_name,
1570
+ namespace=self.test_namespace,
1571
+ mode=TableWriteMode.CREATE,
1572
+ inner=self.catalog_properties,
1573
+ )
1574
+
1575
+ table_def = catalog.get_table(
1576
+ table=table_name,
1577
+ namespace=self.test_namespace,
1578
+ inner=self.catalog_properties,
1579
+ )
1580
+
1581
+ schema = table_def.table_version.schema.arrow
1582
+ assert (
1583
+ "0" in schema.names
1584
+ ) # pandas converts 2D numpy array with column names "0", "1"
1585
+ assert "1" in schema.names
1586
+ assert len(schema.names) == 2
1587
+
1588
+ def test_numpy_3d_array_error(self):
1589
+ """Test that 3D numpy arrays raise an error"""
1590
+ table_name = "test_numpy_3d_error"
1591
+ data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # 3D array
1592
+
1593
+ with pytest.raises(
1594
+ ValueError, match="NumPy arrays with 3 dimensions are not supported"
1595
+ ):
1596
+ catalog.write_to_table(
1597
+ data=data,
1598
+ table=table_name,
1599
+ namespace=self.test_namespace,
1600
+ mode=TableWriteMode.CREATE,
1601
+ inner=self.catalog_properties,
1602
+ )
1603
+
1604
+ # Test different content types
1605
+ def test_write_to_table_different_content_types(self):
1606
+ """Test writing with different content types"""
1607
+ data = self._create_test_pandas_data()
1608
+
1609
+ content_types = [
1610
+ ContentType.PARQUET,
1611
+ ContentType.CSV,
1612
+ ContentType.JSON,
1613
+ ]
1614
+
1615
+ for i, content_type in enumerate(content_types):
1616
+ table_name = f"test_content_type_{content_type.value}_{i}"
1617
+
1618
+ catalog.write_to_table(
1619
+ data=data,
1620
+ table=table_name,
1621
+ namespace=self.test_namespace,
1622
+ mode=TableWriteMode.CREATE,
1623
+ content_type=content_type,
1624
+ inner=self.catalog_properties,
1625
+ schema=None,
1626
+ )
1627
+
1628
+ assert catalog.table_exists(
1629
+ table=table_name,
1630
+ namespace=self.test_namespace,
1631
+ inner=self.catalog_properties,
1632
+ )
1633
+
1634
+ # Test table creation parameters
1635
+ def test_write_to_table_with_table_properties(self):
1636
+ """Test writing with table creation parameters"""
1637
+ table_name = "test_table_properties"
1638
+ data = self._create_test_pandas_data()
1639
+
1640
+ catalog.write_to_table(
1641
+ data=data,
1642
+ table=table_name,
1643
+ namespace=self.test_namespace,
1644
+ mode=TableWriteMode.CREATE,
1645
+ table_description="Test table with properties",
1646
+ lifecycle_state=LifecycleState.ACTIVE,
1647
+ inner=self.catalog_properties,
1648
+ )
1649
+
1650
+ table_def = catalog.get_table(
1651
+ table=table_name,
1652
+ namespace=self.test_namespace,
1653
+ inner=self.catalog_properties,
1654
+ )
1655
+
1656
+ assert table_def.table.description == "Test table with properties"
1657
+ # Note: lifecycle_state defaults to ACTIVE in create_table, but may be overridden
1658
+ # We'll accept either ACTIVE or CREATED as both are valid for our test purpose
1659
+ assert table_def.table_version.state in [
1660
+ LifecycleState.ACTIVE,
1661
+ LifecycleState.CREATED,
1662
+ ]
1663
+
1664
+ # Test error conditions
1665
+ def test_write_to_table_unsupported_data_type(self):
1666
+ """Test error when data type cannot be inferred"""
1667
+ table_name = "test_unsupported_data"
1668
+
1669
+ # Use a plain dict which doesn't have schema inference
1670
+ unsupported_data = {"key": "value"}
1671
+
1672
+ with pytest.raises(
1673
+ ValueError, match="No schema inference function found for table type"
1674
+ ):
1675
+ catalog.write_to_table(
1676
+ data=unsupported_data,
1677
+ table=table_name,
1678
+ namespace=self.test_namespace,
1679
+ mode=TableWriteMode.CREATE,
1680
+ inner=self.catalog_properties,
1681
+ )
1682
+
1683
+ def test_write_to_table_replace_mode(self):
1684
+ """Test REPLACE mode creating a new stream to replace existing data"""
1685
+ table_name = "test_replace_mode"
1686
+ data1 = self._create_test_pandas_data()
1687
+ data2 = self._create_second_batch_pandas_data()
1688
+
1689
+ # First, create the table
1690
+ catalog.write_to_table(
1691
+ data=data1,
1692
+ table=table_name,
1693
+ namespace=self.test_namespace,
1694
+ mode=TableWriteMode.CREATE,
1695
+ inner=self.catalog_properties,
1696
+ )
1697
+
1698
+ # Verify table exists
1699
+ assert catalog.table_exists(
1700
+ table=table_name,
1701
+ namespace=self.test_namespace,
1702
+ inner=self.catalog_properties,
1703
+ )
1704
+
1705
+ # Now use REPLACE mode to replace all existing data
1706
+ catalog.write_to_table(
1707
+ data=data2,
1708
+ table=table_name,
1709
+ namespace=self.test_namespace,
1710
+ mode=TableWriteMode.REPLACE,
1711
+ inner=self.catalog_properties,
1712
+ )
1713
+
1714
+ # Table should still exist
1715
+ assert catalog.table_exists(
1716
+ table=table_name,
1717
+ namespace=self.test_namespace,
1718
+ inner=self.catalog_properties,
1719
+ )
1720
+
1721
+ def test_write_to_table_merge_mode_with_merge_keys(self):
1722
+ """Test MERGE mode works when table has merge keys"""
1723
+ table_name = "test_merge_mode_with_keys"
1724
+
1725
+ # Create a table with merge keys
1726
+ self._create_table_with_merge_keys(table_name)
1727
+
1728
+ # Create test data that matches the schema
1729
+ data = pd.DataFrame(
1730
+ {
1731
+ "id": [1, 2, 3],
1732
+ "name": ["Alice", "Bob", "Charlie"],
1733
+ "age": [25, 30, 35],
1734
+ "city": ["NYC", "LA", "Chicago"],
1735
+ }
1736
+ )
1737
+
1738
+ # MERGE mode should work since table has merge keys
1739
+ catalog.write_to_table(
1740
+ data=data,
1741
+ table=table_name,
1742
+ namespace=self.test_namespace,
1743
+ mode=TableWriteMode.MERGE,
1744
+ inner=self.catalog_properties,
1745
+ )
1746
+
1747
+ # Table should still exist
1748
+ assert catalog.table_exists(
1749
+ table=table_name,
1750
+ namespace=self.test_namespace,
1751
+ inner=self.catalog_properties,
1752
+ )
1753
+
1754
+ def test_write_to_table_merge_mode_without_merge_keys_fails(self):
1755
+ """Test MERGE mode fails when table has no merge keys"""
1756
+ table_name = "test_merge_mode_no_keys"
1757
+
1758
+ # Create a table without merge keys
1759
+ self._create_table_without_merge_keys(table_name)
1760
+
1761
+ data = self._create_test_pandas_data()
1762
+
1763
+ # MERGE mode should fail since table has no merge keys
1764
+ with pytest.raises(
1765
+ TableValidationError,
1766
+ match="MERGE mode requires tables to have at least one merge key",
1767
+ ):
1768
+ catalog.write_to_table(
1769
+ data=data,
1770
+ table=table_name,
1771
+ namespace=self.test_namespace,
1772
+ mode=TableWriteMode.MERGE,
1773
+ inner=self.catalog_properties,
1774
+ )
1775
+
1776
+ # Test default namespace behavior
1777
+ def test_write_to_table_default_namespace(self):
1778
+ """Test writing to table using default namespace"""
1779
+ table_name = "test_default_namespace"
1780
+ data = self._create_test_pandas_data()
1781
+
1782
+ # Don't specify namespace, should use default
1783
+ catalog.write_to_table(
1784
+ data=data,
1785
+ table=table_name,
1786
+ mode=TableWriteMode.CREATE,
1787
+ inner=self.catalog_properties,
1788
+ )
1789
+
1790
+ # Should be able to find table in default namespace
1791
+ default_ns = catalog.default_namespace(inner=self.catalog_properties)
1792
+ assert catalog.table_exists(
1793
+ table=table_name, namespace=default_ns, inner=self.catalog_properties
1794
+ )
1795
+
1796
+ def test_write_to_table_append_creates_separate_deltas(self):
1797
+ """Test that APPEND mode creates separate deltas in the same partition"""
1798
+ from deltacat.catalog.main.impl import _get_storage
1799
+
1800
+ table_name = "test_append_separate_deltas"
1801
+ data1 = self._create_test_pandas_data()
1802
+ data2 = self._create_second_batch_pandas_data()
1803
+
1804
+ # Create table with first batch
1805
+ catalog.write_to_table(
1806
+ data=data1,
1807
+ table=table_name,
1808
+ namespace=self.test_namespace,
1809
+ mode=TableWriteMode.CREATE,
1810
+ inner=self.catalog_properties,
1811
+ )
1812
+
1813
+ # Get the table definition to access stream information
1814
+ table_def = catalog.get_table(
1815
+ table=table_name,
1816
+ namespace=self.test_namespace,
1817
+ inner=self.catalog_properties,
1818
+ )
1819
+
1820
+ # Get storage interface
1821
+ storage = _get_storage(inner=self.catalog_properties)
1822
+
1823
+ # Get the stream
1824
+ stream = storage.get_stream(
1825
+ namespace=self.test_namespace,
1826
+ table_name=table_name,
1827
+ table_version=table_def.table_version.table_version,
1828
+ inner=self.catalog_properties,
1829
+ )
1830
+
1831
+ # Get the partition (should be only one for unpartitioned table)
1832
+ partition = storage.get_partition(
1833
+ stream_locator=stream.locator,
1834
+ partition_values=None, # unpartitioned
1835
+ inner=self.catalog_properties,
1836
+ )
1837
+
1838
+ # List deltas before second write
1839
+ deltas_before = storage.list_partition_deltas(
1840
+ partition_like=partition,
1841
+ inner=self.catalog_properties,
1842
+ ).all_items()
1843
+
1844
+ assert (
1845
+ len(deltas_before) == 1
1846
+ ), f"Expected 1 delta before append, got {len(deltas_before)}"
1847
+
1848
+ # Append second batch using APPEND mode
1849
+ catalog.write_to_table(
1850
+ data=data2,
1851
+ table=table_name,
1852
+ namespace=self.test_namespace,
1853
+ mode=TableWriteMode.APPEND,
1854
+ inner=self.catalog_properties,
1855
+ )
1856
+
1857
+ # Get the same partition again (should be the same partition object)
1858
+ partition_after = storage.get_partition(
1859
+ stream_locator=stream.locator,
1860
+ partition_values=None, # unpartitioned
1861
+ inner=self.catalog_properties,
1862
+ )
1863
+
1864
+ # Verify it's the same partition
1865
+ assert (
1866
+ partition.partition_id == partition_after.partition_id
1867
+ ), "APPEND should reuse the same partition"
1868
+
1869
+ # List deltas after second write
1870
+ deltas_after = storage.list_partition_deltas(
1871
+ partition_like=partition_after,
1872
+ inner=self.catalog_properties,
1873
+ ).all_items()
1874
+
1875
+ # Should now have 2 deltas in the same partition
1876
+ assert (
1877
+ len(deltas_after) == 2
1878
+ ), f"Expected 2 deltas after append, got {len(deltas_after)}"
1879
+
1880
+ # Verify deltas have different stream positions
1881
+ stream_positions = [delta.stream_position for delta in deltas_after]
1882
+ assert (
1883
+ len(set(stream_positions)) == 2
1884
+ ), "Deltas should have different stream positions"
1885
+ assert min(stream_positions) == 1, "First delta should have stream position 1"
1886
+ assert max(stream_positions) == 2, "Second delta should have stream position 2"
1887
+
1888
+ def test_write_to_table_partitioned_table_raises_not_implemented(self):
1889
+ """Test that write_to_table raises NotImplementedError for partitioned tables"""
1890
+ from deltacat.storage.model.partition import (
1891
+ PartitionScheme,
1892
+ PartitionKey,
1893
+ PartitionKeyList,
1894
+ )
1895
+ from deltacat.storage.model.transform import IdentityTransform
1896
+
1897
+ table_name = "test_partitioned_table"
1898
+ data = self._create_test_pandas_data()
1899
+
1900
+ # Create a partition scheme with partition keys
1901
+ partition_keys = [
1902
+ PartitionKey.of(
1903
+ key=["city"],
1904
+ name="city_partition",
1905
+ transform=IdentityTransform.of(),
1906
+ )
1907
+ ]
1908
+ partition_scheme = PartitionScheme.of(
1909
+ keys=PartitionKeyList.of(partition_keys),
1910
+ name="test_partition_scheme",
1911
+ scheme_id="test_partition_scheme_id",
1912
+ )
1913
+
1914
+ # Try to create a partitioned table using write_to_table
1915
+ with pytest.raises(
1916
+ NotImplementedError,
1917
+ match="write_to_table does not yet support partitioned tables",
1918
+ ):
1919
+ catalog.write_to_table(
1920
+ data=data,
1921
+ table=table_name,
1922
+ namespace=self.test_namespace,
1923
+ mode=TableWriteMode.CREATE,
1924
+ partition_scheme=partition_scheme, # This makes it partitioned
1925
+ inner=self.catalog_properties,
1926
+ )
1927
+
1928
+ def test_write_to_table_sorted_table_raises_not_implemented(self):
1929
+ """Test that write_to_table raises NotImplementedError for tables with sort keys"""
1930
+ from deltacat.storage.model.sort_key import SortScheme, SortKey, SortKeyList
1931
+ from deltacat.storage.model.types import SortOrder, NullOrder
1932
+
1933
+ table_name = "test_sorted_table"
1934
+ data = self._create_test_pandas_data()
1935
+
1936
+ # Create sort scheme with sort keys
1937
+ sort_scheme = SortScheme.of(
1938
+ keys=SortKeyList.of(
1939
+ [
1940
+ SortKey.of(
1941
+ key=["id"],
1942
+ sort_order=SortOrder.ASCENDING,
1943
+ null_order=NullOrder.AT_END,
1944
+ )
1945
+ ]
1946
+ ),
1947
+ name="test_sort_scheme",
1948
+ scheme_id="test_sort_scheme_id",
1949
+ )
1950
+
1951
+ # Create table with sort keys
1952
+ catalog.create_table(
1953
+ table=table_name,
1954
+ namespace=self.test_namespace,
1955
+ sort_keys=sort_scheme,
1956
+ inner=self.catalog_properties,
1957
+ )
1958
+
1959
+ # Attempt to write to the sorted table should raise NotImplementedError
1960
+ with pytest.raises(NotImplementedError) as exc_info:
1961
+ catalog.write_to_table(
1962
+ data=data,
1963
+ table=table_name,
1964
+ namespace=self.test_namespace,
1965
+ mode=TableWriteMode.APPEND,
1966
+ inner=self.catalog_properties,
1967
+ )
1968
+
1969
+ # Verify the error message contains expected information
1970
+ assert "sort keys" in str(exc_info.value)
1971
+ assert "sort scheme with 1 sort key(s)" in str(exc_info.value)
1972
+ assert "id" in str(exc_info.value)