deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
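The hunks below show the largest visible rewrite in this release, deltacat/tests/catalog/main/test_catalog_impl_table_operations.py (entry 115 above, +1606 -70): the table-operations tests for the main catalog implementation, updated for the renamed create/alter/drop keyword arguments, the new SchemaUpdate API, and the new write_to_table entry point.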
@@ -3,18 +3,29 @@ import tempfile
 
 import pytest
 import pyarrow as pa
+import pandas as pd
+import polars as pl
+import numpy as np
+import ray.data as rd
+import daft
 
 import deltacat.catalog.main.impl as catalog
 from deltacat.catalog import get_catalog_properties
-from deltacat.storage.model.schema import Schema
+from deltacat.storage.model.schema import (
+    Schema,
+    Field,
+)
+from deltacat.storage.model.types import SchemaConsistencyType
 from deltacat.storage.model.sort_key import SortKey, SortScheme, SortOrder, NullOrder
-from deltacat.storage.model.table import TableProperties
-from deltacat.storage.model.namespace import NamespaceProperties
 from deltacat.storage.model.types import LifecycleState
 from deltacat.exceptions import (
     TableAlreadyExistsError,
     TableNotFoundError,
+    TableValidationError,
+    SchemaValidationError,
 )
+from deltacat.types.tables import TableWriteMode, TableProperty, SchemaEvolutionMode
+from deltacat.types.media import ContentType
 
 
 @pytest.fixture(scope="class")
@@ -69,6 +80,24 @@ def sample_sort_keys():
 
 
 class TestCatalogTableOperations:
+    """Test catalog table operations including table creation, existence checks, etc."""
+
+    @classmethod
+    def setup_class(cls):
+        cls.temp_dir = tempfile.mkdtemp()
+        cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)
+
+        # Create a test namespace
+        cls.test_namespace = "test_write_operations"
+        catalog.create_namespace(
+            namespace=cls.test_namespace,
+            inner=cls.catalog_properties,
+        )
+
+    @classmethod
+    def teardown_class(cls):
+        shutil.rmtree(cls.temp_dir)
+
     def test_create_table(self, test_namespace, sample_arrow_schema, sample_sort_keys):
         """Test creating a table with schema and properties"""
         namespace_name, catalog_properties = test_namespace
@@ -78,20 +107,18 @@ class TestCatalogTableOperations:
         schema = Schema(arrow=sample_arrow_schema)
 
         # Create table properties
-        table_properties = TableProperties(
-            {"owner": "test-user", "department": "engineering"}
-        )
+        table_properties = {"owner": "test-user", "department": "engineering"}
 
         # Create namespace properties
-        namespace_properties = NamespaceProperties({"description": "Test Namespace"})
+        namespace_properties = {"description": "Test Namespace"}
 
         # Create the table
         table_definition = catalog.create_table(
-            name=table_name,
+            table=table_name,
             namespace=namespace_name,
             schema=schema,
             sort_keys=sample_sort_keys,
-            description="Test table for unit tests",
+            table_description="Test table for unit tests",
             table_properties=table_properties,
             namespace_properties=namespace_properties,
             inner=catalog_properties,
@@ -99,7 +126,9 @@
 
         # Verify table was created
         assert catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         table = table_definition.table
@@ -109,7 +138,7 @@
         assert table_version.table_name == table_name
         assert table_version.namespace == namespace_name
         assert table_version.description == "Test table for unit tests"
-        assert table_version.state == LifecycleState.CREATED
+        assert table_version.state == LifecycleState.ACTIVE
         assert table.properties.get("owner") == "test-user"
         assert table.properties.get("department") == "engineering"
         assert table_version.schema.arrow.names == sample_arrow_schema.names
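Taken together, the hunks above capture the 2.0.0b12 rename of the main catalog surface: create_table now takes table= instead of name= and table_description= instead of description=, table and namespace properties are passed as plain dicts instead of TableProperties/NamespaceProperties wrappers, and a freshly created table version reports LifecycleState.ACTIVE rather than LifecycleState.CREATED. A minimal sketch of the new call shape against a throwaway filesystem catalog; the "demo" and "events" names are illustrative, everything else is taken from the imports and calls in this test module:

    import tempfile

    import pyarrow as pa

    import deltacat.catalog.main.impl as catalog
    from deltacat.catalog import get_catalog_properties
    from deltacat.storage.model.schema import Schema

    # Root the catalog in a temp dir, as the tests' setup_class does.
    props = get_catalog_properties(root=tempfile.mkdtemp())
    catalog.create_namespace(namespace="demo", inner=props)

    catalog.create_table(
        table="events",  # 2.0.0b11 spelled this name=
        namespace="demo",
        schema=Schema.of(schema=pa.schema([("id", pa.int64())])),
        table_description="demo table",  # was description=
        table_properties={"owner": "demo-user"},  # plain dict, no wrapper class
        inner=props,
    )
    assert catalog.table_exists("events", namespace="demo", inner=props)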
@@ -123,15 +152,17 @@
 
         # Create the table
         catalog.create_table(
-            name=table_name,
+            table=table_name,
             namespace=namespace_name,
-            description="First creation",
+            table_description="First creation",
             inner=catalog_properties,
         )
 
         # Verify table exists
         assert catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Try to create the same table again, should raise TableAlreadyExistsError
@@ -140,9 +171,9 @@
             match=f"Table {namespace_name}.{table_name} already exists",
         ):
             catalog.create_table(
-                name=table_name,
+                table=table_name,
                 namespace=namespace_name,
-                description="Second creation attempt",
+                table_description="Second creation attempt",
                 inner=catalog_properties,
             )
 
@@ -153,21 +184,23 @@
 
         # Create the table with original description
         catalog.create_table(
-            name=table_name,
+            table=table_name,
             namespace=namespace_name,
-            description="Original description",
+            table_description="Original description",
             inner=catalog_properties,
         )
 
         assert catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            catalog=catalog_properties,
         )
 
         # Create the same table with fail_if_exists=False
         table_definition = catalog.create_table(
-            name=table_name,
+            table=table_name,
             namespace=namespace_name,
-            description="Updated description",
+            table_description="Updated description",
             fail_if_exists=False,
             inner=catalog_properties,
         )
@@ -185,22 +218,30 @@
 
         # Create the table
         catalog.create_table(
-            name=table_name, namespace=namespace_name, inner=catalog_properties
+            table=table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Verify table exists
         assert catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Drop the table
         catalog.drop_table(
-            name=table_name, namespace=namespace_name, inner=catalog_properties
+            table=table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Verify table no longer exists
         assert not catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
     def test_drop_table_not_exists(self, test_namespace):
@@ -209,15 +250,57 @@
 
         # Verify table doesn't exist
         assert not catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Try to drop the table, should raise TableNotFoundError
         with pytest.raises(TableNotFoundError, match=table_name):
             catalog.drop_table(
-                name=table_name, namespace=namespace_name, inner=catalog_properties
+                table=table_name,
+                namespace=namespace_name,
+                inner=catalog_properties,
             )
 
+    def test_rename_namespace(self, test_namespace):
+        namespace_name, catalog_properties = test_namespace
+        original_name = "test_original_table"
+        new_name = "test_renamed_namespace"
+
+        # Create the table with original name
+        catalog.create_table(
+            table=original_name,
+            namespace=namespace_name,
+            table_description="Table to in namespace to be renamed",
+            inner=catalog_properties,
+        )
+
+        # Verify original table exists
+        assert catalog.table_exists(
+            original_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
+        )
+
+        # Rename the namespace
+        catalog.alter_namespace(
+            namespace=namespace_name,
+            new_namespace=new_name,
+            inner=catalog_properties,
+        )
+
+        # Verify new namespace exists and old namespace doesn't
+        assert catalog.namespace_exists(new_name, inner=catalog_properties)
+        assert not catalog.namespace_exists(namespace_name, inner=catalog_properties)
+
+        # Verify we can still discover the table in the new namespace
+        assert catalog.table_exists(
+            original_name,
+            namespace=new_name,
+            inner=catalog_properties,
+        )
+
     def test_rename_table(self, test_namespace):
         namespace_name, catalog_properties = test_namespace
         original_name = "test_original_table"
@@ -225,15 +308,17 @@
 
         # Create the table with original name
         catalog.create_table(
-            name=original_name,
+            table=original_name,
             namespace=namespace_name,
-            description="Table to be renamed",
+            table_description="Table to be renamed",
             inner=catalog_properties,
         )
 
         # Verify original table exists
         assert catalog.table_exists(
-            original_name, namespace=namespace_name, inner=catalog_properties
+            original_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Rename the table
@@ -246,10 +331,14 @@
 
         # Verify new table exists and old table doesn't
         assert catalog.table_exists(
-            new_name, namespace=namespace_name, inner=catalog_properties
+            new_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
         assert not catalog.table_exists(
-            original_name, namespace=namespace_name, inner=catalog_properties
+            original_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
     def test_rename_table_not_exists(self, test_namespace):
@@ -259,7 +348,9 @@
 
         # Verify table doesn't exist
         assert not catalog.table_exists(
-            original_name, namespace=namespace_name, inner=catalog_properties
+            original_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
        )
 
         # Try to rename the table, should raise TableNotFoundError
@@ -278,17 +369,23 @@
 
         # Create a table
         catalog.create_table(
-            name=existing_table, namespace=namespace_name, inner=catalog_properties
+            table=existing_table,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Check existing table
         assert catalog.table_exists(
-            existing_table, namespace=namespace_name, inner=catalog_properties
+            existing_table,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Check non-existing table
         assert not catalog.table_exists(
-            non_existing_table, namespace=namespace_name, inner=catalog_properties
+            non_existing_table,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
     def test_create_table_with_default_namespace(self, catalog_setup):
@@ -297,7 +394,7 @@
 
         # Create table with default namespace
         table_definition = catalog.create_table(
-            name=table_name, inner=catalog_properties
+            table=table_name, inner=catalog_properties
         )
 
         table = table_definition.table
@@ -305,7 +402,9 @@
         default_ns = catalog.default_namespace()
         assert table.namespace == default_ns
         assert catalog.table_exists(
-            table_name, namespace=default_ns, inner=catalog_properties
+            table_name,
+            namespace=default_ns,
+            inner=catalog_properties,
         )
 
     def test_create_table_with_missing_namespace(self, catalog_setup):
@@ -318,11 +417,15 @@
 
         # Try to create table with non-existent namespace
         catalog.create_table(
-            name=table_name, namespace=new_namespace, inner=catalog_properties
+            table=table_name,
+            namespace=new_namespace,
+            inner=catalog_properties,
         )
 
         assert catalog.table_exists(
-            table_name, namespace=new_namespace, inner=catalog_properties
+            table_name,
+            namespace=new_namespace,
+            inner=catalog_properties,
         )
         assert catalog.namespace_exists(new_namespace, inner=catalog_properties)
 
@@ -332,17 +435,15 @@
 
         # Create initial schema and properties
         schema = Schema.of(schema=sample_arrow_schema)
-        initial_properties = TableProperties(
-            {"owner": "original-user", "department": "engineering"}
-        )
+        initial_properties = {"owner": "original-user", "department": "engineering"}
 
         # Create the table with initial properties
         table = catalog.create_table(
-            name=table_name,
+            table=table_name,
             namespace=namespace_name,
             schema=schema,
             sort_keys=sample_sort_keys,
-            description="Initial description",
+            table_description="Initial description",
             table_properties=initial_properties,
             inner=catalog_properties,
         )
@@ -350,39 +451,37 @@
 
         # Verify table was created with initial properties
         assert catalog.table_exists(
-            table_name, namespace=namespace_name, inner=catalog_properties
-        )
-
-        # Create updated schema
-        updated_arrow_schema = pa.schema(
-            [
-                pa.field("count", pa.float64()),  # Added field
-            ]
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
-        new_schema = old_schema.add_subschema(
-            name="updated_schema",
-            schema=updated_arrow_schema,
-        )
+        # Create schema update operations to add a new field
+        new_field = Field.of(pa.field("count", pa.float64(), nullable=True))
+        schema_update = old_schema.update().add_field(new_field)
 
         # Create updated properties
-        updated_properties = TableProperties(
-            {"owner": "new-user", "department": "data-science", "priority": "high"}
-        )
+        updated_properties = {
+            "owner": "new-user",
+            "department": "data-science",
+            "priority": "high",
+        }
 
-        # Alter the table with new properties
+        # Alter the table with new properties and schema updates
         catalog.alter_table(
             table=table_name,
             namespace=namespace_name,
-            schema_updates=new_schema,
-            description="Updated description",
-            properties=updated_properties,
+            schema_updates=schema_update,
+            table_description="Updated description",
+            table_properties=updated_properties,
             inner=catalog_properties,
         )
 
         # Get the updated table definition
         updated_table_def = catalog.get_table(
-            table_name, namespace=namespace_name, inner=catalog_properties
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         updated_table = updated_table_def.table
@@ -390,11 +489,23 @@
 
         # Verify table properties were updated
         assert updated_table_version.description == "Updated description"
-        assert updated_table_version.state == LifecycleState.CREATED
+        assert updated_table_version.state == LifecycleState.ACTIVE
         assert updated_table.properties.get("owner") == "new-user"
         assert updated_table.properties.get("department") == "data-science"
         assert updated_table.properties.get("priority") == "high"
 
+        # Verify schema was updated with new field
+        updated_schema = updated_table_version.schema
+        assert updated_schema.field("count") is not None
+        assert updated_schema.field("count").arrow.type == pa.float64()
+        assert updated_schema.field("count").arrow.nullable is True
+        assert (
+            updated_schema.field("count").id == 3
+        )  # Next sequential ID after id(0), name(1), value(2)
+
+        # Verify schema ID was incremented (proving SchemaUpdate was used)
+        assert updated_schema.id == old_schema.id + 1
+
     def test_alter_table_not_exists(self, test_namespace):
         """Test altering a table that doesn't exist"""
         namespace_name, catalog_properties = test_namespace
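These two hunks swap the old add_subschema flow for the fluent SchemaUpdate builder: Schema.update() returns a builder whose chained add_field calls (and, per the tests further down, remove_field and update_field_type) are applied via alter_table's schema_updates= argument, assigning the next sequential field ID and bumping the schema ID by one. A sketch of the pattern under the same illustrative catalog as the earlier snippets:

    from deltacat.storage.model.schema import Field

    table_def = catalog.get_table("events", namespace="demo", inner=props)
    old_schema = table_def.table_version.schema

    # Chain one or more operations onto the update builder.
    new_field = Field.of(pa.field("count", pa.float64(), nullable=True))
    schema_update = old_schema.update().add_field(new_field)

    catalog.alter_table(
        table="events",
        namespace="demo",
        schema_updates=schema_update,
        inner=props,
    )

    updated = catalog.get_table("events", namespace="demo", inner=props)
    assert updated.table_version.schema.id == old_schema.id + 1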
@@ -402,7 +513,9 @@
 
         # Verify table doesn't exist
         assert not catalog.table_exists(
-            nonexistent_table, namespace=namespace_name, inner=catalog_properties
+            nonexistent_table,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Try to alter the nonexistent table, should raise TableNotFoundError
@@ -410,10 +523,276 @@
             catalog.alter_table(
                 table=nonexistent_table,
                 namespace=namespace_name,
-                description="Updated description",
+                table_description="Updated description",
+                inner=catalog_properties,
+            )
+
+    def test_alter_table_with_multiple_schema_operations(
+        self, test_namespace, sample_arrow_schema
+    ):
+        """Test altering a table with multiple schema update operations."""
+        namespace_name, catalog_properties = test_namespace
+        table_name = "test_alter_table_multiple_ops"
+
+        # Create initial schema
+        schema = Schema.of(schema=sample_arrow_schema)
+        print("schema.max_field_id", schema.max_field_id)
+
+        # Create the table
+        table = catalog.create_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema=schema,
+            table_description="Initial description",
+            inner=catalog_properties,
+        )
+
+        original_schema = table.table_version.schema
+
+        # Create multiple schema update operations
+        new_field1 = Field.of(pa.field("count", pa.int64(), nullable=True))
+        new_field2 = Field.of(
+            pa.field("status", pa.string(), nullable=False),
+            past_default="active",
+        )
+
+        schema_update = (
+            original_schema.update().add_field(new_field1).add_field(new_field2)
+        )
+        print("original_schema.max_field_id", original_schema.max_field_id)
+        print(
+            "schema_update.base_schema.max_field_id",
+            schema_update.base_schema.max_field_id,
+        )
+
+        # Alter the table
+        catalog.alter_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema_updates=schema_update,
+            table_description="Updated with multiple fields",
+            inner=catalog_properties,
+        )
+
+        # Get the updated table
+        updated_table_def = catalog.get_table(
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
+        )
+
+        updated_schema = updated_table_def.table_version.schema
+
+        # Verify both fields were added
+        assert updated_schema.field("count") is not None
+        assert updated_schema.field("count").arrow.type == pa.int64()
+        assert (
+            updated_schema.field("count").id == 3
+        )  # Next sequential ID after id(0), name(1), value(2)
+
+        assert updated_schema.field("status") is not None
+        assert updated_schema.field("status").arrow.type == pa.string()
+        assert (
+            updated_schema.field("status").id == 4
+        )  # Next sequential ID after count(3)
+        assert updated_schema.field("status").past_default == "active"
+
+        # Verify schema ID was incremented
+        assert updated_schema.id == original_schema.id + 1
+
+    def test_alter_table_with_remove_operation(self, test_namespace):
+        """Test altering a table with field removal (requires allow_incompatible_changes)."""
+        namespace_name, catalog_properties = test_namespace
+        table_name = "test_alter_table_remove"
+
+        # Create schema with multiple fields
+        initial_fields = [
+            Field.of(
+                pa.field("id", pa.int64(), nullable=False),
+                is_merge_key=True,
+                field_id=1,
+            ),
+            Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
+            Field.of(pa.field("temp_field", pa.float64(), nullable=True), field_id=3),
+        ]
+        schema = Schema.of(initial_fields)
+
+        # Create the table
+        table = catalog.create_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema=schema,
+            inner=catalog_properties,
+        )
+        original_schema = table.table_version.schema
+        temp_field = original_schema.field("temp_field")
+        assert temp_field is not None
+
+        schema_update = original_schema.update(True).remove_field("temp_field")
+
+        catalog.alter_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema_updates=schema_update,
+            inner=catalog_properties,
+        )
+
+        # If successful, verify the field was removed
+        updated_table_def = catalog.get_table(
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
+        )
+        updated_schema = updated_table_def.table_version.schema
+
+        # temp_field should be removed
+        with pytest.raises(KeyError):
+            updated_schema.field("temp_field")
+
+        # all other fields should be present
+        assert updated_schema.field("id") is not None
+        assert updated_schema.field("id").arrow.type == pa.int64()
+        assert updated_schema.field("id").id == 1
+        assert updated_schema.field("name") is not None
+        assert updated_schema.field("name").arrow.type == pa.string()
+        assert updated_schema.field("name").id == 2
+
+    def test_alter_table_with_update_operation(self, test_namespace):
+        """Test altering a table with field update operation."""
+        namespace_name, catalog_properties = test_namespace
+        table_name = "test_alter_table_update"
+
+        # Create schema with a field to update
+        initial_fields = [
+            Field.of(
+                pa.field("id", pa.int64(), nullable=False),
+                is_merge_key=True,
+                field_id=1,
+            ),
+            Field.of(pa.field("value", pa.int32(), nullable=True), field_id=2),
+        ]
+        schema = Schema.of(initial_fields)
+
+        # Create the table
+        table = catalog.create_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema=schema,
+            inner=catalog_properties,
+        )
+
+        original_schema = table.table_version.schema
+
+        # Update the value field to int64 (compatible type widening)
+        schema_update = original_schema.update().update_field_type("value", pa.int64())
+
+        # Alter the table
+        catalog.alter_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema_updates=schema_update,
+            inner=catalog_properties,
+        )
+
+        # Get the updated table
+        updated_table_def = catalog.get_table(
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
+        )
+
+        updated_schema = updated_table_def.table_version.schema
+
+        # Verify field was updated
+        assert updated_schema.field("value").arrow.type == pa.int64()
+        assert updated_schema.field("value").id == 2
+
+        # Verify schema ID was incremented
+        assert updated_schema.id == original_schema.id + 1
+
+    def test_alter_table_with_schema_evolution_disabled(self, test_namespace):
+        """Test that alter_table raises TableValidationError when schema evolution is disabled."""
+        namespace_name, catalog_properties = test_namespace
+        table_name = "test_alter_table_schema_evolution_disabled"
+
+        # Create initial schema
+        initial_fields = [
+            Field.of(
+                pa.field("id", pa.int64(), nullable=False),
+                is_merge_key=True,
+                field_id=1,
+            ),
+            Field.of(pa.field("value", pa.int32(), nullable=True), field_id=2),
+        ]
+        schema = Schema.of(initial_fields)
+
+        # Create table with SCHEMA_EVOLUTION_MODE.DISABLED
+        table_properties = {
+            TableProperty.SCHEMA_EVOLUTION_MODE: SchemaEvolutionMode.DISABLED
+        }
+
+        table = catalog.create_table(
+            table=table_name,
+            namespace=namespace_name,
+            schema=schema,
+            table_properties=table_properties,
+            inner=catalog_properties,
+        )
+
+        original_schema = table.table_version.schema
+
+        # Try to add a new field - this should be blocked
+        new_field = Field.of(pa.field("description", pa.string(), nullable=True))
+        schema_update = original_schema.update().add_field(new_field)
+
+        # Alter table with schema updates should raise TableValidationError
+        with pytest.raises(
+            TableValidationError,
+            match="Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates.",
+        ):
+            catalog.alter_table(
+                table=table_name,
+                namespace=namespace_name,
+                schema_updates=schema_update,
                 inner=catalog_properties,
             )
 
+        # Verify the schema wasn't changed
+        unchanged_table_def = catalog.get_table(
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
+        )
+        unchanged_schema = unchanged_table_def.table_version.schema
+
+        # Schema should be unchanged
+        assert unchanged_schema.id == original_schema.id
+        assert len(unchanged_schema.fields) == len(original_schema.fields)
+
+        # Verify the new field was not added
+        field_names = [field.arrow.name for field in unchanged_schema.fields]
+        assert "description" not in field_names
+
+        # Test that alter_table works without schema_updates even when schema evolution is disabled
+        catalog.alter_table(
+            table=table_name,
+            namespace=namespace_name,
+            table_description="Updated description without schema changes",
+            inner=catalog_properties,
+        )
+
+        # Verify that table description was updated but schema remains unchanged
+        final_table_def = catalog.get_table(
+            table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
+        )
+        assert (
+            final_table_def.table_version.description
+            == "Updated description without schema changes"
+        )
+        assert final_table_def.table_version.schema.id == original_schema.id
+
     def test_drop_with_purge_validation(self, test_namespace):
         """Test that using purge flag raises ValidationError"""
         namespace_name, catalog_properties = test_namespace
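test_alter_table_with_schema_evolution_disabled above exercises the new per-table guard: a table created with TableProperty.SCHEMA_EVOLUTION_MODE set to SchemaEvolutionMode.DISABLED makes alter_table reject any schema_updates= with a TableValidationError, while metadata-only alterations still go through. A sketch of the property in use; the "frozen" table name is illustrative:

    from deltacat.types.tables import TableProperty, SchemaEvolutionMode

    catalog.create_table(
        table="frozen",
        namespace="demo",
        schema=Schema.of(schema=pa.schema([("id", pa.int64())])),
        table_properties={
            TableProperty.SCHEMA_EVOLUTION_MODE: SchemaEvolutionMode.DISABLED,
        },
        inner=props,
    )
    # Passing schema_updates= to alter_table on this table now raises
    # TableValidationError; description-only updates remain allowed.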
@@ -421,7 +800,9 @@
 
         # Create the table
         catalog.create_table(
-            name=table_name, namespace=namespace_name, inner=catalog_properties
+            table=table_name,
+            namespace=namespace_name,
+            inner=catalog_properties,
         )
 
         # Try to drop with purge=True, should raise ValidationError
@@ -429,8 +810,1163 @@ class TestCatalogTableOperations:
429
810
  NotImplementedError, match="Purge flag is not currently supported"
430
811
  ):
431
812
  catalog.drop_table(
432
- name=table_name,
813
+ table=table_name,
433
814
  namespace=namespace_name,
434
815
  purge=True,
435
816
  inner=catalog_properties,
436
817
  )
818
+
819
+ def test_create_table_basic(self):
820
+ """Test basic table creation"""
821
+ table_name = "test_create_table_basic"
822
+ schema = Schema.of(
823
+ schema=pa.schema(
824
+ [
825
+ ("id", pa.int64()),
826
+ ("name", pa.string()),
827
+ ]
828
+ )
829
+ )
830
+
831
+ table_def = catalog.create_table(
832
+ table=table_name,
833
+ namespace=self.test_namespace,
834
+ schema=schema,
835
+ inner=self.catalog_properties,
836
+ )
837
+
838
+ assert table_def.table.table_name == table_name
839
+ assert table_def.table_version.schema.equivalent_to(schema)
840
+
841
+ # Verify table exists
842
+ assert catalog.table_exists(
843
+ table=table_name,
844
+ namespace=self.test_namespace,
845
+ inner=self.catalog_properties,
846
+ )
847
+
848
+ def test_create_table_already_exists_fail_if_exists_true(self):
849
+ """Test creating a table that already exists with fail_if_exists=True"""
850
+ table_name = "test_create_table_exists"
851
+ schema = Schema.of(schema=pa.schema([("id", pa.int64())]))
852
+
853
+ # Create table first
854
+ catalog.create_table(
855
+ table=table_name,
856
+ namespace=self.test_namespace,
857
+ schema=schema,
858
+ inner=self.catalog_properties,
859
+ )
860
+
861
+ # Try to create again with fail_if_exists=True (default)
862
+ with pytest.raises(TableAlreadyExistsError):
863
+ catalog.create_table(
864
+ table=table_name,
865
+ namespace=self.test_namespace,
866
+ schema=schema,
867
+ fail_if_exists=True,
868
+ inner=self.catalog_properties,
869
+ )
870
+
871
+ def test_create_table_already_exists_fail_if_exists_false(self):
872
+ """Test creating a table that already exists with fail_if_exists=False"""
873
+ table_name = "test_create_table_exists_ok"
874
+ schema = Schema.of(schema=pa.schema([("id", pa.int64())]))
875
+
876
+ # Create table first
877
+ table_def1 = catalog.create_table(
878
+ table=table_name,
879
+ namespace=self.test_namespace,
880
+ schema=schema,
881
+ inner=self.catalog_properties,
882
+ )
883
+
884
+ # Create again with fail_if_exists=False should return existing table
885
+ table_def2 = catalog.create_table(
886
+ table=table_name,
887
+ namespace=self.test_namespace,
888
+ schema=schema,
889
+ fail_if_exists=False,
890
+ inner=self.catalog_properties,
891
+ )
892
+
893
+ assert table_def1.table.table_name == table_def2.table.table_name
894
+
895
+
896
+ class TestWriteToTable:
897
+ """Test the write_to_table implementation with different modes and data types."""
898
+
899
+ @classmethod
900
+ def setup_class(cls):
901
+ cls.temp_dir = tempfile.mkdtemp()
902
+ cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)
903
+
904
+ # Create a test namespace
905
+ cls.test_namespace = "test_write_to_table"
906
+ catalog.create_namespace(
907
+ namespace=cls.test_namespace, inner=cls.catalog_properties
908
+ )
909
+
910
+ @classmethod
911
+ def teardown_class(cls):
912
+ shutil.rmtree(cls.temp_dir)
913
+
914
+ def _create_test_pandas_data(self):
915
+ """Create test pandas DataFrame"""
916
+ return pd.DataFrame(
917
+ {
918
+ "id": [1, 2, 3, 4, 5],
919
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
920
+ "age": [25, 30, 35, 40, 45],
921
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
922
+ }
923
+ )
924
+
925
+ def _create_test_pyarrow_data(self):
926
+ """Create test PyArrow Table"""
927
+ return pa.table(
928
+ {
929
+ "id": [1, 2, 3, 4, 5],
930
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
931
+ "age": [25, 30, 35, 40, 45],
932
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
933
+ }
934
+ )
935
+
936
+ def _create_test_polars_data(self):
937
+ """Create test Polars DataFrame"""
938
+ return pl.DataFrame(
939
+ {
940
+ "id": [1, 2, 3, 4, 5],
941
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
942
+ "age": [25, 30, 35, 40, 45],
943
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
944
+ }
945
+ )
946
+
947
+ def _create_second_batch_pandas_data(self):
948
+ """Create second batch of test data for append tests"""
949
+ return pd.DataFrame(
950
+ {
951
+ "id": [6, 7, 8],
952
+ "name": ["Frank", "Grace", "Henry"],
953
+ "age": [50, 55, 60],
954
+ "city": ["Boston", "Seattle", "Denver"],
955
+ }
956
+ )
957
+
958
+ def _create_test_ray_data(self):
959
+ """Create test Ray Dataset for schema inference testing."""
960
+ import ray
961
+
962
+ # Initialize Ray if not already initialized
963
+ # Note: Use distributed mode (not local_mode=True) to avoid Ray 2.46.0 internal bug
964
+ if not ray.is_initialized():
965
+ ray.init()
966
+
967
+ data = pa.table(
968
+ {
969
+ "id": [1, 2, 3, 4, 5],
970
+ "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
971
+ "age": [25, 30, 35, 40, 45],
972
+ "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
973
+ }
974
+ )
975
+ return rd.from_arrow(data)
976
+
977
+ def _create_test_daft_data(self):
978
+ """Create test Daft DataFrame for schema inference testing."""
979
+ data = {
980
+ "id": [1, 2, 3],
981
+ "name": ["Alice", "Bob", "Charlie"],
982
+ "age": [25, 30, 35],
983
+ "city": ["NYC", "LA", "Chicago"],
984
+ }
985
+ return daft.from_pydict(data)
986
+
987
+ def _create_test_numpy_1d_data(self):
988
+ """Create test 1D numpy array for schema inference testing."""
989
+ return np.array([1, 2, 3, 4, 5])
990
+
991
+ def _create_test_numpy_2d_data(self):
992
+ """Create test 2D numpy array for schema inference testing."""
993
+ return np.array([[1, 25], [2, 30], [3, 35]], dtype=np.int64)
994
+
995
+ def _create_table_with_merge_keys(self, table_name: str):
996
+ """Create a table with merge keys for testing MERGE mode"""
997
+ from deltacat.storage.model.schema import Schema, Field
998
+
999
+ # Create schema with merge keys
1000
+ schema = Schema.of(
1001
+ [
1002
+ Field.of(pa.field("id", pa.int64()), is_merge_key=True), # merge key
1003
+ Field.of(pa.field("name", pa.string())),
1004
+ Field.of(pa.field("age", pa.int32())),
1005
+ Field.of(pa.field("city", pa.string())),
1006
+ ]
1007
+ )
1008
+
1009
+ catalog.create_table(
1010
+ table=table_name,
1011
+ namespace=self.test_namespace,
1012
+ schema=schema,
1013
+ inner=self.catalog_properties,
1014
+ )
1015
+
1016
+ return schema
1017
+
1018
+ def _create_table_without_merge_keys(self, table_name: str):
1019
+ """Create a table without merge keys for testing APPEND mode"""
1020
+ # Use schema inference with no merge keys
1021
+ data = self._create_test_pandas_data()
1022
+ catalog.write_to_table(
1023
+ data=data,
1024
+ table=table_name,
1025
+ namespace=self.test_namespace,
1026
+ mode=TableWriteMode.CREATE,
1027
+ inner=self.catalog_properties,
1028
+ )
1029
+
1030
+ # Test TableWriteMode.AUTO
1031
+ def test_write_to_table_auto_create_new_table_pandas(self):
1032
+ """Test AUTO mode creating a new table with pandas data"""
1033
+ table_name = "test_auto_create_pandas"
1034
+ data = self._create_test_pandas_data()
1035
+
1036
+ # Table doesn't exist, AUTO should create it
1037
+ catalog.write_to_table(
1038
+ data=data,
1039
+ table=table_name,
1040
+ namespace=self.test_namespace,
1041
+ mode=TableWriteMode.AUTO,
1042
+ inner=self.catalog_properties,
1043
+ )
1044
+
1045
+ # Verify table was created
1046
+ assert catalog.table_exists(
1047
+ table=table_name,
1048
+ namespace=self.test_namespace,
1049
+ inner=self.catalog_properties,
1050
+ )
1051
+
1052
+ # Verify table has correct schema
1053
+ table_def = catalog.get_table(
1054
+ table=table_name,
1055
+ namespace=self.test_namespace,
1056
+ inner=self.catalog_properties,
1057
+ )
1058
+ assert table_def.table_version.schema is not None
1059
+
1060
+ def test_write_to_table_auto_create_new_table_pyarrow(self):
1061
+ """Test AUTO mode creating a new table with PyArrow data"""
1062
+ table_name = "test_auto_create_pyarrow"
1063
+ data = self._create_test_pyarrow_data()
1064
+
1065
+ catalog.write_to_table(
1066
+ data=data,
1067
+ table=table_name,
1068
+ namespace=self.test_namespace,
1069
+ mode=TableWriteMode.AUTO,
1070
+ inner=self.catalog_properties,
1071
+ )
1072
+
1073
+ assert catalog.table_exists(
1074
+ table=table_name,
1075
+ namespace=self.test_namespace,
1076
+ inner=self.catalog_properties,
1077
+ )
1078
+
1079
+ def test_write_to_table_auto_create_new_table_polars(self):
1080
+ """Test AUTO mode creating a new table with Polars data"""
1081
+ table_name = "test_auto_create_polars"
1082
+ data = self._create_test_polars_data()
1083
+
1084
+ catalog.write_to_table(
1085
+ data=data,
1086
+ table=table_name,
1087
+ namespace=self.test_namespace,
1088
+ mode=TableWriteMode.AUTO,
1089
+ inner=self.catalog_properties,
1090
+ )
1091
+
1092
+ assert catalog.table_exists(
1093
+ table=table_name,
1094
+ namespace=self.test_namespace,
1095
+ inner=self.catalog_properties,
1096
+ )
1097
+
1098
+ def test_write_to_table_auto_append_existing_table(self):
1099
+ """Test AUTO mode appending to existing table"""
1100
+ table_name = "test_auto_append"
1101
+ data1 = self._create_test_pandas_data()
1102
+ data2 = self._create_second_batch_pandas_data()
1103
+
1104
+ # First write creates table
1105
+ catalog.write_to_table(
1106
+ data=data1,
1107
+ table=table_name,
1108
+ namespace=self.test_namespace,
1109
+ mode=TableWriteMode.AUTO,
1110
+ inner=self.catalog_properties,
1111
+ )
1112
+
1113
+ # Second write should append
1114
+ catalog.write_to_table(
1115
+ data=data2,
1116
+ table=table_name,
1117
+ namespace=self.test_namespace,
1118
+ mode=TableWriteMode.AUTO,
1119
+ inner=self.catalog_properties,
1120
+ )
1121
+
1122
+ # Verify table still exists
1123
+ assert catalog.table_exists(
1124
+ table=table_name,
1125
+ namespace=self.test_namespace,
1126
+ inner=self.catalog_properties,
1127
+ )
1128
+
1129
+ # Test TableWriteMode.CREATE
1130
+ def test_write_to_table_create_new_table(self):
1131
+ """Test CREATE mode with new table"""
1132
+ table_name = "test_create_new"
1133
+ data = self._create_test_pandas_data()
1134
+
1135
+ catalog.write_to_table(
1136
+ data=data,
1137
+ table=table_name,
1138
+ namespace=self.test_namespace,
1139
+ mode=TableWriteMode.CREATE,
1140
+ inner=self.catalog_properties,
1141
+ )
1142
+
1143
+ assert catalog.table_exists(
1144
+ table=table_name,
1145
+ namespace=self.test_namespace,
1146
+ inner=self.catalog_properties,
1147
+ )
1148
+
1149
+ def test_write_to_table_create_existing_table_fails(self):
1150
+ """Test CREATE mode fails when table exists"""
1151
+ table_name = "test_create_fail"
1152
+ data = self._create_test_pandas_data()
1153
+
1154
+ # Create table first
1155
+ catalog.write_to_table(
1156
+ data=data,
1157
+ table=table_name,
1158
+ namespace=self.test_namespace,
1159
+ mode=TableWriteMode.CREATE,
1160
+ inner=self.catalog_properties,
1161
+ )
1162
+
1163
+ # Try to create again should fail
1164
+ with pytest.raises(
1165
+ TableAlreadyExistsError, match="already exists and mode is CREATE"
1166
+ ):
1167
+ catalog.write_to_table(
1168
+ data=data,
1169
+ table=table_name,
1170
+ namespace=self.test_namespace,
1171
+ mode=TableWriteMode.CREATE,
1172
+ inner=self.catalog_properties,
1173
+ )
1174
+
1175
+ # Test TableWriteMode.APPEND
1176
+ def test_write_to_table_append_existing_table(self):
1177
+ """Test APPEND mode with existing table"""
1178
+ table_name = "test_append_existing"
1179
+ data1 = self._create_test_pandas_data()
1180
+ data2 = self._create_second_batch_pandas_data()
1181
+
1182
+ # Create table first
1183
+ catalog.write_to_table(
1184
+ data=data1,
1185
+ table=table_name,
1186
+ namespace=self.test_namespace,
1187
+ mode=TableWriteMode.CREATE,
1188
+ inner=self.catalog_properties,
1189
+ )
1190
+
1191
+ # Append to existing table
1192
+ catalog.write_to_table(
1193
+ data=data2,
1194
+ table=table_name,
1195
+ namespace=self.test_namespace,
1196
+ mode=TableWriteMode.APPEND,
1197
+ inner=self.catalog_properties,
1198
+ )
1199
+
1200
+ def test_write_to_table_append_nonexistent_table_fails(self):
1201
+ """Test APPEND mode fails when table doesn't exist"""
1202
+ table_name = "test_append_fail"
1203
+ data = self._create_test_pandas_data()
1204
+
1205
+ with pytest.raises(
1206
+ TableNotFoundError,
1207
+ match="does not exist and write mode is append. Use CREATE or AUTO mode",
1208
+ ):
1209
+ catalog.write_to_table(
1210
+ data=data,
1211
+ table=table_name,
1212
+ namespace=self.test_namespace,
1213
+ mode=TableWriteMode.APPEND,
1214
+ inner=self.catalog_properties,
1215
+ )
1216
+
1217
+ def test_write_to_table_append_with_merge_keys_fails(self):
1218
+ """Test APPEND mode fails when table has merge keys"""
1219
+ table_name = "test_append_with_merge_keys"
1220
+
1221
+ # Create a table with merge keys
1222
+ self._create_table_with_merge_keys(table_name)
1223
+
1224
+ # Create test data that matches the schema
1225
+ data = pd.DataFrame(
1226
+ {
1227
+ "id": [1, 2, 3],
1228
+ "name": ["Alice", "Bob", "Charlie"],
1229
+ "age": [25, 30, 35],
1230
+ "city": ["NYC", "LA", "Chicago"],
1231
+ }
1232
+ )
1233
+
1234
+ # APPEND mode should fail since table has merge keys
1235
+ with pytest.raises(
1236
+ SchemaValidationError,
1237
+ match="APPEND mode cannot be used with tables that have merge keys",
1238
+ ):
1239
+ catalog.write_to_table(
1240
+ data=data,
1241
+ table=table_name,
1242
+ namespace=self.test_namespace,
1243
+ mode=TableWriteMode.APPEND,
1244
+ inner=self.catalog_properties,
1245
+ )
1246
+
1247
+ def test_write_to_table_append_without_merge_keys_succeeds(self):
1248
+ """Test APPEND mode works when table has no merge keys"""
1249
+ table_name = "test_append_no_merge_keys"
1250
+
1251
+ # Create a table without merge keys
1252
+ self._create_table_without_merge_keys(table_name)
1253
+
1254
+ # Add more data to the table
1255
+ data2 = self._create_second_batch_pandas_data()
1256
+
1257
+ # APPEND mode should work since table has no merge keys
1258
+ catalog.write_to_table(
1259
+ data=data2,
1260
+ table=table_name,
1261
+ namespace=self.test_namespace,
1262
+ mode=TableWriteMode.APPEND,
1263
+ inner=self.catalog_properties,
1264
+ )
1265
+
1266
+ # Table should still exist
1267
+ assert catalog.table_exists(
1268
+ table=table_name,
1269
+ namespace=self.test_namespace,
1270
+ inner=self.catalog_properties,
1271
+ )
1272
+
1273
+ # Test explicit schema specification
1274
+ def test_write_to_table_explicit_schema(self):
1275
+ """Test writing with explicit schema specification"""
1276
+ table_name = "test_explicit_schema"
1277
+ data = self._create_test_pandas_data()
1278
+
1279
+ # Define explicit schema with COERCE consistency types to preserve exact types
1280
+ explicit_schema = Schema.of(
1281
+ schema=[
1282
+ Field.of(
1283
+ pa.field("id", pa.int64()),
1284
+ consistency_type=SchemaConsistencyType.COERCE,
1285
+ ),
1286
+ Field.of(
1287
+ pa.field("name", pa.string()),
1288
+ consistency_type=SchemaConsistencyType.COERCE,
1289
+ ),
1290
+ Field.of(
1291
+ pa.field("age", pa.int32()),
1292
+ consistency_type=SchemaConsistencyType.COERCE,
1293
+ ), # Different from inferred schema
1294
+ Field.of(
1295
+ pa.field("city", pa.string()),
1296
+ consistency_type=SchemaConsistencyType.COERCE,
1297
+ ),
1298
+ ]
1299
+ )
1300
+
1301
+ catalog.write_to_table(
1302
+ data=data,
1303
+ table=table_name,
1304
+ namespace=self.test_namespace,
1305
+ mode=TableWriteMode.CREATE,
1306
+ schema=explicit_schema,
1307
+ inner=self.catalog_properties,
1308
+ )
1309
+
1310
+ # Verify schema was used
1311
+ table_def = catalog.get_table(
1312
+ table=table_name,
1313
+ namespace=self.test_namespace,
1314
+ inner=self.catalog_properties,
1315
+ )
1316
+ assert table_def.table_version.schema.equivalent_to(explicit_schema)
1317
+
+    def test_write_to_table_explicit_schema_none(self):
+        """Test writing with explicit schema=None to create a schemaless table"""
+        table_name = "test_explicit_schema_none"
+        data = self._create_test_pandas_data()
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            schema=None,  # Explicitly set schema=None
+            inner=self.catalog_properties,
+        )
+
+        # Verify the table was created with schema=None (schemaless)
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        # The table should exist but have a None/empty schema
+        assert table_def is not None
+        # Note: the exact behavior of schemaless tables may vary by storage
+        # implementation; we're mainly testing that the create_table call
+        # succeeded with schema=None
+
+    def test_schema_behavior_comparison(self):
+        """Test the difference between omitting the schema and explicit schema=None"""
+        data = self._create_test_pandas_data()
+
+        # Case 1: no schema argument - should infer the schema
+        table_name_inferred = "test_schema_inferred"
+        catalog.write_to_table(
+            data=data,
+            table=table_name_inferred,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            # No schema argument provided - should infer from data
+            inner=self.catalog_properties,
+        )
+
+        # Case 2: explicit schema=None - should create a schemaless table
+        table_name_schemaless = "test_schema_none"
+        catalog.write_to_table(
+            data=data,
+            table=table_name_schemaless,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            schema=None,  # Explicitly set schema=None
+            inner=self.catalog_properties,
+        )
+
+        # Verify both tables were created
+        table_inferred = catalog.get_table(
+            table=table_name_inferred,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        table_schemaless = catalog.get_table(
+            table=table_name_schemaless,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        # Both tables should exist
+        assert table_inferred is not None
+        assert table_schemaless is not None
+
+        # The inferred-schema table should have a schema with the expected columns
+        inferred_schema = table_inferred.table_version.schema.arrow
+        assert "id" in inferred_schema.names
+        assert "name" in inferred_schema.names
+        assert "age" in inferred_schema.names
+        assert "city" in inferred_schema.names
+
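+    # --- Editorial sketch (annotation, not part of the packaged file): the two
+    # cases above can only differ if write_to_table distinguishes "schema
+    # omitted" from "schema=None", which suggests a sentinel default rather
+    # than None. A minimal sketch of that pattern, under that assumption
+    # (_UNSET and infer_schema are hypothetical names, not deltacat APIs):
+    #
+    #     _UNSET = object()
+    #
+    #     def write_to_table(data, table, schema=_UNSET, **kwargs):
+    #         if schema is _UNSET:
+    #             schema = infer_schema(data)  # inferred from the dataset type
+    #         # schema is now an explicit Schema, an inferred Schema, or None,
+    #         # where None creates a schemaless table
+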
+    # Test schema inference from different data types
+    def test_schema_inference_pandas(self):
+        """Test schema inference from pandas DataFrame"""
+        table_name = "test_schema_inference_pandas"
+        data = pd.DataFrame(
+            {
+                "int_col": [1, 2, 3],
+                "float_col": [1.1, 2.2, 3.3],
+                "str_col": ["a", "b", "c"],
+                "bool_col": [True, False, True],
+            }
+        )
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        assert "int_col" in schema.names
+        assert "float_col" in schema.names
+        assert "str_col" in schema.names
+        assert "bool_col" in schema.names
+
+    def test_schema_inference_pyarrow(self):
+        """Test schema inference from PyArrow Table"""
+        table_name = "test_schema_inference_pyarrow"
+        data = pa.table(
+            {
+                "int64_col": pa.array([1, 2, 3], type=pa.int64()),
+                "string_col": pa.array(["x", "y", "z"], type=pa.string()),
+                "double_col": pa.array([1.1, 2.2, 3.3], type=pa.float64()),
+            }
+        )
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        assert schema.field("int64_col").type == pa.int64()
+        assert schema.field("string_col").type == pa.string()
+        assert schema.field("double_col").type == pa.float64()
+
+    def test_schema_inference_polars(self):
+        """Test schema inference from Polars DataFrame"""
+        table_name = "test_schema_inference_polars"
+        data = pl.DataFrame(
+            {
+                "int_col": [1, 2, 3],
+                "str_col": ["a", "b", "c"],
+                "float_col": [1.1, 2.2, 3.3],
+            }
+        )
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        assert "int_col" in schema.names
+        assert "str_col" in schema.names
+        assert "float_col" in schema.names
+
+    def test_schema_inference_ray_dataset(self):
+        """Test schema inference from Ray Dataset"""
+        table_name = "test_schema_inference_ray"
+        ray_data = self._create_test_ray_data()
+
+        catalog.write_to_table(
+            data=ray_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        assert "id" in schema.names
+        assert "name" in schema.names
+        assert "age" in schema.names
+        assert "city" in schema.names
+
+    def test_schema_inference_daft_dataframe(self):
+        """Test schema inference from Daft DataFrame"""
+        table_name = "test_schema_inference_daft"
+        data = self._create_test_daft_data()
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        assert "id" in schema.names
+        assert "name" in schema.names
+        assert "age" in schema.names
+        assert "city" in schema.names
+
+    def test_schema_inference_numpy_1d(self):
+        """Test schema inference from a 1D numpy array"""
+        table_name = "test_schema_inference_numpy_1d"
+        data = self._create_test_numpy_1d_data()
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        # pandas converts a 1D numpy array to a single column named "0"
+        assert "0" in schema.names
+        assert len(schema.names) == 1
+
+    def test_schema_inference_numpy_2d(self):
+        """Test schema inference from a 2D numpy array"""
+        table_name = "test_schema_inference_numpy_2d"
+        data = self._create_test_numpy_2d_data()
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        schema = table_def.table_version.schema.arrow
+        # pandas converts a 2D numpy array to columns named "0" and "1"
+        assert "0" in schema.names
+        assert "1" in schema.names
+        assert len(schema.names) == 2
+
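+    # --- Editorial note (annotation, not part of the packaged file): the
+    # "0"/"1" column names asserted above match the standard pandas fallback
+    # for raw ndarrays, which these tests appear to rely on:
+    #
+    #     import numpy as np
+    #     import pandas as pd
+    #
+    #     list(pd.DataFrame(np.array([1, 2, 3])).columns)         # [0]
+    #     list(pd.DataFrame(np.array([[1, 2], [3, 4]])).columns)  # [0, 1]
+    #
+    # pandas assigns positional integer column labels, which are presumably
+    # stringified to "0" and "1" when the inferred schema is converted to Arrow.
+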
+    def test_numpy_3d_array_error(self):
+        """Test that 3D numpy arrays raise an error"""
+        table_name = "test_numpy_3d_error"
+        data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # 3D array
+
+        with pytest.raises(
+            ValueError, match="NumPy arrays with 3 dimensions are not supported"
+        ):
+            catalog.write_to_table(
+                data=data,
+                table=table_name,
+                namespace=self.test_namespace,
+                mode=TableWriteMode.CREATE,
+                inner=self.catalog_properties,
+            )
+
+    # Test different content types
+    def test_write_to_table_different_content_types(self):
+        """Test writing with different content types"""
+        data = self._create_test_pandas_data()
+
+        content_types = [
+            ContentType.PARQUET,
+            ContentType.CSV,
+            ContentType.JSON,
+        ]
+
+        for i, content_type in enumerate(content_types):
+            table_name = f"test_content_type_{content_type.value}_{i}"
+
+            catalog.write_to_table(
+                data=data,
+                table=table_name,
+                namespace=self.test_namespace,
+                mode=TableWriteMode.CREATE,
+                content_type=content_type,
+                inner=self.catalog_properties,
+                schema=None,
+            )
+
+            assert catalog.table_exists(
+                table=table_name,
+                namespace=self.test_namespace,
+                inner=self.catalog_properties,
+            )
+
+    # Test table creation parameters
+    def test_write_to_table_with_table_properties(self):
+        """Test writing with table creation parameters"""
+        table_name = "test_table_properties"
+        data = self._create_test_pandas_data()
+
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            table_description="Test table with properties",
+            lifecycle_state=LifecycleState.ACTIVE,
+            inner=self.catalog_properties,
+        )
+
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        assert table_def.table.description == "Test table with properties"
+        # Note: lifecycle_state defaults to ACTIVE in create_table but may be
+        # overridden; accept either ACTIVE or CREATED since both are valid for
+        # our test purposes
+        assert table_def.table_version.state in [
+            LifecycleState.ACTIVE,
+            LifecycleState.CREATED,
+        ]
+
+    # Test error conditions
+    def test_write_to_table_unsupported_data_type(self):
+        """Test the error raised when the data type cannot be inferred"""
+        table_name = "test_unsupported_data"
+
+        # Use a plain dict, which has no registered schema inference function
+        unsupported_data = {"key": "value"}
+
+        with pytest.raises(
+            ValueError, match="No schema inference function found for table type"
+        ):
+            catalog.write_to_table(
+                data=unsupported_data,
+                table=table_name,
+                namespace=self.test_namespace,
+                mode=TableWriteMode.CREATE,
+                inner=self.catalog_properties,
+            )
+
+    def test_write_to_table_replace_mode(self):
+        """Test REPLACE mode creating a new stream to replace existing data"""
+        table_name = "test_replace_mode"
+        data1 = self._create_test_pandas_data()
+        data2 = self._create_second_batch_pandas_data()
+
+        # First, create the table
+        catalog.write_to_table(
+            data=data1,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        # Verify the table exists
+        assert catalog.table_exists(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        # Now use REPLACE mode to replace all existing data
+        catalog.write_to_table(
+            data=data2,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.REPLACE,
+            inner=self.catalog_properties,
+        )
+
+        # Table should still exist
+        assert catalog.table_exists(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+    def test_write_to_table_merge_mode_with_merge_keys(self):
+        """Test MERGE mode works when table has merge keys"""
+        table_name = "test_merge_mode_with_keys"
+
+        # Create a table with merge keys
+        self._create_table_with_merge_keys(table_name)
+
+        # Create test data that matches the schema
+        data = pd.DataFrame(
+            {
+                "id": [1, 2, 3],
+                "name": ["Alice", "Bob", "Charlie"],
+                "age": [25, 30, 35],
+                "city": ["NYC", "LA", "Chicago"],
+            }
+        )
+
+        # MERGE mode should work since table has merge keys
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            inner=self.catalog_properties,
+        )
+
+        # Table should still exist
+        assert catalog.table_exists(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+    def test_write_to_table_merge_mode_without_merge_keys_fails(self):
+        """Test MERGE mode fails when table has no merge keys"""
+        table_name = "test_merge_mode_no_keys"
+
+        # Create a table without merge keys
+        self._create_table_without_merge_keys(table_name)
+
+        data = self._create_test_pandas_data()
+
+        # MERGE mode should fail since table has no merge keys
+        with pytest.raises(
+            TableValidationError,
+            match="MERGE mode requires tables to have at least one merge key",
+        ):
+            catalog.write_to_table(
+                data=data,
+                table=table_name,
+                namespace=self.test_namespace,
+                mode=TableWriteMode.MERGE,
+                inner=self.catalog_properties,
+            )
+
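+    # --- Editorial note (annotation, not part of the packaged file): taken
+    # together, the mode tests above pin down a small compatibility matrix,
+    # inferred from the asserted error messages rather than separate docs:
+    #
+    #     # table without merge keys: APPEND succeeds; MERGE raises
+    #     # TableValidationError ("MERGE mode requires tables to have at
+    #     # least one merge key")
+    #     # table with merge keys:    MERGE succeeds
+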
+    # Test default namespace behavior
+    def test_write_to_table_default_namespace(self):
+        """Test writing to table using default namespace"""
+        table_name = "test_default_namespace"
+        data = self._create_test_pandas_data()
+
+        # Don't specify namespace, should use default
+        catalog.write_to_table(
+            data=data,
+            table=table_name,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        # Should be able to find table in default namespace
+        default_ns = catalog.default_namespace(inner=self.catalog_properties)
+        assert catalog.table_exists(
+            table=table_name, namespace=default_ns, inner=self.catalog_properties
+        )
+
+    def test_write_to_table_append_creates_separate_deltas(self):
+        """Test that APPEND mode creates separate deltas in the same partition"""
+        from deltacat.catalog.main.impl import _get_storage
+
+        table_name = "test_append_separate_deltas"
+        data1 = self._create_test_pandas_data()
+        data2 = self._create_second_batch_pandas_data()
+
+        # Create table with first batch
+        catalog.write_to_table(
+            data=data1,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.CREATE,
+            inner=self.catalog_properties,
+        )
+
+        # Get the table definition to access stream information
+        table_def = catalog.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            inner=self.catalog_properties,
+        )
+
+        # Get storage interface
+        storage = _get_storage(inner=self.catalog_properties)
+
+        # Get the stream
+        stream = storage.get_stream(
+            namespace=self.test_namespace,
+            table_name=table_name,
+            table_version=table_def.table_version.table_version,
+            inner=self.catalog_properties,
+        )
+
+        # Get the partition (should be only one for unpartitioned table)
+        partition = storage.get_partition(
+            stream_locator=stream.locator,
+            partition_values=None,  # unpartitioned
+            inner=self.catalog_properties,
+        )
+
+        # List deltas before second write
+        deltas_before = storage.list_partition_deltas(
+            partition_like=partition,
+            inner=self.catalog_properties,
+        ).all_items()
+
+        assert (
+            len(deltas_before) == 1
+        ), f"Expected 1 delta before append, got {len(deltas_before)}"
+
+        # Append second batch using APPEND mode
+        catalog.write_to_table(
+            data=data2,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.APPEND,
+            inner=self.catalog_properties,
+        )
+
+        # Get the same partition again (should be the same partition object)
+        partition_after = storage.get_partition(
+            stream_locator=stream.locator,
+            partition_values=None,  # unpartitioned
+            inner=self.catalog_properties,
+        )
+
+        # Verify it's the same partition
+        assert (
+            partition.partition_id == partition_after.partition_id
+        ), "APPEND should reuse the same partition"
+
+        # List deltas after second write
+        deltas_after = storage.list_partition_deltas(
+            partition_like=partition_after,
+            inner=self.catalog_properties,
+        ).all_items()
+
+        # Should now have 2 deltas in the same partition
+        assert (
+            len(deltas_after) == 2
+        ), f"Expected 2 deltas after append, got {len(deltas_after)}"
+
+        # Verify deltas have different stream positions
+        stream_positions = [delta.stream_position for delta in deltas_after]
+        assert (
+            len(set(stream_positions)) == 2
+        ), "Deltas should have different stream positions"
+        assert min(stream_positions) == 1, "First delta should have stream position 1"
+        assert max(stream_positions) == 2, "Second delta should have stream position 2"
+
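+    # --- Editorial note (annotation, not part of the packaged file): the
+    # assertions above suggest this mental model of APPEND, stated as an
+    # assumption rather than documented behavior: an unpartitioned stream has
+    # one partition, and each write_to_table(..., mode=TableWriteMode.APPEND)
+    # commits one new delta at the next increasing stream position:
+    #
+    #     deltas = storage.list_partition_deltas(
+    #         partition_like=partition, inner=self.catalog_properties
+    #     ).all_items()
+    #     sorted(d.stream_position for d in deltas)  # [1, 2, ...] after N appends
+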
+    def test_write_to_table_partitioned_table_raises_not_implemented(self):
+        """Test that write_to_table raises NotImplementedError for partitioned tables"""
+        from deltacat.storage.model.partition import (
+            PartitionScheme,
+            PartitionKey,
+            PartitionKeyList,
+        )
+        from deltacat.storage.model.transform import IdentityTransform
+
+        table_name = "test_partitioned_table"
+        data = self._create_test_pandas_data()
+
+        # Create a partition scheme with partition keys
+        partition_keys = [
+            PartitionKey.of(
+                key=["city"],
+                name="city_partition",
+                transform=IdentityTransform.of(),
+            )
+        ]
+        partition_scheme = PartitionScheme.of(
+            keys=PartitionKeyList.of(partition_keys),
+            name="test_partition_scheme",
+            scheme_id="test_partition_scheme_id",
+        )
+
+        # Try to create a partitioned table using write_to_table
+        with pytest.raises(
+            NotImplementedError,
+            match="write_to_table does not yet support partitioned tables",
+        ):
+            catalog.write_to_table(
+                data=data,
+                table=table_name,
+                namespace=self.test_namespace,
+                mode=TableWriteMode.CREATE,
+                partition_scheme=partition_scheme,  # This makes it partitioned
+                inner=self.catalog_properties,
+            )
+
+    def test_write_to_table_sorted_table_raises_not_implemented(self):
+        """Test that write_to_table raises NotImplementedError for tables with sort keys"""
+        from deltacat.storage.model.sort_key import SortScheme, SortKey, SortKeyList
+        from deltacat.storage.model.types import SortOrder, NullOrder
+
+        table_name = "test_sorted_table"
+        data = self._create_test_pandas_data()
+
+        # Create a sort scheme with sort keys
+        sort_scheme = SortScheme.of(
+            keys=SortKeyList.of(
+                [
+                    SortKey.of(
+                        key=["id"],
+                        sort_order=SortOrder.ASCENDING,
+                        null_order=NullOrder.AT_END,
+                    )
+                ]
+            ),
+            name="test_sort_scheme",
+            scheme_id="test_sort_scheme_id",
+        )
+
+        # Create table with sort keys
+        catalog.create_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            sort_keys=sort_scheme,
+            inner=self.catalog_properties,
+        )
+
+        # Attempting to write to the sorted table should raise NotImplementedError
+        with pytest.raises(NotImplementedError) as exc_info:
+            catalog.write_to_table(
+                data=data,
+                table=table_name,
+                namespace=self.test_namespace,
+                mode=TableWriteMode.APPEND,
+                inner=self.catalog_properties,
+            )
+
+        # Verify the error message contains the expected information
+        assert "sort keys" in str(exc_info.value)
+        assert "sort scheme with 1 sort key(s)" in str(exc_info.value)
+        assert "id" in str(exc_info.value)