deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1972 @@
|
|
1
|
+
import shutil
|
2
|
+
import tempfile
|
3
|
+
|
4
|
+
import pytest
|
5
|
+
import pyarrow as pa
|
6
|
+
import pandas as pd
|
7
|
+
import polars as pl
|
8
|
+
import numpy as np
|
9
|
+
import ray.data as rd
|
10
|
+
import daft
|
11
|
+
|
12
|
+
import deltacat.catalog.main.impl as catalog
|
13
|
+
from deltacat.catalog import get_catalog_properties
|
14
|
+
from deltacat.storage.model.schema import (
|
15
|
+
Schema,
|
16
|
+
Field,
|
17
|
+
)
|
18
|
+
from deltacat.storage.model.types import SchemaConsistencyType
|
19
|
+
from deltacat.storage.model.sort_key import SortKey, SortScheme, SortOrder, NullOrder
|
20
|
+
from deltacat.storage.model.types import LifecycleState
|
21
|
+
from deltacat.exceptions import (
|
22
|
+
TableAlreadyExistsError,
|
23
|
+
TableNotFoundError,
|
24
|
+
TableValidationError,
|
25
|
+
SchemaValidationError,
|
26
|
+
)
|
27
|
+
from deltacat.types.tables import TableWriteMode, TableProperty, SchemaEvolutionMode
|
28
|
+
from deltacat.types.media import ContentType
|
29
|
+
|
30
|
+
|
31
|
+
@pytest.fixture(scope="class")
def catalog_setup():
    """Provision a temp-dir-backed catalog for the test class; remove it afterwards."""
    workdir = tempfile.mkdtemp()
    props = get_catalog_properties(root=workdir)
    yield workdir, props
    # Teardown: delete the temporary catalog root once the class finishes.
    shutil.rmtree(workdir)
|
40
|
+
|
41
|
+
|
42
|
+
@pytest.fixture(scope="function")
def test_namespace(catalog_setup):
    """Ensure the shared test namespace exists and hand (name, properties) to each test."""
    catalog_properties = catalog_setup[1]
    namespace_name = "test_table_namespace"

    # Create the namespace lazily; later tests reuse it if it already exists.
    already_there = catalog.namespace_exists(namespace_name, inner=catalog_properties)
    if not already_there:
        catalog.create_namespace(
            namespace=namespace_name,
            properties={"description": "Test Table Namespace"},
            inner=catalog_properties,
        )

    return namespace_name, catalog_properties
|
56
|
+
|
57
|
+
|
58
|
+
@pytest.fixture
def sample_arrow_schema():
    """Create a sample PyArrow schema for testing."""
    columns = [
        ("id", pa.int64()),
        ("name", pa.string()),
        ("value", pa.float64()),
    ]
    return pa.schema([pa.field(name, dtype) for name, dtype in columns])
|
68
|
+
|
69
|
+
|
70
|
+
@pytest.fixture
def sample_sort_keys():
    """Create a single-key ascending sort scheme for testing."""
    id_key = SortKey.of(
        key=["id"],
        sort_order=SortOrder.ASCENDING,
        null_order=NullOrder.AT_END,
    )
    return SortScheme(keys=[id_key])
|
80
|
+
|
81
|
+
|
82
|
+
class TestCatalogTableOperations:
|
83
|
+
"""Test catalog table operations including table creation, existence checks, etc."""
|
84
|
+
|
85
|
+
@classmethod
def setup_class(cls):
    """Provision a throwaway catalog root plus a namespace shared by this class."""
    cls.temp_dir = tempfile.mkdtemp()
    cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)

    # Register the namespace the write-oriented tests operate in.
    cls.test_namespace = "test_write_operations"
    catalog.create_namespace(
        namespace=cls.test_namespace,
        inner=cls.catalog_properties,
    )
|
96
|
+
|
97
|
+
@classmethod
def teardown_class(cls):
    # Remove the temporary catalog root created in setup_class.
    shutil.rmtree(cls.temp_dir)
|
100
|
+
|
101
|
+
def test_create_table(self, test_namespace, sample_arrow_schema, sample_sort_keys):
    """Test creating a table with schema and properties"""
    namespace_name, catalog_properties = test_namespace
    table_name = "test_create_table"

    # Create a schema
    schema = Schema(arrow=sample_arrow_schema)

    # Create table properties
    table_properties = {"owner": "test-user", "department": "engineering"}

    # Create namespace properties
    namespace_properties = {"description": "Test Namespace"}

    # Create the table
    table_definition = catalog.create_table(
        table=table_name,
        namespace=namespace_name,
        schema=schema,
        sort_keys=sample_sort_keys,
        table_description="Test table for unit tests",
        table_properties=table_properties,
        namespace_properties=namespace_properties,
        inner=catalog_properties,
    )

    # Verify table was created
    assert catalog.table_exists(
        table_name,
        namespace=namespace_name,
        inner=catalog_properties,
    )

    table = table_definition.table
    table_version = table_definition.table_version

    # Verify table definition properties
    assert table_version.table_name == table_name
    assert table_version.namespace == namespace_name
    assert table_version.description == "Test table for unit tests"
    assert table_version.state == LifecycleState.ACTIVE
    assert table.properties.get("owner") == "test-user"
    assert table.properties.get("department") == "engineering"
    assert table_version.schema.arrow.names == sample_arrow_schema.names
    assert len(table_version.sort_scheme.keys) == 1
    # NOTE(review): key[0][0] presumably extracts the field path from a
    # tuple-like SortKey entry — confirm against the SortKey model.
    sort_key_paths = [key[0][0] for key in table_version.sort_scheme.keys]
    assert "id" in sort_key_paths
|
148
|
+
|
149
|
+
def test_create_table_already_exists(self, test_namespace):
    """Creating the same table twice must raise TableAlreadyExistsError."""
    import re

    namespace_name, catalog_properties = test_namespace
    table_name = "test_table_exists"

    # Create the table
    catalog.create_table(
        table=table_name,
        namespace=namespace_name,
        table_description="First creation",
        inner=catalog_properties,
    )

    # Verify table exists
    assert catalog.table_exists(
        table_name,
        namespace=namespace_name,
        inner=catalog_properties,
    )

    # Try to create the same table again, should raise TableAlreadyExistsError.
    # pytest's `match` argument is a regex searched against the message, so
    # escape the expected text (the '.' separator would otherwise match any
    # character).
    with pytest.raises(
        TableAlreadyExistsError,
        match=re.escape(f"Table {namespace_name}.{table_name} already exists"),
    ):
        catalog.create_table(
            table=table_name,
            namespace=namespace_name,
            table_description="Second creation attempt",
            inner=catalog_properties,
        )
|
179
|
+
|
180
|
+
def test_create_table_already_exists_no_fail(self, test_namespace):
    """Test creating a table that already exists with fail_if_exists=False"""
    namespace_name, catalog_properties = test_namespace
    table_name = "test_table_exists_no_fail"

    # Create the table with original description
    catalog.create_table(
        table=table_name,
        namespace=namespace_name,
        table_description="Original description",
        inner=catalog_properties,
    )

    # BUGFIX: this call previously passed `catalog=catalog_properties`;
    # every other call in this module passes the properties via `inner=`.
    assert catalog.table_exists(
        table_name,
        namespace=namespace_name,
        inner=catalog_properties,
    )

    # Create the same table with fail_if_exists=False
    table_definition = catalog.create_table(
        table=table_name,
        namespace=namespace_name,
        table_description="Updated description",
        fail_if_exists=False,
        inner=catalog_properties,
    )

    table = table_definition.table

    assert table.table_name == table_name
    assert table.namespace == namespace_name
    # Ensure description is unchanged
    assert table.description == "Original description"
|
214
|
+
|
215
|
+
def test_drop_table(self, test_namespace):
    """Dropping an existing table removes it from the catalog."""
    namespace_name, catalog_properties = test_namespace
    table_name = "test_drop_table"
    ctx = dict(namespace=namespace_name, inner=catalog_properties)

    # Create the table and confirm it is visible.
    catalog.create_table(table=table_name, **ctx)
    assert catalog.table_exists(table_name, **ctx)

    # Drop it and confirm it is gone.
    catalog.drop_table(table=table_name, **ctx)
    assert not catalog.table_exists(table_name, **ctx)
|
246
|
+
|
247
|
+
def test_drop_table_not_exists(self, test_namespace):
    """Dropping a missing table raises TableNotFoundError."""
    namespace_name, catalog_properties = test_namespace
    table_name = "nonexistent_table"
    ctx = dict(namespace=namespace_name, inner=catalog_properties)

    # Precondition: the table must not exist.
    assert not catalog.table_exists(table_name, **ctx)

    # Dropping it should surface TableNotFoundError.
    with pytest.raises(TableNotFoundError, match=table_name):
        catalog.drop_table(table=table_name, **ctx)
|
265
|
+
|
266
|
+
def test_rename_namespace(self, test_namespace):
    """Renaming a namespace keeps its tables reachable under the new name."""
    namespace_name, catalog_properties = test_namespace
    original_name = "test_original_table"
    new_name = "test_renamed_namespace"

    # Put a table inside the namespace that is about to be renamed.
    catalog.create_table(
        table=original_name,
        namespace=namespace_name,
        table_description="Table to in namespace to be renamed",
        inner=catalog_properties,
    )
    assert catalog.table_exists(
        original_name, namespace=namespace_name, inner=catalog_properties
    )

    # Rename the namespace.
    catalog.alter_namespace(
        namespace=namespace_name,
        new_namespace=new_name,
        inner=catalog_properties,
    )

    # Only the new namespace name should exist afterwards...
    assert catalog.namespace_exists(new_name, inner=catalog_properties)
    assert not catalog.namespace_exists(namespace_name, inner=catalog_properties)

    # ...and the table should still be discoverable through it.
    assert catalog.table_exists(
        original_name, namespace=new_name, inner=catalog_properties
    )
|
303
|
+
|
304
|
+
def test_rename_table(self, test_namespace):
    """Renaming a table makes it visible only under the new name."""
    namespace_name, catalog_properties = test_namespace
    original_name = "test_original_table"
    new_name = "test_renamed_table"
    ctx = dict(namespace=namespace_name, inner=catalog_properties)

    # Create the table under its original name and confirm it exists.
    catalog.create_table(
        table=original_name,
        table_description="Table to be renamed",
        **ctx,
    )
    assert catalog.table_exists(original_name, **ctx)

    # Rename it.
    catalog.rename_table(table=original_name, new_name=new_name, **ctx)

    # The new name resolves; the old one no longer does.
    assert catalog.table_exists(new_name, **ctx)
    assert not catalog.table_exists(original_name, **ctx)
|
343
|
+
|
344
|
+
def test_rename_table_not_exists(self, test_namespace):
    """Renaming a missing table raises TableNotFoundError."""
    namespace_name, catalog_properties = test_namespace
    original_name = "nonexistent_table"
    new_name = "test_renamed_nonexistent"
    ctx = dict(namespace=namespace_name, inner=catalog_properties)

    # Precondition: the source table must not exist.
    assert not catalog.table_exists(original_name, **ctx)

    # Renaming it should surface TableNotFoundError.
    with pytest.raises(TableNotFoundError, match=original_name):
        catalog.rename_table(table=original_name, new_name=new_name, **ctx)
|
364
|
+
|
365
|
+
def test_table_exists(self, test_namespace):
    """table_exists reports True only for tables that were actually created."""
    namespace_name, catalog_properties = test_namespace
    ctx = dict(namespace=namespace_name, inner=catalog_properties)

    existing_table = "test_table_exists_check"
    non_existing_table = "nonexistent_table"

    # Create one table; leave the other name unused.
    catalog.create_table(table=existing_table, **ctx)

    assert catalog.table_exists(existing_table, **ctx)
    assert not catalog.table_exists(non_existing_table, **ctx)
|
390
|
+
|
391
|
+
def test_create_table_with_default_namespace(self, catalog_setup):
|
392
|
+
_, catalog_properties = catalog_setup
|
393
|
+
table_name = "test_default_namespace_table"
|
394
|
+
|
395
|
+
# Create table with default namespace
|
396
|
+
table_definition = catalog.create_table(
|
397
|
+
table=table_name, inner=catalog_properties
|
398
|
+
)
|
399
|
+
|
400
|
+
table = table_definition.table
|
401
|
+
# Verify table was created in default namespace
|
402
|
+
default_ns = catalog.default_namespace()
|
403
|
+
assert table.namespace == default_ns
|
404
|
+
assert catalog.table_exists(
|
405
|
+
table_name,
|
406
|
+
namespace=default_ns,
|
407
|
+
inner=catalog_properties,
|
408
|
+
)
|
409
|
+
|
410
|
+
def test_create_table_with_missing_namespace(self, catalog_setup):
|
411
|
+
_, catalog_properties = catalog_setup
|
412
|
+
table_name = "test_namespace_not_found_table"
|
413
|
+
new_namespace = "nonexistent_namespace"
|
414
|
+
|
415
|
+
# Verify namespace doesn't exist yet
|
416
|
+
assert not catalog.namespace_exists(new_namespace, inner=catalog_properties)
|
417
|
+
|
418
|
+
# Try to create table with non-existent namespace
|
419
|
+
catalog.create_table(
|
420
|
+
table=table_name,
|
421
|
+
namespace=new_namespace,
|
422
|
+
inner=catalog_properties,
|
423
|
+
)
|
424
|
+
|
425
|
+
assert catalog.table_exists(
|
426
|
+
table_name,
|
427
|
+
namespace=new_namespace,
|
428
|
+
inner=catalog_properties,
|
429
|
+
)
|
430
|
+
assert catalog.namespace_exists(new_namespace, inner=catalog_properties)
|
431
|
+
|
432
|
+
def test_alter_table(self, test_namespace, sample_arrow_schema, sample_sort_keys):
|
433
|
+
namespace_name, catalog_properties = test_namespace
|
434
|
+
table_name = "test_alter_table"
|
435
|
+
|
436
|
+
# Create initial schema and properties
|
437
|
+
schema = Schema.of(schema=sample_arrow_schema)
|
438
|
+
initial_properties = {"owner": "original-user", "department": "engineering"}
|
439
|
+
|
440
|
+
# Create the table with initial properties
|
441
|
+
table = catalog.create_table(
|
442
|
+
table=table_name,
|
443
|
+
namespace=namespace_name,
|
444
|
+
schema=schema,
|
445
|
+
sort_keys=sample_sort_keys,
|
446
|
+
table_description="Initial description",
|
447
|
+
table_properties=initial_properties,
|
448
|
+
inner=catalog_properties,
|
449
|
+
)
|
450
|
+
old_schema = table.table_version.schema
|
451
|
+
|
452
|
+
# Verify table was created with initial properties
|
453
|
+
assert catalog.table_exists(
|
454
|
+
table_name,
|
455
|
+
namespace=namespace_name,
|
456
|
+
inner=catalog_properties,
|
457
|
+
)
|
458
|
+
|
459
|
+
# Create schema update operations to add a new field
|
460
|
+
new_field = Field.of(pa.field("count", pa.float64(), nullable=True))
|
461
|
+
schema_update = old_schema.update().add_field(new_field)
|
462
|
+
|
463
|
+
# Create updated properties
|
464
|
+
updated_properties = {
|
465
|
+
"owner": "new-user",
|
466
|
+
"department": "data-science",
|
467
|
+
"priority": "high",
|
468
|
+
}
|
469
|
+
|
470
|
+
# Alter the table with new properties and schema updates
|
471
|
+
catalog.alter_table(
|
472
|
+
table=table_name,
|
473
|
+
namespace=namespace_name,
|
474
|
+
schema_updates=schema_update,
|
475
|
+
table_description="Updated description",
|
476
|
+
table_properties=updated_properties,
|
477
|
+
inner=catalog_properties,
|
478
|
+
)
|
479
|
+
|
480
|
+
# Get the updated table definition
|
481
|
+
updated_table_def = catalog.get_table(
|
482
|
+
table_name,
|
483
|
+
namespace=namespace_name,
|
484
|
+
inner=catalog_properties,
|
485
|
+
)
|
486
|
+
|
487
|
+
updated_table = updated_table_def.table
|
488
|
+
updated_table_version = updated_table_def.table_version
|
489
|
+
|
490
|
+
# Verify table properties were updated
|
491
|
+
assert updated_table_version.description == "Updated description"
|
492
|
+
assert updated_table_version.state == LifecycleState.ACTIVE
|
493
|
+
assert updated_table.properties.get("owner") == "new-user"
|
494
|
+
assert updated_table.properties.get("department") == "data-science"
|
495
|
+
assert updated_table.properties.get("priority") == "high"
|
496
|
+
|
497
|
+
# Verify schema was updated with new field
|
498
|
+
updated_schema = updated_table_version.schema
|
499
|
+
assert updated_schema.field("count") is not None
|
500
|
+
assert updated_schema.field("count").arrow.type == pa.float64()
|
501
|
+
assert updated_schema.field("count").arrow.nullable is True
|
502
|
+
assert (
|
503
|
+
updated_schema.field("count").id == 3
|
504
|
+
) # Next sequential ID after id(0), name(1), value(2)
|
505
|
+
|
506
|
+
# Verify schema ID was incremented (proving SchemaUpdate was used)
|
507
|
+
assert updated_schema.id == old_schema.id + 1
|
508
|
+
|
509
|
+
def test_alter_table_not_exists(self, test_namespace):
|
510
|
+
"""Test altering a table that doesn't exist"""
|
511
|
+
namespace_name, catalog_properties = test_namespace
|
512
|
+
nonexistent_table = "nonexistent_alter_table"
|
513
|
+
|
514
|
+
# Verify table doesn't exist
|
515
|
+
assert not catalog.table_exists(
|
516
|
+
nonexistent_table,
|
517
|
+
namespace=namespace_name,
|
518
|
+
inner=catalog_properties,
|
519
|
+
)
|
520
|
+
|
521
|
+
# Try to alter the nonexistent table, should raise TableNotFoundError
|
522
|
+
with pytest.raises(TableNotFoundError, match=nonexistent_table):
|
523
|
+
catalog.alter_table(
|
524
|
+
table=nonexistent_table,
|
525
|
+
namespace=namespace_name,
|
526
|
+
table_description="Updated description",
|
527
|
+
inner=catalog_properties,
|
528
|
+
)
|
529
|
+
|
530
|
+
def test_alter_table_with_multiple_schema_operations(
|
531
|
+
self, test_namespace, sample_arrow_schema
|
532
|
+
):
|
533
|
+
"""Test altering a table with multiple schema update operations."""
|
534
|
+
namespace_name, catalog_properties = test_namespace
|
535
|
+
table_name = "test_alter_table_multiple_ops"
|
536
|
+
|
537
|
+
# Create initial schema
|
538
|
+
schema = Schema.of(schema=sample_arrow_schema)
|
539
|
+
print("schema.max_field_id", schema.max_field_id)
|
540
|
+
|
541
|
+
# Create the table
|
542
|
+
table = catalog.create_table(
|
543
|
+
table=table_name,
|
544
|
+
namespace=namespace_name,
|
545
|
+
schema=schema,
|
546
|
+
table_description="Initial description",
|
547
|
+
inner=catalog_properties,
|
548
|
+
)
|
549
|
+
|
550
|
+
original_schema = table.table_version.schema
|
551
|
+
|
552
|
+
# Create multiple schema update operations
|
553
|
+
new_field1 = Field.of(pa.field("count", pa.int64(), nullable=True))
|
554
|
+
new_field2 = Field.of(
|
555
|
+
pa.field("status", pa.string(), nullable=False),
|
556
|
+
past_default="active",
|
557
|
+
)
|
558
|
+
|
559
|
+
schema_update = (
|
560
|
+
original_schema.update().add_field(new_field1).add_field(new_field2)
|
561
|
+
)
|
562
|
+
print("original_schema.max_field_id", original_schema.max_field_id)
|
563
|
+
print(
|
564
|
+
"schema_update.base_schema.max_field_id",
|
565
|
+
schema_update.base_schema.max_field_id,
|
566
|
+
)
|
567
|
+
|
568
|
+
# Alter the table
|
569
|
+
catalog.alter_table(
|
570
|
+
table=table_name,
|
571
|
+
namespace=namespace_name,
|
572
|
+
schema_updates=schema_update,
|
573
|
+
table_description="Updated with multiple fields",
|
574
|
+
inner=catalog_properties,
|
575
|
+
)
|
576
|
+
|
577
|
+
# Get the updated table
|
578
|
+
updated_table_def = catalog.get_table(
|
579
|
+
table_name,
|
580
|
+
namespace=namespace_name,
|
581
|
+
inner=catalog_properties,
|
582
|
+
)
|
583
|
+
|
584
|
+
updated_schema = updated_table_def.table_version.schema
|
585
|
+
|
586
|
+
# Verify both fields were added
|
587
|
+
assert updated_schema.field("count") is not None
|
588
|
+
assert updated_schema.field("count").arrow.type == pa.int64()
|
589
|
+
assert (
|
590
|
+
updated_schema.field("count").id == 3
|
591
|
+
) # Next sequential ID after id(0), name(1), value(2)
|
592
|
+
|
593
|
+
assert updated_schema.field("status") is not None
|
594
|
+
assert updated_schema.field("status").arrow.type == pa.string()
|
595
|
+
assert (
|
596
|
+
updated_schema.field("status").id == 4
|
597
|
+
) # Next sequential ID after count(3)
|
598
|
+
assert updated_schema.field("status").past_default == "active"
|
599
|
+
|
600
|
+
# Verify schema ID was incremented
|
601
|
+
assert updated_schema.id == original_schema.id + 1
|
602
|
+
|
603
|
+
def test_alter_table_with_remove_operation(self, test_namespace):
|
604
|
+
"""Test altering a table with field removal (requires allow_incompatible_changes)."""
|
605
|
+
namespace_name, catalog_properties = test_namespace
|
606
|
+
table_name = "test_alter_table_remove"
|
607
|
+
|
608
|
+
# Create schema with multiple fields
|
609
|
+
initial_fields = [
|
610
|
+
Field.of(
|
611
|
+
pa.field("id", pa.int64(), nullable=False),
|
612
|
+
is_merge_key=True,
|
613
|
+
field_id=1,
|
614
|
+
),
|
615
|
+
Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
|
616
|
+
Field.of(pa.field("temp_field", pa.float64(), nullable=True), field_id=3),
|
617
|
+
]
|
618
|
+
schema = Schema.of(initial_fields)
|
619
|
+
|
620
|
+
# Create the table
|
621
|
+
table = catalog.create_table(
|
622
|
+
table=table_name,
|
623
|
+
namespace=namespace_name,
|
624
|
+
schema=schema,
|
625
|
+
inner=catalog_properties,
|
626
|
+
)
|
627
|
+
original_schema = table.table_version.schema
|
628
|
+
temp_field = original_schema.field("temp_field")
|
629
|
+
assert temp_field is not None
|
630
|
+
|
631
|
+
schema_update = original_schema.update(True).remove_field("temp_field")
|
632
|
+
|
633
|
+
catalog.alter_table(
|
634
|
+
table=table_name,
|
635
|
+
namespace=namespace_name,
|
636
|
+
schema_updates=schema_update,
|
637
|
+
inner=catalog_properties,
|
638
|
+
)
|
639
|
+
|
640
|
+
# If successful, verify the field was removed
|
641
|
+
updated_table_def = catalog.get_table(
|
642
|
+
table_name,
|
643
|
+
namespace=namespace_name,
|
644
|
+
inner=catalog_properties,
|
645
|
+
)
|
646
|
+
updated_schema = updated_table_def.table_version.schema
|
647
|
+
|
648
|
+
# temp_field should be removed
|
649
|
+
with pytest.raises(KeyError):
|
650
|
+
updated_schema.field("temp_field")
|
651
|
+
|
652
|
+
# all other fields should be present
|
653
|
+
assert updated_schema.field("id") is not None
|
654
|
+
assert updated_schema.field("id").arrow.type == pa.int64()
|
655
|
+
assert updated_schema.field("id").id == 1
|
656
|
+
assert updated_schema.field("name") is not None
|
657
|
+
assert updated_schema.field("name").arrow.type == pa.string()
|
658
|
+
assert updated_schema.field("name").id == 2
|
659
|
+
|
660
|
+
def test_alter_table_with_update_operation(self, test_namespace):
|
661
|
+
"""Test altering a table with field update operation."""
|
662
|
+
namespace_name, catalog_properties = test_namespace
|
663
|
+
table_name = "test_alter_table_update"
|
664
|
+
|
665
|
+
# Create schema with a field to update
|
666
|
+
initial_fields = [
|
667
|
+
Field.of(
|
668
|
+
pa.field("id", pa.int64(), nullable=False),
|
669
|
+
is_merge_key=True,
|
670
|
+
field_id=1,
|
671
|
+
),
|
672
|
+
Field.of(pa.field("value", pa.int32(), nullable=True), field_id=2),
|
673
|
+
]
|
674
|
+
schema = Schema.of(initial_fields)
|
675
|
+
|
676
|
+
# Create the table
|
677
|
+
table = catalog.create_table(
|
678
|
+
table=table_name,
|
679
|
+
namespace=namespace_name,
|
680
|
+
schema=schema,
|
681
|
+
inner=catalog_properties,
|
682
|
+
)
|
683
|
+
|
684
|
+
original_schema = table.table_version.schema
|
685
|
+
|
686
|
+
# Update the value field to int64 (compatible type widening)
|
687
|
+
schema_update = original_schema.update().update_field_type("value", pa.int64())
|
688
|
+
|
689
|
+
# Alter the table
|
690
|
+
catalog.alter_table(
|
691
|
+
table=table_name,
|
692
|
+
namespace=namespace_name,
|
693
|
+
schema_updates=schema_update,
|
694
|
+
inner=catalog_properties,
|
695
|
+
)
|
696
|
+
|
697
|
+
# Get the updated table
|
698
|
+
updated_table_def = catalog.get_table(
|
699
|
+
table_name,
|
700
|
+
namespace=namespace_name,
|
701
|
+
inner=catalog_properties,
|
702
|
+
)
|
703
|
+
|
704
|
+
updated_schema = updated_table_def.table_version.schema
|
705
|
+
|
706
|
+
# Verify field was updated
|
707
|
+
assert updated_schema.field("value").arrow.type == pa.int64()
|
708
|
+
assert updated_schema.field("value").id == 2
|
709
|
+
|
710
|
+
# Verify schema ID was incremented
|
711
|
+
assert updated_schema.id == original_schema.id + 1
|
712
|
+
|
713
|
+
def test_alter_table_with_schema_evolution_disabled(self, test_namespace):
|
714
|
+
"""Test that alter_table raises TableValidationError when schema evolution is disabled."""
|
715
|
+
namespace_name, catalog_properties = test_namespace
|
716
|
+
table_name = "test_alter_table_schema_evolution_disabled"
|
717
|
+
|
718
|
+
# Create initial schema
|
719
|
+
initial_fields = [
|
720
|
+
Field.of(
|
721
|
+
pa.field("id", pa.int64(), nullable=False),
|
722
|
+
is_merge_key=True,
|
723
|
+
field_id=1,
|
724
|
+
),
|
725
|
+
Field.of(pa.field("value", pa.int32(), nullable=True), field_id=2),
|
726
|
+
]
|
727
|
+
schema = Schema.of(initial_fields)
|
728
|
+
|
729
|
+
# Create table with SCHEMA_EVOLUTION_MODE.DISABLED
|
730
|
+
table_properties = {
|
731
|
+
TableProperty.SCHEMA_EVOLUTION_MODE: SchemaEvolutionMode.DISABLED
|
732
|
+
}
|
733
|
+
|
734
|
+
table = catalog.create_table(
|
735
|
+
table=table_name,
|
736
|
+
namespace=namespace_name,
|
737
|
+
schema=schema,
|
738
|
+
table_properties=table_properties,
|
739
|
+
inner=catalog_properties,
|
740
|
+
)
|
741
|
+
|
742
|
+
original_schema = table.table_version.schema
|
743
|
+
|
744
|
+
# Try to add a new field - this should be blocked
|
745
|
+
new_field = Field.of(pa.field("description", pa.string(), nullable=True))
|
746
|
+
schema_update = original_schema.update().add_field(new_field)
|
747
|
+
|
748
|
+
# Alter table with schema updates should raise TableValidationError
|
749
|
+
with pytest.raises(
|
750
|
+
TableValidationError,
|
751
|
+
match="Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates.",
|
752
|
+
):
|
753
|
+
catalog.alter_table(
|
754
|
+
table=table_name,
|
755
|
+
namespace=namespace_name,
|
756
|
+
schema_updates=schema_update,
|
757
|
+
inner=catalog_properties,
|
758
|
+
)
|
759
|
+
|
760
|
+
# Verify the schema wasn't changed
|
761
|
+
unchanged_table_def = catalog.get_table(
|
762
|
+
table_name,
|
763
|
+
namespace=namespace_name,
|
764
|
+
inner=catalog_properties,
|
765
|
+
)
|
766
|
+
unchanged_schema = unchanged_table_def.table_version.schema
|
767
|
+
|
768
|
+
# Schema should be unchanged
|
769
|
+
assert unchanged_schema.id == original_schema.id
|
770
|
+
assert len(unchanged_schema.fields) == len(original_schema.fields)
|
771
|
+
|
772
|
+
# Verify the new field was not added
|
773
|
+
field_names = [field.arrow.name for field in unchanged_schema.fields]
|
774
|
+
assert "description" not in field_names
|
775
|
+
|
776
|
+
# Test that alter_table works without schema_updates even when schema evolution is disabled
|
777
|
+
catalog.alter_table(
|
778
|
+
table=table_name,
|
779
|
+
namespace=namespace_name,
|
780
|
+
table_description="Updated description without schema changes",
|
781
|
+
inner=catalog_properties,
|
782
|
+
)
|
783
|
+
|
784
|
+
# Verify that table description was updated but schema remains unchanged
|
785
|
+
final_table_def = catalog.get_table(
|
786
|
+
table_name,
|
787
|
+
namespace=namespace_name,
|
788
|
+
inner=catalog_properties,
|
789
|
+
)
|
790
|
+
assert (
|
791
|
+
final_table_def.table_version.description
|
792
|
+
== "Updated description without schema changes"
|
793
|
+
)
|
794
|
+
assert final_table_def.table_version.schema.id == original_schema.id
|
795
|
+
|
796
|
+
def test_drop_with_purge_validation(self, test_namespace):
|
797
|
+
"""Test that using purge flag raises ValidationError"""
|
798
|
+
namespace_name, catalog_properties = test_namespace
|
799
|
+
table_name = "test_drop_with_purge"
|
800
|
+
|
801
|
+
# Create the table
|
802
|
+
catalog.create_table(
|
803
|
+
table=table_name,
|
804
|
+
namespace=namespace_name,
|
805
|
+
inner=catalog_properties,
|
806
|
+
)
|
807
|
+
|
808
|
+
# Try to drop with purge=True, should raise ValidationError
|
809
|
+
with pytest.raises(
|
810
|
+
NotImplementedError, match="Purge flag is not currently supported"
|
811
|
+
):
|
812
|
+
catalog.drop_table(
|
813
|
+
table=table_name,
|
814
|
+
namespace=namespace_name,
|
815
|
+
purge=True,
|
816
|
+
inner=catalog_properties,
|
817
|
+
)
|
818
|
+
|
819
|
+
def test_create_table_basic(self):
|
820
|
+
"""Test basic table creation"""
|
821
|
+
table_name = "test_create_table_basic"
|
822
|
+
schema = Schema.of(
|
823
|
+
schema=pa.schema(
|
824
|
+
[
|
825
|
+
("id", pa.int64()),
|
826
|
+
("name", pa.string()),
|
827
|
+
]
|
828
|
+
)
|
829
|
+
)
|
830
|
+
|
831
|
+
table_def = catalog.create_table(
|
832
|
+
table=table_name,
|
833
|
+
namespace=self.test_namespace,
|
834
|
+
schema=schema,
|
835
|
+
inner=self.catalog_properties,
|
836
|
+
)
|
837
|
+
|
838
|
+
assert table_def.table.table_name == table_name
|
839
|
+
assert table_def.table_version.schema.equivalent_to(schema)
|
840
|
+
|
841
|
+
# Verify table exists
|
842
|
+
assert catalog.table_exists(
|
843
|
+
table=table_name,
|
844
|
+
namespace=self.test_namespace,
|
845
|
+
inner=self.catalog_properties,
|
846
|
+
)
|
847
|
+
|
848
|
+
def test_create_table_already_exists_fail_if_exists_true(self):
|
849
|
+
"""Test creating a table that already exists with fail_if_exists=True"""
|
850
|
+
table_name = "test_create_table_exists"
|
851
|
+
schema = Schema.of(schema=pa.schema([("id", pa.int64())]))
|
852
|
+
|
853
|
+
# Create table first
|
854
|
+
catalog.create_table(
|
855
|
+
table=table_name,
|
856
|
+
namespace=self.test_namespace,
|
857
|
+
schema=schema,
|
858
|
+
inner=self.catalog_properties,
|
859
|
+
)
|
860
|
+
|
861
|
+
# Try to create again with fail_if_exists=True (default)
|
862
|
+
with pytest.raises(TableAlreadyExistsError):
|
863
|
+
catalog.create_table(
|
864
|
+
table=table_name,
|
865
|
+
namespace=self.test_namespace,
|
866
|
+
schema=schema,
|
867
|
+
fail_if_exists=True,
|
868
|
+
inner=self.catalog_properties,
|
869
|
+
)
|
870
|
+
|
871
|
+
def test_create_table_already_exists_fail_if_exists_false(self):
|
872
|
+
"""Test creating a table that already exists with fail_if_exists=False"""
|
873
|
+
table_name = "test_create_table_exists_ok"
|
874
|
+
schema = Schema.of(schema=pa.schema([("id", pa.int64())]))
|
875
|
+
|
876
|
+
# Create table first
|
877
|
+
table_def1 = catalog.create_table(
|
878
|
+
table=table_name,
|
879
|
+
namespace=self.test_namespace,
|
880
|
+
schema=schema,
|
881
|
+
inner=self.catalog_properties,
|
882
|
+
)
|
883
|
+
|
884
|
+
# Create again with fail_if_exists=False should return existing table
|
885
|
+
table_def2 = catalog.create_table(
|
886
|
+
table=table_name,
|
887
|
+
namespace=self.test_namespace,
|
888
|
+
schema=schema,
|
889
|
+
fail_if_exists=False,
|
890
|
+
inner=self.catalog_properties,
|
891
|
+
)
|
892
|
+
|
893
|
+
assert table_def1.table.table_name == table_def2.table.table_name
|
894
|
+
|
895
|
+
|
896
|
+
class TestWriteToTable:
|
897
|
+
"""Test the write_to_table implementation with different modes and data types."""
|
898
|
+
|
899
|
+
@classmethod
|
900
|
+
def setup_class(cls):
|
901
|
+
cls.temp_dir = tempfile.mkdtemp()
|
902
|
+
cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)
|
903
|
+
|
904
|
+
# Create a test namespace
|
905
|
+
cls.test_namespace = "test_write_to_table"
|
906
|
+
catalog.create_namespace(
|
907
|
+
namespace=cls.test_namespace, inner=cls.catalog_properties
|
908
|
+
)
|
909
|
+
|
910
|
+
@classmethod
|
911
|
+
def teardown_class(cls):
|
912
|
+
shutil.rmtree(cls.temp_dir)
|
913
|
+
|
914
|
+
def _create_test_pandas_data(self):
|
915
|
+
"""Create test pandas DataFrame"""
|
916
|
+
return pd.DataFrame(
|
917
|
+
{
|
918
|
+
"id": [1, 2, 3, 4, 5],
|
919
|
+
"name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
|
920
|
+
"age": [25, 30, 35, 40, 45],
|
921
|
+
"city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
|
922
|
+
}
|
923
|
+
)
|
924
|
+
|
925
|
+
def _create_test_pyarrow_data(self):
|
926
|
+
"""Create test PyArrow Table"""
|
927
|
+
return pa.table(
|
928
|
+
{
|
929
|
+
"id": [1, 2, 3, 4, 5],
|
930
|
+
"name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
|
931
|
+
"age": [25, 30, 35, 40, 45],
|
932
|
+
"city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
|
933
|
+
}
|
934
|
+
)
|
935
|
+
|
936
|
+
def _create_test_polars_data(self):
|
937
|
+
"""Create test Polars DataFrame"""
|
938
|
+
return pl.DataFrame(
|
939
|
+
{
|
940
|
+
"id": [1, 2, 3, 4, 5],
|
941
|
+
"name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
|
942
|
+
"age": [25, 30, 35, 40, 45],
|
943
|
+
"city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
|
944
|
+
}
|
945
|
+
)
|
946
|
+
|
947
|
+
def _create_second_batch_pandas_data(self):
|
948
|
+
"""Create second batch of test data for append tests"""
|
949
|
+
return pd.DataFrame(
|
950
|
+
{
|
951
|
+
"id": [6, 7, 8],
|
952
|
+
"name": ["Frank", "Grace", "Henry"],
|
953
|
+
"age": [50, 55, 60],
|
954
|
+
"city": ["Boston", "Seattle", "Denver"],
|
955
|
+
}
|
956
|
+
)
|
957
|
+
|
958
|
+
def _create_test_ray_data(self):
|
959
|
+
"""Create test Ray Dataset for schema inference testing."""
|
960
|
+
import ray
|
961
|
+
|
962
|
+
# Initialize Ray if not already initialized
|
963
|
+
# Note: Use distributed mode (not local_mode=True) to avoid Ray 2.46.0 internal bug
|
964
|
+
if not ray.is_initialized():
|
965
|
+
ray.init()
|
966
|
+
|
967
|
+
data = pa.table(
|
968
|
+
{
|
969
|
+
"id": [1, 2, 3, 4, 5],
|
970
|
+
"name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
|
971
|
+
"age": [25, 30, 35, 40, 45],
|
972
|
+
"city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
|
973
|
+
}
|
974
|
+
)
|
975
|
+
return rd.from_arrow(data)
|
976
|
+
|
977
|
+
def _create_test_daft_data(self):
|
978
|
+
"""Create test Daft DataFrame for schema inference testing."""
|
979
|
+
data = {
|
980
|
+
"id": [1, 2, 3],
|
981
|
+
"name": ["Alice", "Bob", "Charlie"],
|
982
|
+
"age": [25, 30, 35],
|
983
|
+
"city": ["NYC", "LA", "Chicago"],
|
984
|
+
}
|
985
|
+
return daft.from_pydict(data)
|
986
|
+
|
987
|
+
def _create_test_numpy_1d_data(self):
|
988
|
+
"""Create test 1D numpy array for schema inference testing."""
|
989
|
+
return np.array([1, 2, 3, 4, 5])
|
990
|
+
|
991
|
+
def _create_test_numpy_2d_data(self):
|
992
|
+
"""Create test 2D numpy array for schema inference testing."""
|
993
|
+
return np.array([[1, 25], [2, 30], [3, 35]], dtype=np.int64)
|
994
|
+
|
995
|
+
def _create_table_with_merge_keys(self, table_name: str):
|
996
|
+
"""Create a table with merge keys for testing MERGE mode"""
|
997
|
+
from deltacat.storage.model.schema import Schema, Field
|
998
|
+
|
999
|
+
# Create schema with merge keys
|
1000
|
+
schema = Schema.of(
|
1001
|
+
[
|
1002
|
+
Field.of(pa.field("id", pa.int64()), is_merge_key=True), # merge key
|
1003
|
+
Field.of(pa.field("name", pa.string())),
|
1004
|
+
Field.of(pa.field("age", pa.int32())),
|
1005
|
+
Field.of(pa.field("city", pa.string())),
|
1006
|
+
]
|
1007
|
+
)
|
1008
|
+
|
1009
|
+
catalog.create_table(
|
1010
|
+
table=table_name,
|
1011
|
+
namespace=self.test_namespace,
|
1012
|
+
schema=schema,
|
1013
|
+
inner=self.catalog_properties,
|
1014
|
+
)
|
1015
|
+
|
1016
|
+
return schema
|
1017
|
+
|
1018
|
+
def _create_table_without_merge_keys(self, table_name: str):
|
1019
|
+
"""Create a table without merge keys for testing APPEND mode"""
|
1020
|
+
# Use schema inference with no merge keys
|
1021
|
+
data = self._create_test_pandas_data()
|
1022
|
+
catalog.write_to_table(
|
1023
|
+
data=data,
|
1024
|
+
table=table_name,
|
1025
|
+
namespace=self.test_namespace,
|
1026
|
+
mode=TableWriteMode.CREATE,
|
1027
|
+
inner=self.catalog_properties,
|
1028
|
+
)
|
1029
|
+
|
1030
|
+
# Test TableWriteMode.AUTO
|
1031
|
+
def test_write_to_table_auto_create_new_table_pandas(self):
|
1032
|
+
"""Test AUTO mode creating a new table with pandas data"""
|
1033
|
+
table_name = "test_auto_create_pandas"
|
1034
|
+
data = self._create_test_pandas_data()
|
1035
|
+
|
1036
|
+
# Table doesn't exist, AUTO should create it
|
1037
|
+
catalog.write_to_table(
|
1038
|
+
data=data,
|
1039
|
+
table=table_name,
|
1040
|
+
namespace=self.test_namespace,
|
1041
|
+
mode=TableWriteMode.AUTO,
|
1042
|
+
inner=self.catalog_properties,
|
1043
|
+
)
|
1044
|
+
|
1045
|
+
# Verify table was created
|
1046
|
+
assert catalog.table_exists(
|
1047
|
+
table=table_name,
|
1048
|
+
namespace=self.test_namespace,
|
1049
|
+
inner=self.catalog_properties,
|
1050
|
+
)
|
1051
|
+
|
1052
|
+
# Verify table has correct schema
|
1053
|
+
table_def = catalog.get_table(
|
1054
|
+
table=table_name,
|
1055
|
+
namespace=self.test_namespace,
|
1056
|
+
inner=self.catalog_properties,
|
1057
|
+
)
|
1058
|
+
assert table_def.table_version.schema is not None
|
1059
|
+
|
1060
|
+
def test_write_to_table_auto_create_new_table_pyarrow(self):
|
1061
|
+
"""Test AUTO mode creating a new table with PyArrow data"""
|
1062
|
+
table_name = "test_auto_create_pyarrow"
|
1063
|
+
data = self._create_test_pyarrow_data()
|
1064
|
+
|
1065
|
+
catalog.write_to_table(
|
1066
|
+
data=data,
|
1067
|
+
table=table_name,
|
1068
|
+
namespace=self.test_namespace,
|
1069
|
+
mode=TableWriteMode.AUTO,
|
1070
|
+
inner=self.catalog_properties,
|
1071
|
+
)
|
1072
|
+
|
1073
|
+
assert catalog.table_exists(
|
1074
|
+
table=table_name,
|
1075
|
+
namespace=self.test_namespace,
|
1076
|
+
inner=self.catalog_properties,
|
1077
|
+
)
|
1078
|
+
|
1079
|
+
def test_write_to_table_auto_create_new_table_polars(self):
|
1080
|
+
"""Test AUTO mode creating a new table with Polars data"""
|
1081
|
+
table_name = "test_auto_create_polars"
|
1082
|
+
data = self._create_test_polars_data()
|
1083
|
+
|
1084
|
+
catalog.write_to_table(
|
1085
|
+
data=data,
|
1086
|
+
table=table_name,
|
1087
|
+
namespace=self.test_namespace,
|
1088
|
+
mode=TableWriteMode.AUTO,
|
1089
|
+
inner=self.catalog_properties,
|
1090
|
+
)
|
1091
|
+
|
1092
|
+
assert catalog.table_exists(
|
1093
|
+
table=table_name,
|
1094
|
+
namespace=self.test_namespace,
|
1095
|
+
inner=self.catalog_properties,
|
1096
|
+
)
|
1097
|
+
|
1098
|
+
def test_write_to_table_auto_append_existing_table(self):
|
1099
|
+
"""Test AUTO mode appending to existing table"""
|
1100
|
+
table_name = "test_auto_append"
|
1101
|
+
data1 = self._create_test_pandas_data()
|
1102
|
+
data2 = self._create_second_batch_pandas_data()
|
1103
|
+
|
1104
|
+
# First write creates table
|
1105
|
+
catalog.write_to_table(
|
1106
|
+
data=data1,
|
1107
|
+
table=table_name,
|
1108
|
+
namespace=self.test_namespace,
|
1109
|
+
mode=TableWriteMode.AUTO,
|
1110
|
+
inner=self.catalog_properties,
|
1111
|
+
)
|
1112
|
+
|
1113
|
+
# Second write should append
|
1114
|
+
catalog.write_to_table(
|
1115
|
+
data=data2,
|
1116
|
+
table=table_name,
|
1117
|
+
namespace=self.test_namespace,
|
1118
|
+
mode=TableWriteMode.AUTO,
|
1119
|
+
inner=self.catalog_properties,
|
1120
|
+
)
|
1121
|
+
|
1122
|
+
# Verify table still exists
|
1123
|
+
assert catalog.table_exists(
|
1124
|
+
table=table_name,
|
1125
|
+
namespace=self.test_namespace,
|
1126
|
+
inner=self.catalog_properties,
|
1127
|
+
)
|
1128
|
+
|
1129
|
+
# Test TableWriteMode.CREATE
|
1130
|
+
def test_write_to_table_create_new_table(self):
|
1131
|
+
"""Test CREATE mode with new table"""
|
1132
|
+
table_name = "test_create_new"
|
1133
|
+
data = self._create_test_pandas_data()
|
1134
|
+
|
1135
|
+
catalog.write_to_table(
|
1136
|
+
data=data,
|
1137
|
+
table=table_name,
|
1138
|
+
namespace=self.test_namespace,
|
1139
|
+
mode=TableWriteMode.CREATE,
|
1140
|
+
inner=self.catalog_properties,
|
1141
|
+
)
|
1142
|
+
|
1143
|
+
assert catalog.table_exists(
|
1144
|
+
table=table_name,
|
1145
|
+
namespace=self.test_namespace,
|
1146
|
+
inner=self.catalog_properties,
|
1147
|
+
)
|
1148
|
+
|
1149
|
+
def test_write_to_table_create_existing_table_fails(self):
|
1150
|
+
"""Test CREATE mode fails when table exists"""
|
1151
|
+
table_name = "test_create_fail"
|
1152
|
+
data = self._create_test_pandas_data()
|
1153
|
+
|
1154
|
+
# Create table first
|
1155
|
+
catalog.write_to_table(
|
1156
|
+
data=data,
|
1157
|
+
table=table_name,
|
1158
|
+
namespace=self.test_namespace,
|
1159
|
+
mode=TableWriteMode.CREATE,
|
1160
|
+
inner=self.catalog_properties,
|
1161
|
+
)
|
1162
|
+
|
1163
|
+
# Try to create again should fail
|
1164
|
+
with pytest.raises(
|
1165
|
+
TableAlreadyExistsError, match="already exists and mode is CREATE"
|
1166
|
+
):
|
1167
|
+
catalog.write_to_table(
|
1168
|
+
data=data,
|
1169
|
+
table=table_name,
|
1170
|
+
namespace=self.test_namespace,
|
1171
|
+
mode=TableWriteMode.CREATE,
|
1172
|
+
inner=self.catalog_properties,
|
1173
|
+
)
|
1174
|
+
|
1175
|
+
# Test TableWriteMode.APPEND
|
1176
|
+
def test_write_to_table_append_existing_table(self):
|
1177
|
+
"""Test APPEND mode with existing table"""
|
1178
|
+
table_name = "test_append_existing"
|
1179
|
+
data1 = self._create_test_pandas_data()
|
1180
|
+
data2 = self._create_second_batch_pandas_data()
|
1181
|
+
|
1182
|
+
# Create table first
|
1183
|
+
catalog.write_to_table(
|
1184
|
+
data=data1,
|
1185
|
+
table=table_name,
|
1186
|
+
namespace=self.test_namespace,
|
1187
|
+
mode=TableWriteMode.CREATE,
|
1188
|
+
inner=self.catalog_properties,
|
1189
|
+
)
|
1190
|
+
|
1191
|
+
# Append to existing table
|
1192
|
+
catalog.write_to_table(
|
1193
|
+
data=data2,
|
1194
|
+
table=table_name,
|
1195
|
+
namespace=self.test_namespace,
|
1196
|
+
mode=TableWriteMode.APPEND,
|
1197
|
+
inner=self.catalog_properties,
|
1198
|
+
)
|
1199
|
+
|
1200
|
+
def test_write_to_table_append_nonexistent_table_fails(self):
|
1201
|
+
"""Test APPEND mode fails when table doesn't exist"""
|
1202
|
+
table_name = "test_append_fail"
|
1203
|
+
data = self._create_test_pandas_data()
|
1204
|
+
|
1205
|
+
with pytest.raises(
|
1206
|
+
TableNotFoundError,
|
1207
|
+
match="does not exist and write mode is append. Use CREATE or AUTO mode",
|
1208
|
+
):
|
1209
|
+
catalog.write_to_table(
|
1210
|
+
data=data,
|
1211
|
+
table=table_name,
|
1212
|
+
namespace=self.test_namespace,
|
1213
|
+
mode=TableWriteMode.APPEND,
|
1214
|
+
inner=self.catalog_properties,
|
1215
|
+
)
|
1216
|
+
|
1217
|
+
def test_write_to_table_append_with_merge_keys_fails(self):
|
1218
|
+
"""Test APPEND mode fails when table has merge keys"""
|
1219
|
+
table_name = "test_append_with_merge_keys"
|
1220
|
+
|
1221
|
+
# Create a table with merge keys
|
1222
|
+
self._create_table_with_merge_keys(table_name)
|
1223
|
+
|
1224
|
+
# Create test data that matches the schema
|
1225
|
+
data = pd.DataFrame(
|
1226
|
+
{
|
1227
|
+
"id": [1, 2, 3],
|
1228
|
+
"name": ["Alice", "Bob", "Charlie"],
|
1229
|
+
"age": [25, 30, 35],
|
1230
|
+
"city": ["NYC", "LA", "Chicago"],
|
1231
|
+
}
|
1232
|
+
)
|
1233
|
+
|
1234
|
+
# APPEND mode should fail since table has merge keys
|
1235
|
+
with pytest.raises(
|
1236
|
+
SchemaValidationError,
|
1237
|
+
match="APPEND mode cannot be used with tables that have merge keys",
|
1238
|
+
):
|
1239
|
+
catalog.write_to_table(
|
1240
|
+
data=data,
|
1241
|
+
table=table_name,
|
1242
|
+
namespace=self.test_namespace,
|
1243
|
+
mode=TableWriteMode.APPEND,
|
1244
|
+
inner=self.catalog_properties,
|
1245
|
+
)
|
1246
|
+
|
1247
|
+
def test_write_to_table_append_without_merge_keys_succeeds(self):
|
1248
|
+
"""Test APPEND mode works when table has no merge keys"""
|
1249
|
+
table_name = "test_append_no_merge_keys"
|
1250
|
+
|
1251
|
+
# Create a table without merge keys
|
1252
|
+
self._create_table_without_merge_keys(table_name)
|
1253
|
+
|
1254
|
+
# Add more data to the table
|
1255
|
+
data2 = self._create_second_batch_pandas_data()
|
1256
|
+
|
1257
|
+
# APPEND mode should work since table has no merge keys
|
1258
|
+
catalog.write_to_table(
|
1259
|
+
data=data2,
|
1260
|
+
table=table_name,
|
1261
|
+
namespace=self.test_namespace,
|
1262
|
+
mode=TableWriteMode.APPEND,
|
1263
|
+
inner=self.catalog_properties,
|
1264
|
+
)
|
1265
|
+
|
1266
|
+
# Table should still exist
|
1267
|
+
assert catalog.table_exists(
|
1268
|
+
table=table_name,
|
1269
|
+
namespace=self.test_namespace,
|
1270
|
+
inner=self.catalog_properties,
|
1271
|
+
)
|
1272
|
+
|
1273
|
+
# Test explicit schema specification
|
1274
|
+
def test_write_to_table_explicit_schema(self):
|
1275
|
+
"""Test writing with explicit schema specification"""
|
1276
|
+
table_name = "test_explicit_schema"
|
1277
|
+
data = self._create_test_pandas_data()
|
1278
|
+
|
1279
|
+
# Define explicit schema with COERCE consistency types to preserve exact types
|
1280
|
+
explicit_schema = Schema.of(
|
1281
|
+
schema=[
|
1282
|
+
Field.of(
|
1283
|
+
pa.field("id", pa.int64()),
|
1284
|
+
consistency_type=SchemaConsistencyType.COERCE,
|
1285
|
+
),
|
1286
|
+
Field.of(
|
1287
|
+
pa.field("name", pa.string()),
|
1288
|
+
consistency_type=SchemaConsistencyType.COERCE,
|
1289
|
+
),
|
1290
|
+
Field.of(
|
1291
|
+
pa.field("age", pa.int32()),
|
1292
|
+
consistency_type=SchemaConsistencyType.COERCE,
|
1293
|
+
), # Different from inferred schema
|
1294
|
+
Field.of(
|
1295
|
+
pa.field("city", pa.string()),
|
1296
|
+
consistency_type=SchemaConsistencyType.COERCE,
|
1297
|
+
),
|
1298
|
+
]
|
1299
|
+
)
|
1300
|
+
|
1301
|
+
catalog.write_to_table(
|
1302
|
+
data=data,
|
1303
|
+
table=table_name,
|
1304
|
+
namespace=self.test_namespace,
|
1305
|
+
mode=TableWriteMode.CREATE,
|
1306
|
+
schema=explicit_schema,
|
1307
|
+
inner=self.catalog_properties,
|
1308
|
+
)
|
1309
|
+
|
1310
|
+
# Verify schema was used
|
1311
|
+
table_def = catalog.get_table(
|
1312
|
+
table=table_name,
|
1313
|
+
namespace=self.test_namespace,
|
1314
|
+
inner=self.catalog_properties,
|
1315
|
+
)
|
1316
|
+
assert table_def.table_version.schema.equivalent_to(explicit_schema)
|
1317
|
+
|
1318
|
+
def test_write_to_table_explicit_schema_none(self):
|
1319
|
+
"""Test writing with explicit schema=None to create schemaless table"""
|
1320
|
+
table_name = "test_explicit_schema_none"
|
1321
|
+
data = self._create_test_pandas_data()
|
1322
|
+
|
1323
|
+
catalog.write_to_table(
|
1324
|
+
data=data,
|
1325
|
+
table=table_name,
|
1326
|
+
namespace=self.test_namespace,
|
1327
|
+
mode=TableWriteMode.CREATE,
|
1328
|
+
schema=None, # Explicitly set schema=None
|
1329
|
+
inner=self.catalog_properties,
|
1330
|
+
)
|
1331
|
+
|
1332
|
+
# Verify table was created with schema=None (schemaless)
|
1333
|
+
table_def = catalog.get_table(
|
1334
|
+
table=table_name,
|
1335
|
+
namespace=self.test_namespace,
|
1336
|
+
inner=self.catalog_properties,
|
1337
|
+
)
|
1338
|
+
|
1339
|
+
# The table should exist but have a None/empty schema
|
1340
|
+
assert table_def is not None
|
1341
|
+
# Note: The exact behavior of schemaless tables may vary by storage implementation
|
1342
|
+
# We're mainly testing that the create_table call succeeded with schema=None
|
1343
|
+
|
1344
|
+
def test_schema_behavior_comparison(self):
|
1345
|
+
"""Test that demonstrates the difference between no schema vs explicit schema=None"""
|
1346
|
+
data = self._create_test_pandas_data()
|
1347
|
+
|
1348
|
+
# Case 1: No schema argument - should infer schema
|
1349
|
+
table_name_inferred = "test_schema_inferred"
|
1350
|
+
catalog.write_to_table(
|
1351
|
+
data=data,
|
1352
|
+
table=table_name_inferred,
|
1353
|
+
namespace=self.test_namespace,
|
1354
|
+
mode=TableWriteMode.CREATE,
|
1355
|
+
# No schema argument provided - should infer from data
|
1356
|
+
inner=self.catalog_properties,
|
1357
|
+
)
|
1358
|
+
|
1359
|
+
# Case 2: Explicit schema=None - should create schemaless table
|
1360
|
+
table_name_schemaless = "test_schema_none"
|
1361
|
+
catalog.write_to_table(
|
1362
|
+
data=data,
|
1363
|
+
table=table_name_schemaless,
|
1364
|
+
namespace=self.test_namespace,
|
1365
|
+
mode=TableWriteMode.CREATE,
|
1366
|
+
schema=None, # Explicitly set schema=None
|
1367
|
+
inner=self.catalog_properties,
|
1368
|
+
)
|
1369
|
+
|
1370
|
+
# Verify both tables were created
|
1371
|
+
table_inferred = catalog.get_table(
|
1372
|
+
table=table_name_inferred,
|
1373
|
+
namespace=self.test_namespace,
|
1374
|
+
inner=self.catalog_properties,
|
1375
|
+
)
|
1376
|
+
|
1377
|
+
table_schemaless = catalog.get_table(
|
1378
|
+
table=table_name_schemaless,
|
1379
|
+
namespace=self.test_namespace,
|
1380
|
+
inner=self.catalog_properties,
|
1381
|
+
)
|
1382
|
+
|
1383
|
+
# Both tables should exist
|
1384
|
+
assert table_inferred is not None
|
1385
|
+
assert table_schemaless is not None
|
1386
|
+
|
1387
|
+
# The inferred schema table should have a schema with the expected columns
|
1388
|
+
inferred_schema = table_inferred.table_version.schema.arrow
|
1389
|
+
assert "id" in inferred_schema.names
|
1390
|
+
assert "name" in inferred_schema.names
|
1391
|
+
assert "age" in inferred_schema.names
|
1392
|
+
assert "city" in inferred_schema.names
|
1393
|
+
|
1394
|
+
# Test schema inference from different data types
|
1395
|
+
def test_schema_inference_pandas(self):
|
1396
|
+
"""Test schema inference from pandas DataFrame"""
|
1397
|
+
table_name = "test_schema_inference_pandas"
|
1398
|
+
data = pd.DataFrame(
|
1399
|
+
{
|
1400
|
+
"int_col": [1, 2, 3],
|
1401
|
+
"float_col": [1.1, 2.2, 3.3],
|
1402
|
+
"str_col": ["a", "b", "c"],
|
1403
|
+
"bool_col": [True, False, True],
|
1404
|
+
}
|
1405
|
+
)
|
1406
|
+
|
1407
|
+
catalog.write_to_table(
|
1408
|
+
data=data,
|
1409
|
+
table=table_name,
|
1410
|
+
namespace=self.test_namespace,
|
1411
|
+
mode=TableWriteMode.CREATE,
|
1412
|
+
inner=self.catalog_properties,
|
1413
|
+
)
|
1414
|
+
|
1415
|
+
table_def = catalog.get_table(
|
1416
|
+
table=table_name,
|
1417
|
+
namespace=self.test_namespace,
|
1418
|
+
inner=self.catalog_properties,
|
1419
|
+
)
|
1420
|
+
|
1421
|
+
schema = table_def.table_version.schema.arrow
|
1422
|
+
assert "int_col" in schema.names
|
1423
|
+
assert "float_col" in schema.names
|
1424
|
+
assert "str_col" in schema.names
|
1425
|
+
assert "bool_col" in schema.names
|
1426
|
+
|
1427
|
+
def test_schema_inference_pyarrow(self):
|
1428
|
+
"""Test schema inference from PyArrow Table"""
|
1429
|
+
table_name = "test_schema_inference_pyarrow"
|
1430
|
+
data = pa.table(
|
1431
|
+
{
|
1432
|
+
"int64_col": pa.array([1, 2, 3], type=pa.int64()),
|
1433
|
+
"string_col": pa.array(["x", "y", "z"], type=pa.string()),
|
1434
|
+
"double_col": pa.array([1.1, 2.2, 3.3], type=pa.float64()),
|
1435
|
+
}
|
1436
|
+
)
|
1437
|
+
|
1438
|
+
catalog.write_to_table(
|
1439
|
+
data=data,
|
1440
|
+
table=table_name,
|
1441
|
+
namespace=self.test_namespace,
|
1442
|
+
mode=TableWriteMode.CREATE,
|
1443
|
+
inner=self.catalog_properties,
|
1444
|
+
)
|
1445
|
+
|
1446
|
+
table_def = catalog.get_table(
|
1447
|
+
table=table_name,
|
1448
|
+
namespace=self.test_namespace,
|
1449
|
+
inner=self.catalog_properties,
|
1450
|
+
)
|
1451
|
+
|
1452
|
+
schema = table_def.table_version.schema.arrow
|
1453
|
+
assert schema.field("int64_col").type == pa.int64()
|
1454
|
+
assert schema.field("string_col").type == pa.string()
|
1455
|
+
assert schema.field("double_col").type == pa.float64()
|
1456
|
+
|
1457
|
+
def test_schema_inference_polars(self):
|
1458
|
+
"""Test schema inference from Polars DataFrame"""
|
1459
|
+
table_name = "test_schema_inference_polars"
|
1460
|
+
data = pl.DataFrame(
|
1461
|
+
{
|
1462
|
+
"int_col": [1, 2, 3],
|
1463
|
+
"str_col": ["a", "b", "c"],
|
1464
|
+
"float_col": [1.1, 2.2, 3.3],
|
1465
|
+
}
|
1466
|
+
)
|
1467
|
+
|
1468
|
+
catalog.write_to_table(
|
1469
|
+
data=data,
|
1470
|
+
table=table_name,
|
1471
|
+
namespace=self.test_namespace,
|
1472
|
+
mode=TableWriteMode.CREATE,
|
1473
|
+
inner=self.catalog_properties,
|
1474
|
+
)
|
1475
|
+
|
1476
|
+
table_def = catalog.get_table(
|
1477
|
+
table=table_name,
|
1478
|
+
namespace=self.test_namespace,
|
1479
|
+
inner=self.catalog_properties,
|
1480
|
+
)
|
1481
|
+
|
1482
|
+
schema = table_def.table_version.schema.arrow
|
1483
|
+
assert "int_col" in schema.names
|
1484
|
+
assert "str_col" in schema.names
|
1485
|
+
assert "float_col" in schema.names
|
1486
|
+
|
1487
|
+
def test_schema_inference_ray_dataset(self):
|
1488
|
+
"""Test schema inference from Ray Dataset"""
|
1489
|
+
table_name = "test_schema_inference_ray"
|
1490
|
+
ray_data = self._create_test_ray_data()
|
1491
|
+
|
1492
|
+
catalog.write_to_table(
|
1493
|
+
data=ray_data,
|
1494
|
+
table=table_name,
|
1495
|
+
namespace=self.test_namespace,
|
1496
|
+
mode=TableWriteMode.CREATE,
|
1497
|
+
inner=self.catalog_properties,
|
1498
|
+
)
|
1499
|
+
|
1500
|
+
table_def = catalog.get_table(
|
1501
|
+
table=table_name,
|
1502
|
+
namespace=self.test_namespace,
|
1503
|
+
inner=self.catalog_properties,
|
1504
|
+
)
|
1505
|
+
|
1506
|
+
schema = table_def.table_version.schema.arrow
|
1507
|
+
assert "id" in schema.names
|
1508
|
+
assert "name" in schema.names
|
1509
|
+
assert "age" in schema.names
|
1510
|
+
assert "city" in schema.names
|
1511
|
+
|
1512
|
+
def test_schema_inference_daft_dataframe(self):
|
1513
|
+
"""Test schema inference from Daft DataFrame"""
|
1514
|
+
table_name = "test_schema_inference_daft"
|
1515
|
+
data = self._create_test_daft_data()
|
1516
|
+
|
1517
|
+
catalog.write_to_table(
|
1518
|
+
data=data,
|
1519
|
+
table=table_name,
|
1520
|
+
namespace=self.test_namespace,
|
1521
|
+
mode=TableWriteMode.CREATE,
|
1522
|
+
inner=self.catalog_properties,
|
1523
|
+
)
|
1524
|
+
|
1525
|
+
table_def = catalog.get_table(
|
1526
|
+
table=table_name,
|
1527
|
+
namespace=self.test_namespace,
|
1528
|
+
inner=self.catalog_properties,
|
1529
|
+
)
|
1530
|
+
|
1531
|
+
schema = table_def.table_version.schema.arrow
|
1532
|
+
assert "id" in schema.names
|
1533
|
+
assert "name" in schema.names
|
1534
|
+
assert "age" in schema.names
|
1535
|
+
assert "city" in schema.names
|
1536
|
+
|
1537
|
+
def test_schema_inference_numpy_1d(self):
|
1538
|
+
"""Test schema inference from 1D numpy array"""
|
1539
|
+
table_name = "test_schema_inference_numpy_1d"
|
1540
|
+
data = self._create_test_numpy_1d_data()
|
1541
|
+
|
1542
|
+
catalog.write_to_table(
|
1543
|
+
data=data,
|
1544
|
+
table=table_name,
|
1545
|
+
namespace=self.test_namespace,
|
1546
|
+
mode=TableWriteMode.CREATE,
|
1547
|
+
inner=self.catalog_properties,
|
1548
|
+
)
|
1549
|
+
|
1550
|
+
table_def = catalog.get_table(
|
1551
|
+
table=table_name,
|
1552
|
+
namespace=self.test_namespace,
|
1553
|
+
inner=self.catalog_properties,
|
1554
|
+
)
|
1555
|
+
|
1556
|
+
schema = table_def.table_version.schema.arrow
|
1557
|
+
assert (
|
1558
|
+
"0" in schema.names
|
1559
|
+
) # pandas converts 1D numpy array with column name "0"
|
1560
|
+
assert len(schema.names) == 1
|
1561
|
+
|
1562
|
+
def test_schema_inference_numpy_2d(self):
|
1563
|
+
"""Test schema inference from 2D numpy array"""
|
1564
|
+
table_name = "test_schema_inference_numpy_2d"
|
1565
|
+
data = self._create_test_numpy_2d_data()
|
1566
|
+
|
1567
|
+
catalog.write_to_table(
|
1568
|
+
data=data,
|
1569
|
+
table=table_name,
|
1570
|
+
namespace=self.test_namespace,
|
1571
|
+
mode=TableWriteMode.CREATE,
|
1572
|
+
inner=self.catalog_properties,
|
1573
|
+
)
|
1574
|
+
|
1575
|
+
table_def = catalog.get_table(
|
1576
|
+
table=table_name,
|
1577
|
+
namespace=self.test_namespace,
|
1578
|
+
inner=self.catalog_properties,
|
1579
|
+
)
|
1580
|
+
|
1581
|
+
schema = table_def.table_version.schema.arrow
|
1582
|
+
assert (
|
1583
|
+
"0" in schema.names
|
1584
|
+
) # pandas converts 2D numpy array with column names "0", "1"
|
1585
|
+
assert "1" in schema.names
|
1586
|
+
assert len(schema.names) == 2
|
1587
|
+
|
1588
|
+
def test_numpy_3d_array_error(self):
|
1589
|
+
"""Test that 3D numpy arrays raise an error"""
|
1590
|
+
table_name = "test_numpy_3d_error"
|
1591
|
+
data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # 3D array
|
1592
|
+
|
1593
|
+
with pytest.raises(
|
1594
|
+
ValueError, match="NumPy arrays with 3 dimensions are not supported"
|
1595
|
+
):
|
1596
|
+
catalog.write_to_table(
|
1597
|
+
data=data,
|
1598
|
+
table=table_name,
|
1599
|
+
namespace=self.test_namespace,
|
1600
|
+
mode=TableWriteMode.CREATE,
|
1601
|
+
inner=self.catalog_properties,
|
1602
|
+
)
|
1603
|
+
|
1604
|
+
# Test different content types
|
1605
|
+
def test_write_to_table_different_content_types(self):
|
1606
|
+
"""Test writing with different content types"""
|
1607
|
+
data = self._create_test_pandas_data()
|
1608
|
+
|
1609
|
+
content_types = [
|
1610
|
+
ContentType.PARQUET,
|
1611
|
+
ContentType.CSV,
|
1612
|
+
ContentType.JSON,
|
1613
|
+
]
|
1614
|
+
|
1615
|
+
for i, content_type in enumerate(content_types):
|
1616
|
+
table_name = f"test_content_type_{content_type.value}_{i}"
|
1617
|
+
|
1618
|
+
catalog.write_to_table(
|
1619
|
+
data=data,
|
1620
|
+
table=table_name,
|
1621
|
+
namespace=self.test_namespace,
|
1622
|
+
mode=TableWriteMode.CREATE,
|
1623
|
+
content_type=content_type,
|
1624
|
+
inner=self.catalog_properties,
|
1625
|
+
schema=None,
|
1626
|
+
)
|
1627
|
+
|
1628
|
+
assert catalog.table_exists(
|
1629
|
+
table=table_name,
|
1630
|
+
namespace=self.test_namespace,
|
1631
|
+
inner=self.catalog_properties,
|
1632
|
+
)
|
1633
|
+
|
1634
|
+
# Test table creation parameters
|
1635
|
+
def test_write_to_table_with_table_properties(self):
|
1636
|
+
"""Test writing with table creation parameters"""
|
1637
|
+
table_name = "test_table_properties"
|
1638
|
+
data = self._create_test_pandas_data()
|
1639
|
+
|
1640
|
+
catalog.write_to_table(
|
1641
|
+
data=data,
|
1642
|
+
table=table_name,
|
1643
|
+
namespace=self.test_namespace,
|
1644
|
+
mode=TableWriteMode.CREATE,
|
1645
|
+
table_description="Test table with properties",
|
1646
|
+
lifecycle_state=LifecycleState.ACTIVE,
|
1647
|
+
inner=self.catalog_properties,
|
1648
|
+
)
|
1649
|
+
|
1650
|
+
table_def = catalog.get_table(
|
1651
|
+
table=table_name,
|
1652
|
+
namespace=self.test_namespace,
|
1653
|
+
inner=self.catalog_properties,
|
1654
|
+
)
|
1655
|
+
|
1656
|
+
assert table_def.table.description == "Test table with properties"
|
1657
|
+
# Note: lifecycle_state defaults to ACTIVE in create_table, but may be overridden
|
1658
|
+
# We'll accept either ACTIVE or CREATED as both are valid for our test purpose
|
1659
|
+
assert table_def.table_version.state in [
|
1660
|
+
LifecycleState.ACTIVE,
|
1661
|
+
LifecycleState.CREATED,
|
1662
|
+
]
|
1663
|
+
|
1664
|
+
# Test error conditions
|
1665
|
+
def test_write_to_table_unsupported_data_type(self):
|
1666
|
+
"""Test error when data type cannot be inferred"""
|
1667
|
+
table_name = "test_unsupported_data"
|
1668
|
+
|
1669
|
+
# Use a plain dict which doesn't have schema inference
|
1670
|
+
unsupported_data = {"key": "value"}
|
1671
|
+
|
1672
|
+
with pytest.raises(
|
1673
|
+
ValueError, match="No schema inference function found for table type"
|
1674
|
+
):
|
1675
|
+
catalog.write_to_table(
|
1676
|
+
data=unsupported_data,
|
1677
|
+
table=table_name,
|
1678
|
+
namespace=self.test_namespace,
|
1679
|
+
mode=TableWriteMode.CREATE,
|
1680
|
+
inner=self.catalog_properties,
|
1681
|
+
)
|
1682
|
+
|
1683
|
+
def test_write_to_table_replace_mode(self):
|
1684
|
+
"""Test REPLACE mode creating a new stream to replace existing data"""
|
1685
|
+
table_name = "test_replace_mode"
|
1686
|
+
data1 = self._create_test_pandas_data()
|
1687
|
+
data2 = self._create_second_batch_pandas_data()
|
1688
|
+
|
1689
|
+
# First, create the table
|
1690
|
+
catalog.write_to_table(
|
1691
|
+
data=data1,
|
1692
|
+
table=table_name,
|
1693
|
+
namespace=self.test_namespace,
|
1694
|
+
mode=TableWriteMode.CREATE,
|
1695
|
+
inner=self.catalog_properties,
|
1696
|
+
)
|
1697
|
+
|
1698
|
+
# Verify table exists
|
1699
|
+
assert catalog.table_exists(
|
1700
|
+
table=table_name,
|
1701
|
+
namespace=self.test_namespace,
|
1702
|
+
inner=self.catalog_properties,
|
1703
|
+
)
|
1704
|
+
|
1705
|
+
# Now use REPLACE mode to replace all existing data
|
1706
|
+
catalog.write_to_table(
|
1707
|
+
data=data2,
|
1708
|
+
table=table_name,
|
1709
|
+
namespace=self.test_namespace,
|
1710
|
+
mode=TableWriteMode.REPLACE,
|
1711
|
+
inner=self.catalog_properties,
|
1712
|
+
)
|
1713
|
+
|
1714
|
+
# Table should still exist
|
1715
|
+
assert catalog.table_exists(
|
1716
|
+
table=table_name,
|
1717
|
+
namespace=self.test_namespace,
|
1718
|
+
inner=self.catalog_properties,
|
1719
|
+
)
|
1720
|
+
|
1721
|
+
def test_write_to_table_merge_mode_with_merge_keys(self):
|
1722
|
+
"""Test MERGE mode works when table has merge keys"""
|
1723
|
+
table_name = "test_merge_mode_with_keys"
|
1724
|
+
|
1725
|
+
# Create a table with merge keys
|
1726
|
+
self._create_table_with_merge_keys(table_name)
|
1727
|
+
|
1728
|
+
# Create test data that matches the schema
|
1729
|
+
data = pd.DataFrame(
|
1730
|
+
{
|
1731
|
+
"id": [1, 2, 3],
|
1732
|
+
"name": ["Alice", "Bob", "Charlie"],
|
1733
|
+
"age": [25, 30, 35],
|
1734
|
+
"city": ["NYC", "LA", "Chicago"],
|
1735
|
+
}
|
1736
|
+
)
|
1737
|
+
|
1738
|
+
# MERGE mode should work since table has merge keys
|
1739
|
+
catalog.write_to_table(
|
1740
|
+
data=data,
|
1741
|
+
table=table_name,
|
1742
|
+
namespace=self.test_namespace,
|
1743
|
+
mode=TableWriteMode.MERGE,
|
1744
|
+
inner=self.catalog_properties,
|
1745
|
+
)
|
1746
|
+
|
1747
|
+
# Table should still exist
|
1748
|
+
assert catalog.table_exists(
|
1749
|
+
table=table_name,
|
1750
|
+
namespace=self.test_namespace,
|
1751
|
+
inner=self.catalog_properties,
|
1752
|
+
)
|
1753
|
+
|
1754
|
+
def test_write_to_table_merge_mode_without_merge_keys_fails(self):
|
1755
|
+
"""Test MERGE mode fails when table has no merge keys"""
|
1756
|
+
table_name = "test_merge_mode_no_keys"
|
1757
|
+
|
1758
|
+
# Create a table without merge keys
|
1759
|
+
self._create_table_without_merge_keys(table_name)
|
1760
|
+
|
1761
|
+
data = self._create_test_pandas_data()
|
1762
|
+
|
1763
|
+
# MERGE mode should fail since table has no merge keys
|
1764
|
+
with pytest.raises(
|
1765
|
+
TableValidationError,
|
1766
|
+
match="MERGE mode requires tables to have at least one merge key",
|
1767
|
+
):
|
1768
|
+
catalog.write_to_table(
|
1769
|
+
data=data,
|
1770
|
+
table=table_name,
|
1771
|
+
namespace=self.test_namespace,
|
1772
|
+
mode=TableWriteMode.MERGE,
|
1773
|
+
inner=self.catalog_properties,
|
1774
|
+
)
|
1775
|
+
|
1776
|
+
# Test default namespace behavior
|
1777
|
+
def test_write_to_table_default_namespace(self):
|
1778
|
+
"""Test writing to table using default namespace"""
|
1779
|
+
table_name = "test_default_namespace"
|
1780
|
+
data = self._create_test_pandas_data()
|
1781
|
+
|
1782
|
+
# Don't specify namespace, should use default
|
1783
|
+
catalog.write_to_table(
|
1784
|
+
data=data,
|
1785
|
+
table=table_name,
|
1786
|
+
mode=TableWriteMode.CREATE,
|
1787
|
+
inner=self.catalog_properties,
|
1788
|
+
)
|
1789
|
+
|
1790
|
+
# Should be able to find table in default namespace
|
1791
|
+
default_ns = catalog.default_namespace(inner=self.catalog_properties)
|
1792
|
+
assert catalog.table_exists(
|
1793
|
+
table=table_name, namespace=default_ns, inner=self.catalog_properties
|
1794
|
+
)
|
1795
|
+
|
1796
|
+
def test_write_to_table_append_creates_separate_deltas(self):
|
1797
|
+
"""Test that APPEND mode creates separate deltas in the same partition"""
|
1798
|
+
from deltacat.catalog.main.impl import _get_storage
|
1799
|
+
|
1800
|
+
table_name = "test_append_separate_deltas"
|
1801
|
+
data1 = self._create_test_pandas_data()
|
1802
|
+
data2 = self._create_second_batch_pandas_data()
|
1803
|
+
|
1804
|
+
# Create table with first batch
|
1805
|
+
catalog.write_to_table(
|
1806
|
+
data=data1,
|
1807
|
+
table=table_name,
|
1808
|
+
namespace=self.test_namespace,
|
1809
|
+
mode=TableWriteMode.CREATE,
|
1810
|
+
inner=self.catalog_properties,
|
1811
|
+
)
|
1812
|
+
|
1813
|
+
# Get the table definition to access stream information
|
1814
|
+
table_def = catalog.get_table(
|
1815
|
+
table=table_name,
|
1816
|
+
namespace=self.test_namespace,
|
1817
|
+
inner=self.catalog_properties,
|
1818
|
+
)
|
1819
|
+
|
1820
|
+
# Get storage interface
|
1821
|
+
storage = _get_storage(inner=self.catalog_properties)
|
1822
|
+
|
1823
|
+
# Get the stream
|
1824
|
+
stream = storage.get_stream(
|
1825
|
+
namespace=self.test_namespace,
|
1826
|
+
table_name=table_name,
|
1827
|
+
table_version=table_def.table_version.table_version,
|
1828
|
+
inner=self.catalog_properties,
|
1829
|
+
)
|
1830
|
+
|
1831
|
+
# Get the partition (should be only one for unpartitioned table)
|
1832
|
+
partition = storage.get_partition(
|
1833
|
+
stream_locator=stream.locator,
|
1834
|
+
partition_values=None, # unpartitioned
|
1835
|
+
inner=self.catalog_properties,
|
1836
|
+
)
|
1837
|
+
|
1838
|
+
# List deltas before second write
|
1839
|
+
deltas_before = storage.list_partition_deltas(
|
1840
|
+
partition_like=partition,
|
1841
|
+
inner=self.catalog_properties,
|
1842
|
+
).all_items()
|
1843
|
+
|
1844
|
+
assert (
|
1845
|
+
len(deltas_before) == 1
|
1846
|
+
), f"Expected 1 delta before append, got {len(deltas_before)}"
|
1847
|
+
|
1848
|
+
# Append second batch using APPEND mode
|
1849
|
+
catalog.write_to_table(
|
1850
|
+
data=data2,
|
1851
|
+
table=table_name,
|
1852
|
+
namespace=self.test_namespace,
|
1853
|
+
mode=TableWriteMode.APPEND,
|
1854
|
+
inner=self.catalog_properties,
|
1855
|
+
)
|
1856
|
+
|
1857
|
+
# Get the same partition again (should be the same partition object)
|
1858
|
+
partition_after = storage.get_partition(
|
1859
|
+
stream_locator=stream.locator,
|
1860
|
+
partition_values=None, # unpartitioned
|
1861
|
+
inner=self.catalog_properties,
|
1862
|
+
)
|
1863
|
+
|
1864
|
+
# Verify it's the same partition
|
1865
|
+
assert (
|
1866
|
+
partition.partition_id == partition_after.partition_id
|
1867
|
+
), "APPEND should reuse the same partition"
|
1868
|
+
|
1869
|
+
# List deltas after second write
|
1870
|
+
deltas_after = storage.list_partition_deltas(
|
1871
|
+
partition_like=partition_after,
|
1872
|
+
inner=self.catalog_properties,
|
1873
|
+
).all_items()
|
1874
|
+
|
1875
|
+
# Should now have 2 deltas in the same partition
|
1876
|
+
assert (
|
1877
|
+
len(deltas_after) == 2
|
1878
|
+
), f"Expected 2 deltas after append, got {len(deltas_after)}"
|
1879
|
+
|
1880
|
+
# Verify deltas have different stream positions
|
1881
|
+
stream_positions = [delta.stream_position for delta in deltas_after]
|
1882
|
+
assert (
|
1883
|
+
len(set(stream_positions)) == 2
|
1884
|
+
), "Deltas should have different stream positions"
|
1885
|
+
assert min(stream_positions) == 1, "First delta should have stream position 1"
|
1886
|
+
assert max(stream_positions) == 2, "Second delta should have stream position 2"
|
1887
|
+
|
1888
|
+
def test_write_to_table_partitioned_table_raises_not_implemented(self):
|
1889
|
+
"""Test that write_to_table raises NotImplementedError for partitioned tables"""
|
1890
|
+
from deltacat.storage.model.partition import (
|
1891
|
+
PartitionScheme,
|
1892
|
+
PartitionKey,
|
1893
|
+
PartitionKeyList,
|
1894
|
+
)
|
1895
|
+
from deltacat.storage.model.transform import IdentityTransform
|
1896
|
+
|
1897
|
+
table_name = "test_partitioned_table"
|
1898
|
+
data = self._create_test_pandas_data()
|
1899
|
+
|
1900
|
+
# Create a partition scheme with partition keys
|
1901
|
+
partition_keys = [
|
1902
|
+
PartitionKey.of(
|
1903
|
+
key=["city"],
|
1904
|
+
name="city_partition",
|
1905
|
+
transform=IdentityTransform.of(),
|
1906
|
+
)
|
1907
|
+
]
|
1908
|
+
partition_scheme = PartitionScheme.of(
|
1909
|
+
keys=PartitionKeyList.of(partition_keys),
|
1910
|
+
name="test_partition_scheme",
|
1911
|
+
scheme_id="test_partition_scheme_id",
|
1912
|
+
)
|
1913
|
+
|
1914
|
+
# Try to create a partitioned table using write_to_table
|
1915
|
+
with pytest.raises(
|
1916
|
+
NotImplementedError,
|
1917
|
+
match="write_to_table does not yet support partitioned tables",
|
1918
|
+
):
|
1919
|
+
catalog.write_to_table(
|
1920
|
+
data=data,
|
1921
|
+
table=table_name,
|
1922
|
+
namespace=self.test_namespace,
|
1923
|
+
mode=TableWriteMode.CREATE,
|
1924
|
+
partition_scheme=partition_scheme, # This makes it partitioned
|
1925
|
+
inner=self.catalog_properties,
|
1926
|
+
)
|
1927
|
+
|
1928
|
+
def test_write_to_table_sorted_table_raises_not_implemented(self):
|
1929
|
+
"""Test that write_to_table raises NotImplementedError for tables with sort keys"""
|
1930
|
+
from deltacat.storage.model.sort_key import SortScheme, SortKey, SortKeyList
|
1931
|
+
from deltacat.storage.model.types import SortOrder, NullOrder
|
1932
|
+
|
1933
|
+
table_name = "test_sorted_table"
|
1934
|
+
data = self._create_test_pandas_data()
|
1935
|
+
|
1936
|
+
# Create sort scheme with sort keys
|
1937
|
+
sort_scheme = SortScheme.of(
|
1938
|
+
keys=SortKeyList.of(
|
1939
|
+
[
|
1940
|
+
SortKey.of(
|
1941
|
+
key=["id"],
|
1942
|
+
sort_order=SortOrder.ASCENDING,
|
1943
|
+
null_order=NullOrder.AT_END,
|
1944
|
+
)
|
1945
|
+
]
|
1946
|
+
),
|
1947
|
+
name="test_sort_scheme",
|
1948
|
+
scheme_id="test_sort_scheme_id",
|
1949
|
+
)
|
1950
|
+
|
1951
|
+
# Create table with sort keys
|
1952
|
+
catalog.create_table(
|
1953
|
+
table=table_name,
|
1954
|
+
namespace=self.test_namespace,
|
1955
|
+
sort_keys=sort_scheme,
|
1956
|
+
inner=self.catalog_properties,
|
1957
|
+
)
|
1958
|
+
|
1959
|
+
# Attempt to write to the sorted table should raise NotImplementedError
|
1960
|
+
with pytest.raises(NotImplementedError) as exc_info:
|
1961
|
+
catalog.write_to_table(
|
1962
|
+
data=data,
|
1963
|
+
table=table_name,
|
1964
|
+
namespace=self.test_namespace,
|
1965
|
+
mode=TableWriteMode.APPEND,
|
1966
|
+
inner=self.catalog_properties,
|
1967
|
+
)
|
1968
|
+
|
1969
|
+
# Verify the error message contains expected information
|
1970
|
+
assert "sort keys" in str(exc_info.value)
|
1971
|
+
assert "sort scheme with 1 sort key(s)" in str(exc_info.value)
|
1972
|
+
assert "id" in str(exc_info.value)
|