deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/tests/storage/model/test_transaction_history.py (new file)
@@ -0,0 +1,886 @@
#!/usr/bin/env python3
"""
Comprehensive test suite for transaction history queries (dc.transactions function).
Tests all functionality to prevent regression and ensure robust behavior.
"""

import time
import inspect
import pytest

import pandas as pd
import pyarrow as pa
import polars as pl
import daft

import deltacat as dc
from deltacat.types.tables import DatasetType
from deltacat.catalog.model.catalog import Catalog
from deltacat.storage.model.types import TransactionStatus, TransactionState


class TestTransactionHistory:
    """Comprehensive test suite for transaction history queries."""

    def setup_method(self):
        """Set up fresh catalog for each test."""
        dc.clear_catalogs()  # Clear any existing catalogs

    def teardown_method(self):
        """Clean up after each test."""
        dc.clear_catalogs()

    def create_test_transactions(self):
        """Create a variety of test transactions with different characteristics."""
        transactions_created = []

        # Transaction 1: Simple single-table
        data1 = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
        with dc.transaction(commit_message="Initial user data") as txn:
            dc.write(data1, "users")
            transactions_created.append(
                {
                    "id": txn.id,
                    "commit_message": "Initial user data",
                    "expected_tables": 1,
                    "start_time": txn.start_time,
                }
            )

        time.sleep(0.01)  # Ensure different timestamps

        # Transaction 2: Multi-table transaction
        products = pd.DataFrame({"product_id": [101, 102], "name": ["Laptop", "Phone"]})
        orders = pd.DataFrame(
            {"order_id": [1, 2], "user_id": [1, 2], "product_id": [101, 102]}
        )

        with dc.transaction(commit_message="Create products and orders") as txn:
            dc.write(products, "products", namespace="inventory")
            dc.write(orders, "orders", namespace="sales")
            transactions_created.append(
                {
                    "id": txn.id,
                    "commit_message": "Create products and orders",
                    "expected_tables": 2,
                    "start_time": txn.start_time,
                }
            )

        time.sleep(0.01)

        # Transaction 3: Update existing table
        more_users = pd.DataFrame({"id": [3, 4], "name": ["Charlie", "Diana"]})
        with dc.transaction(commit_message="Add more users") as txn:
            dc.write(more_users, "users")
            transactions_created.append(
                {
                    "id": txn.id,
                    "commit_message": "Add more users",
                    "expected_tables": 1,
                    "start_time": txn.start_time,
                }
            )

        time.sleep(0.01)

        # Transaction 4: Large transaction (for operation count testing)
        analytics = pd.DataFrame(
            {"metric": ["page_views", "clicks"], "value": [1000, 150]}
        )
        reports = pd.DataFrame({"report_id": [1], "status": ["complete"]})

        with dc.transaction(commit_message="Analytics and reporting") as txn:
            dc.write(analytics, "metrics", namespace="analytics")
            dc.write(reports, "reports", namespace="analytics")
            transactions_created.append(
                {
                    "id": txn.id,
                    "commit_message": "Analytics and reporting",
                    "expected_tables": 2,
                    "start_time": txn.start_time,
                }
            )

        return transactions_created

    def test_basic_transaction_history_query(self, temp_catalog_properties):
        """Test basic transaction history querying with default parameters."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        created_txns = self.create_test_transactions()

        # Basic query - should return all SUCCESS transactions
        result = dc.transactions(read_as=DatasetType.PANDAS)

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 4, f"Expected 4 transactions, got {len(result)}"

        # Verify schema
        expected_columns = [
            "transaction_id",
            "commit_message",
            "start_time",
            "end_time",
            "status",
            "operation_count",
            "operation_types",
            "namespace_count",
            "table_count",
            "table_version_count",
            "stream_count",
            "partition_count",
            "delta_count",
        ]
        assert list(result.columns) == expected_columns

        # Verify all are SUCCESS status
        assert all(result["status"] == "SUCCESS")

        # Verify sorting (most recent first)
        start_times = result["start_time"].tolist()
        assert start_times == sorted(start_times, reverse=True)

        # Verify commit messages are preserved
        commit_messages = set(result["commit_message"])
        expected_messages = {txn["commit_message"] for txn in created_txns}
        assert commit_messages == expected_messages

    def test_all_dataset_types_output(self, temp_catalog_properties):
        """Test that all supported dataset types work correctly."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        self.create_test_transactions()

        # Test each dataset type
        dataset_types = [
            DatasetType.PANDAS,
            DatasetType.PYARROW,
            DatasetType.POLARS,
            DatasetType.RAY_DATASET,
            DatasetType.DAFT,
        ]

        for dataset_type in dataset_types:
            result = dc.transactions(read_as=dataset_type, limit=2)

            # Verify basic properties based on type
            if dataset_type == DatasetType.PANDAS:
                assert isinstance(result, pd.DataFrame)
                assert len(result) == 2
                assert list(result.columns)[0] == "transaction_id"

            elif dataset_type == DatasetType.PYARROW:
                assert isinstance(result, pa.Table)
                assert result.num_rows == 2
                assert result.column_names[0] == "transaction_id"

            elif dataset_type == DatasetType.POLARS:
                assert isinstance(result, pl.DataFrame)
                assert len(result) == 2
                assert result.columns[0] == "transaction_id"

            elif dataset_type == DatasetType.RAY_DATASET:
                # Ray dataset might be returned as different types
                assert result is not None
                # Convert to check count
                df = result.to_pandas()
                assert len(df) == 2

            elif dataset_type == DatasetType.DAFT:
                assert isinstance(result, daft.DataFrame)
                # Convert to check count
                df = result.to_pandas()
                assert len(df) == 2

    def test_transaction_state_filtering(self, temp_catalog_properties):
        """Test filtering by different transaction states."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        self.create_test_transactions()

        # Test default (SUCCESS only)
        success_only = dc.transactions(read_as=DatasetType.PANDAS)
        assert all(success_only["status"] == "SUCCESS")
        base_count = len(success_only)

        # Test with RUNNING status (should be same as default since no running transactions)
        with_running = dc.transactions(
            read_as=DatasetType.PANDAS,
            status_in=[TransactionStatus.SUCCESS, TransactionStatus.RUNNING],
        )
        assert len(with_running) == base_count
        assert all(
            status in ["SUCCESS", "RUNNING"] for status in with_running["status"]
        )

        # Test with FAILED status
        with_failed = dc.transactions(
            read_as=DatasetType.PANDAS,
            status_in=[TransactionStatus.SUCCESS, TransactionStatus.FAILED],
        )
        assert len(with_failed) == base_count
        assert all(status in ["SUCCESS", "FAILED"] for status in with_failed["status"])

        # Test with PAUSED status
        with_paused = dc.transactions(
            read_as=DatasetType.PANDAS,
            status_in=[TransactionStatus.SUCCESS, TransactionStatus.PAUSED],
        )
        assert len(with_paused) == base_count
        assert all(status in ["SUCCESS", "PAUSED"] for status in with_paused["status"])

        # Test all states
        all_states = dc.transactions(
            read_as=DatasetType.PANDAS,
            status_in=[
                TransactionStatus.SUCCESS,
                TransactionStatus.RUNNING,
                TransactionStatus.FAILED,
                TransactionStatus.PAUSED,
            ],
        )
        assert len(all_states) == base_count

        # Test multiple states
        multi_state = dc.transactions(
            read_as=DatasetType.PANDAS,
            status_in=[
                TransactionStatus.SUCCESS,
                TransactionStatus.RUNNING,
                TransactionStatus.FAILED,
                TransactionStatus.PAUSED,
            ],
        )
        assert len(multi_state) == base_count

    def test_time_based_filtering(self, temp_catalog_properties):
        """Test filtering transactions by time ranges."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        created_txns = self.create_test_transactions()

        # Get all transactions first
        all_txns = dc.transactions(read_as=DatasetType.PANDAS)

        # Basic verification - we should have 4 transactions
        assert len(all_txns) == 4, f"Expected 4 transactions, got {len(all_txns)}"

        # Test simple case: start_time that should include all transactions
        earliest_time = min(txn["start_time"] for txn in created_txns)
        all_from_start = dc.transactions(
            read_as=DatasetType.PANDAS, start_time=earliest_time
        )
        assert len(all_from_start) == 4

        # Test start_time filtering after some transactions
        # Get the second earliest transaction's start time
        sorted_times = sorted([txn["start_time"] for txn in created_txns])
        mid_time = sorted_times[1]  # Second earliest
        recent_txns = dc.transactions(read_as=DatasetType.PANDAS, start_time=mid_time)

        # Should get at least the transactions at or after that time
        assert (
            len(recent_txns) >= 2
        ), f"Expected at least 2 transactions after mid_time, got {len(recent_txns)}"
        assert all(t >= mid_time for t in recent_txns["start_time"])

        # Test future start_time (should return empty)
        future_time = time.time_ns() + 1000000000  # 1 second in future
        future_txns = dc.transactions(
            read_as=DatasetType.PANDAS, start_time=future_time
        )
        assert len(future_txns) == 0

    def test_limit_and_pagination(self, temp_catalog_properties):
        """Test limiting results and pagination behavior."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        self.create_test_transactions()

        # Test limit
        limited = dc.transactions(read_as=DatasetType.PANDAS, limit=2)
        assert len(limited) == 2

        # Test limit larger than available
        all_limited = dc.transactions(read_as=DatasetType.PANDAS, limit=10)
        assert len(all_limited) == 4  # Only 4 transactions exist

        # Test limit=0 (should raise ValueError for invalid limit)
        with pytest.raises(ValueError):
            dc.transactions(read_as=DatasetType.PANDAS, limit=0)

        # Test limit=1 with different states
        single_all_states = dc.transactions(
            read_as=DatasetType.PANDAS,
            limit=1,
            status_in=[
                TransactionStatus.SUCCESS,
                TransactionStatus.RUNNING,
                TransactionStatus.FAILED,
                TransactionStatus.PAUSED,
            ],
        )
        assert len(single_all_states) == 1

        # Verify limit respects sorting (most recent first)
        all_txns = dc.transactions(read_as=DatasetType.PANDAS)
        limited_txns = dc.transactions(read_as=DatasetType.PANDAS, limit=2)

        # Limited should be the first 2 from the full list
        assert (
            limited_txns["transaction_id"].tolist()
            == all_txns["transaction_id"].head(2).tolist()
        )

    def test_commit_message_functionality(self, temp_catalog_properties):
        """Test commit message storage and retrieval."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create transactions with various commit message patterns
        test_messages = [
            "Simple commit message",
            "Multi-word commit with special chars: @#$%",
            "",  # Empty commit message
            "Very long commit message that contains lots of text to test handling of lengthy descriptions and ensure they are properly stored and retrieved without truncation or corruption",
            "Commit with\nmultiple\nlines",
            "Unicode test: 🚀 ñ ü é",
        ]

        created_txn_ids = []
        for msg in test_messages:
            data = pd.DataFrame({"id": [1], "value": [f"data_{len(created_txn_ids)}"]})
            with dc.transaction(commit_message=msg) as txn:
                dc.write(data, f"table_{len(created_txn_ids)}")
                created_txn_ids.append(txn.id)

        # Query and verify all commit messages
        result = dc.transactions(read_as=DatasetType.PANDAS)

        # Create mapping of transaction_id to commit_message
        result_messages = {
            row["transaction_id"]: row["commit_message"] for _, row in result.iterrows()
        }

        # Verify each commit message is preserved correctly
        for txn_id, expected_msg in zip(created_txn_ids, test_messages):
            assert txn_id in result_messages
            actual_msg = result_messages[txn_id]
            # Handle empty commit messages
            if expected_msg == "" and actual_msg is None:
                continue
            assert (
                actual_msg == expected_msg
            ), f"Commit message mismatch for {txn_id}: expected {expected_msg!r}, got {actual_msg!r}"

    def test_transaction_metadata_accuracy(self, temp_catalog_properties):
        """Test accuracy of operation counts and table counts."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create transactions with known characteristics
        test_cases = []

        # Single table, single operation
        data1 = pd.DataFrame({"id": [1], "name": ["test"]})
        with dc.transaction(commit_message="Single table") as txn:
            dc.write(data1, "single_table")
            test_cases.append({"id": txn.id, "expected_tables": 1})

        # Multi-table, multiple operations
        data2a = pd.DataFrame({"id": [1], "name": ["table_a"]})
        data2b = pd.DataFrame({"id": [1], "name": ["table_b"]})
        data2c = pd.DataFrame({"id": [1], "name": ["table_c"]})

        with dc.transaction(commit_message="Multi table") as txn:
            dc.write(data2a, "multi_a", namespace="test_ns")
            dc.write(data2b, "multi_b", namespace="test_ns")
            dc.write(data2c, "multi_c", namespace="another_ns")
            test_cases.append({"id": txn.id, "expected_tables": 3})

        # Query results
        result = dc.transactions(read_as=DatasetType.PANDAS)

        # Create mapping of transaction_id to metadata
        result_metadata = {
            row["transaction_id"]: {
                "table_count": row["table_count"],
                "operation_count": row["operation_count"],
            }
            for _, row in result.iterrows()
        }

        # Verify metadata for each test case
        for test_case in test_cases:
            txn_id = test_case["id"]
            assert txn_id in result_metadata

            metadata = result_metadata[txn_id]

            # Verify table count
            expected_table_count = test_case["expected_tables"]
            actual_table_count = metadata["table_count"]
            assert (
                actual_table_count == expected_table_count
            ), f"Table count mismatch for {txn_id}: expected {expected_table_count}, got {actual_table_count}"

            # Verify operation count is reasonable (should be > table count due to internal operations)
            assert (
                metadata["operation_count"] > 0
            ), f"Operation count should be > 0 for {txn_id}"
            assert (
                metadata["operation_count"] >= metadata["table_count"]
            ), f"Operation count should be >= table count for {txn_id}"

    def test_empty_catalog_graceful_handling(self, temp_catalog_properties):
        """Test graceful handling of catalogs with no transactions."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Test all parameter combinations on empty catalog
        test_cases = [
            {},
            {"limit": 5},
            {"status_in": [TransactionStatus.SUCCESS, TransactionStatus.RUNNING]},
            {"status_in": [TransactionStatus.SUCCESS, TransactionStatus.FAILED]},
            {"status_in": [TransactionStatus.SUCCESS, TransactionStatus.PAUSED]},
            {
                "status_in": [
                    TransactionStatus.SUCCESS,
                    TransactionStatus.RUNNING,
                    TransactionStatus.FAILED,
                    TransactionStatus.PAUSED,
                ]
            },
            {"start_time": time.time_ns() - 3600000000000},  # 1 hour ago
            {"end_time": time.time_ns()},
            {
                "limit": 1,
                "status_in": [
                    TransactionStatus.SUCCESS,
                    TransactionStatus.RUNNING,
                    TransactionStatus.FAILED,
                    TransactionStatus.PAUSED,
                ],
            },
        ]

        for params in test_cases:
            result = dc.transactions(read_as=DatasetType.PANDAS, **params)

            assert isinstance(result, pd.DataFrame), f"Failed for params {params}"
            assert len(result) == 0, f"Expected empty result for params {params}"

            # Verify schema is correct even for empty results
            expected_columns = [
                "transaction_id",
                "commit_message",
                "start_time",
                "end_time",
                "status",
                "operation_count",
                "operation_types",
                "namespace_count",
                "table_count",
                "table_version_count",
                "stream_count",
                "partition_count",
                "delta_count",
            ]
            assert (
                list(result.columns) == expected_columns
            ), f"Schema mismatch for params {params}"

    def test_error_handling_and_edge_cases(self, temp_catalog_properties):
        """Test error handling for various edge cases."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        self.create_test_transactions()

        # Test invalid dataset type
        with pytest.raises((ValueError, AttributeError)):
            dc.transactions(read_as="INVALID_TYPE")

        # Test negative limit (should raise ValueError for invalid limit)
        with pytest.raises(ValueError):
            dc.transactions(read_as=DatasetType.PANDAS, limit=-1)

        # Test invalid time values
        # Very old start_time (should work, return all transactions)
        old_time_result = dc.transactions(
            read_as=DatasetType.PANDAS, start_time=1000000000  # Very old timestamp
        )
        assert len(old_time_result) == 4  # All transactions

        # start_time > end_time (should return empty)
        invalid_time_result = dc.transactions(
            read_as=DatasetType.PANDAS,
            start_time=time.time_ns(),
            end_time=time.time_ns() - 1000000000,  # 1 second ago
        )
        assert len(invalid_time_result) == 0

    def test_status_in_corner_cases(self, temp_catalog_properties):
        """Test corner cases for status_in parameter."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        self.create_test_transactions()

        # Get baseline count of SUCCESS transactions
        baseline_result = dc.transactions(read_as=DatasetType.PANDAS)
        baseline_count = len(baseline_result)
        assert baseline_count == 4  # We created 4 transactions
        assert all(baseline_result["status"] == "SUCCESS")

        # Test status_in=None (should default to SUCCESS)
        none_result = dc.transactions(read_as=DatasetType.PANDAS, status_in=None)
        assert len(none_result) == baseline_count
        assert all(none_result["status"] == "SUCCESS")

        # Test status_in=[] (empty list - should default to SUCCESS)
        empty_result = dc.transactions(read_as=DatasetType.PANDAS, status_in=[])
        assert len(empty_result) == baseline_count
        assert all(empty_result["status"] == "SUCCESS")

        # Verify the results are identical
        assert len(baseline_result) == len(none_result) == len(empty_result)

        # Test that the transaction IDs are the same (same transactions returned)
        baseline_ids = set(baseline_result["transaction_id"])
        none_ids = set(none_result["transaction_id"])
        empty_ids = set(empty_result["transaction_id"])

        assert baseline_ids == none_ids == empty_ids

    def test_concurrent_transaction_handling(self, temp_catalog_properties):
        """Test behavior when transactions are created while querying."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create initial transactions
        initial_count = 2
        for i in range(initial_count):
            data = pd.DataFrame({"id": [i], "name": [f"user_{i}"]})
            with dc.transaction(commit_message=f"Transaction {i}"):
                dc.write(data, f"table_{i}")

        # Query initial state
        initial_result = dc.transactions(read_as=DatasetType.PANDAS)
        assert len(initial_result) == initial_count

        # Create another transaction
        new_data = pd.DataFrame({"id": [999], "name": ["new_user"]})
        with dc.transaction(commit_message="New transaction"):
            dc.write(new_data, "new_table")

        # Query updated state
        updated_result = dc.transactions(read_as=DatasetType.PANDAS)
        assert len(updated_result) == initial_count + 1

        # Verify new transaction appears first (most recent)
        assert updated_result.iloc[0]["commit_message"] == "New transaction"

    def test_namespace_isolation_in_table_counting(self, temp_catalog_properties):
        """Test that table counting correctly handles namespace isolation."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create transaction with tables in different namespaces
        data = pd.DataFrame({"id": [1], "value": ["test"]})

        with dc.transaction(commit_message="Multi-namespace transaction") as txn:
            # Same table name in different namespaces should count as different tables
            dc.write(data, "shared_name", namespace="namespace_a")
            dc.write(data, "shared_name", namespace="namespace_b")
            dc.write(data, "unique_name", namespace="namespace_a")
            expected_txn_id = txn.id

        result = dc.transactions(read_as=DatasetType.PANDAS)

        # Find our transaction
        our_txn = result[result["transaction_id"] == expected_txn_id]
        assert len(our_txn) == 1

        # Should count distinct tables
        table_count = our_txn.iloc[0]["table_count"]
        assert table_count == 3, f"Expected 3 tables, got {table_count}"

        table_count = our_txn.iloc[0]["table_version_count"]
        assert table_count == 3, f"Expected 3 table versions, got {table_count}"

        stream_count = our_txn.iloc[0]["stream_count"]
        assert stream_count == 3, f"Expected 3 streams, got {stream_count}"

        partition_count = our_txn.iloc[0]["partition_count"]
        assert partition_count == 3, f"Expected 3 partitions, got {partition_count}"

        delta_count = our_txn.iloc[0]["delta_count"]
        assert delta_count == 3, f"Expected 3 deltas, got {delta_count}"

    def test_parameter_combinations(self, temp_catalog_properties):
        """Test various parameter combinations work correctly."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        self.create_test_transactions()

        # Complex parameter combinations
        test_combinations = [
            # Time + limit
            {"start_time": time.time_ns() - 3600000000000, "limit": 2},
            # States + limit
            {
                "status_in": [
                    TransactionStatus.SUCCESS,
                    TransactionStatus.RUNNING,
                    TransactionStatus.FAILED,
                    TransactionStatus.PAUSED,
                ],
                "limit": 1,
            },
            # Time + states
            {
                "start_time": time.time_ns() - 3600000000000,
                "status_in": [TransactionStatus.SUCCESS, TransactionStatus.RUNNING],
            },
            # Everything combined
            {
                "start_time": time.time_ns() - 3600000000000,
                "end_time": time.time_ns(),
                "limit": 3,
                "status_in": [
                    TransactionStatus.SUCCESS,
                    TransactionStatus.RUNNING,
                    TransactionStatus.FAILED,
                    TransactionStatus.PAUSED,
                ],
            },
        ]

        for params in test_combinations:
            result = dc.transactions(read_as=DatasetType.PANDAS, **params)

            # Should not crash and should return valid DataFrame
            assert isinstance(result, pd.DataFrame)
            assert len(result) >= 0

            # If limit is specified, result should not exceed it
            if "limit" in params and params["limit"] > 0:
                assert len(result) <= params["limit"]


class TestTransactionHistoryRegression:
    """Regression tests to ensure consistent behavior over time."""

    def setup_method(self):
        """Set up fresh catalog for each test."""
        dc.clear_catalogs()

    def teardown_method(self):
        """Clean up after each test."""
        dc.clear_catalogs()

    def test_schema_consistency(self, temp_catalog_properties):
        """Ensure the output schema remains consistent."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create a simple transaction
        data = pd.DataFrame({"id": [1], "name": ["test"]})
        with dc.transaction(commit_message="Schema test"):
            dc.write(data, "test_table")

        result = dc.transactions(read_as=DatasetType.PANDAS)

        # Verify exact schema
        expected_columns = [
            "transaction_id",
            "commit_message",
            "start_time",
            "end_time",
            "status",
            "operation_count",
            "operation_types",
            "namespace_count",
            "table_count",
            "table_version_count",
            "stream_count",
            "partition_count",
            "delta_count",
        ]
        assert list(result.columns) == expected_columns

        # Verify data types
        assert result["transaction_id"].dtype == "object"  # String
        assert result["commit_message"].dtype == "object"  # String
        assert result["start_time"].dtype == "int64"  # Integer timestamp
        assert result["end_time"].dtype == "int64"  # Integer timestamp
        assert result["status"].dtype == "object"  # String
        assert result["operation_count"].dtype == "int64"  # Integer
        assert result["table_count"].dtype == "int64"  # Integer

    def test_sorting_consistency(self, temp_catalog_properties):
        """Ensure transactions are consistently sorted by start_time descending."""
        # Initialize catalog using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create transactions with deliberate timing
        transaction_times = []
        for i in range(5):
            data = pd.DataFrame({"id": [i], "name": [f"user_{i}"]})
            with dc.transaction(commit_message=f"Transaction {i}") as txn:
                dc.write(data, f"table_{i}")
                transaction_times.append(txn.start_time)
            time.sleep(0.01)  # Small delay

        result = dc.transactions(read_as=DatasetType.PANDAS)

        # Verify descending order by start_time
        result_times = result["start_time"].tolist()
        expected_times = sorted(transaction_times, reverse=True)

        assert (
            result_times == expected_times
        ), "Transactions not properly sorted by start_time descending"

    def test_function_signature_stability(self, temp_catalog_properties):
        """Ensure function signature remains stable."""
        # Initialize catalog using the fixture (though not needed for signature test)
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        sig = inspect.signature(dc.transactions)
        expected_params = [
            "catalog_name",
            "read_as",
            "start_time",
            "end_time",
            "limit",
            "status_in",
        ]

        actual_params = list(sig.parameters.keys())
        assert (
            actual_params == expected_params
        ), f"Function signature changed: {actual_params}"

        # Verify default values
        assert sig.parameters["catalog_name"].default is None
        assert sig.parameters["read_as"].default is None
        assert sig.parameters["start_time"].default is None
        assert sig.parameters["end_time"].default is None
        assert sig.parameters["limit"].default is None
        assert sig.parameters["status_in"].default == [TransactionStatus.SUCCESS]

    def test_read_transaction(self, temp_catalog_properties):
        """Test the read_transaction() function for loading transactions returned by transactions()."""

        # Initialize a clean catalog for testing using the fixture
        dc.init()
        dc.put_catalog("test", Catalog(temp_catalog_properties))

        # Create multiple transactions with data
        commit_msg_1 = "First test transaction"
        commit_msg_2 = "Second test transaction"

        # Create first transaction
        data1 = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
        with dc.transaction(commit_message=commit_msg_1):
            dc.write(data1, "users")

        # Create second transaction
        data2 = pd.DataFrame({"id": [3, 4], "name": ["Charlie", "Diana"]})
        with dc.transaction(commit_message=commit_msg_2):
            dc.write(data2, "customers")

        # Test transactions() query functionality
        result = dc.transactions(read_as=DatasetType.PANDAS)

        # Verify we have the right number of transactions
        assert len(result) == 2

        # Verify column structure
        expected_columns = [
            "transaction_id",
            "commit_message",
            "start_time",
            "end_time",
            "status",
            "operation_count",
            "operation_types",
            "namespace_count",
            "table_count",
            "table_version_count",
            "stream_count",
            "partition_count",
            "delta_count",
        ]
        assert list(result.columns) == expected_columns

        # Verify commit messages are preserved
        commit_messages = set(result["commit_message"])
        assert commit_msg_1 in commit_messages
        assert commit_msg_2 in commit_messages

        # Verify transaction metadata
        assert all(result["status"] == "SUCCESS")
        assert all(result["operation_count"] > 0)
        assert all(result["namespace_count"] == 1)
        assert all(result["table_count"] == 1)
        assert all(result["table_version_count"] == 1)
        assert all(result["stream_count"] == 1)
        assert all(result["partition_count"] == 1)
        assert all(result["delta_count"] == 1)

        # Read and validate the transactions
        transaction_id = result.iloc[0]["transaction_id"]
        transaction_obj = dc.read_transaction(transaction_id)
        assert transaction_obj.id == transaction_id
        assert transaction_obj.commit_message == commit_msg_2
        assert transaction_obj.start_time == result.iloc[0]["start_time"]
        assert transaction_obj.end_time == result.iloc[0]["end_time"]
        assert (
            transaction_obj.state(temp_catalog_properties.root)
            == TransactionState.SUCCESS
        )
        assert (
            len(transaction_obj.operations) == 24 == result.iloc[0]["operation_count"]
        )

        transaction_id = result.iloc[1]["transaction_id"]
        transaction_obj = dc.read_transaction(transaction_id)
        assert transaction_obj.id == transaction_id
        assert transaction_obj.commit_message == commit_msg_1
        assert transaction_obj.start_time == result.iloc[1]["start_time"]
        assert transaction_obj.end_time == result.iloc[1]["end_time"]
        assert (
            transaction_obj.state(temp_catalog_properties.root)
            == TransactionState.SUCCESS
        )
        # 1st transaction contains more operations than 2nd since only it needed to create the namespace
        assert (
            len(transaction_obj.operations) == 26 == result.iloc[1]["operation_count"]
        )
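
The new test file above also doubles as a reference for the transaction-history API added in this release. A minimal usage sketch distilled from those tests is shown below; it assumes a catalog has already been initialized with `dc.init()` and `dc.put_catalog(...)`, as the test fixtures do, and only uses calls exercised in the tests.

```python
import pandas as pd

import deltacat as dc
from deltacat.types.tables import DatasetType
from deltacat.storage.model.types import TransactionStatus

# Write some data inside a named transaction.
df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
with dc.transaction(commit_message="Initial user data") as txn:
    dc.write(df, "users")

# List recent successful transactions as a pandas DataFrame; columns include
# transaction_id, commit_message, start_time, status, and per-type object counts.
history = dc.transactions(
    read_as=DatasetType.PANDAS,
    limit=10,
    status_in=[TransactionStatus.SUCCESS],
)

# Load the full transaction object for the most recent entry.
latest = dc.read_transaction(history.iloc[0]["transaction_id"])
print(latest.commit_message, len(latest.operations))
```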