deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,136 @@
|
|
1
|
+
import daft
|
2
|
+
from daft import Table, Identifier
|
3
|
+
import pytest
|
4
|
+
import uuid
|
5
|
+
|
6
|
+
from deltacat.catalog import Catalog as DeltaCATCatalog
|
7
|
+
from deltacat.catalog import CatalogProperties
|
8
|
+
from deltacat.experimental.daft.daft_catalog import DaftCatalog
|
9
|
+
import shutil
|
10
|
+
import tempfile
|
11
|
+
|
12
|
+
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
|
13
|
+
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
|
14
|
+
|
15
|
+
from pyiceberg.catalog import CatalogType
|
16
|
+
|
17
|
+
|
18
|
+
class TestCatalogIntegration:
|
19
|
+
@classmethod
|
20
|
+
def setup_method(cls):
|
21
|
+
cls.tmpdir = tempfile.mkdtemp()
|
22
|
+
|
23
|
+
@classmethod
|
24
|
+
def teardown_method(cls):
|
25
|
+
shutil.rmtree(cls.tmpdir)
|
26
|
+
|
27
|
+
def test_create_table(self):
|
28
|
+
"""Demonstrate DeltaCAT-Daft integration."""
|
29
|
+
# Create a DeltaCAT catalog
|
30
|
+
catalog_props = CatalogProperties(root=self.tmpdir)
|
31
|
+
dc_catalog = DeltaCATCatalog(catalog_props)
|
32
|
+
|
33
|
+
# Use a random catalog name to prevent namespacing conflicts with other tests
|
34
|
+
# Convert the DeltaCAT catalog to a Daft catalog
|
35
|
+
catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
|
36
|
+
|
37
|
+
daft_catalog = DaftCatalog(catalog=dc_catalog, name=catalog_name)
|
38
|
+
|
39
|
+
# Register the catalog with Daft's catalog system
|
40
|
+
daft.attach_catalog(daft_catalog, catalog_name)
|
41
|
+
|
42
|
+
# Create a sample DataFrame
|
43
|
+
df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
|
44
|
+
# Create then get table
|
45
|
+
daft_catalog.create_table(Identifier("example_table"), df)
|
46
|
+
table: Table = daft_catalog.get_table(Identifier("example_table"))
|
47
|
+
assert table.name == "example_table"
|
48
|
+
|
49
|
+
def test_get_table(self):
|
50
|
+
"""Test getting a table from the DeltaCAT-Daft catalog."""
|
51
|
+
# Create a DeltaCAT catalog using the existing tmpdir
|
52
|
+
catalog_props = CatalogProperties(root=self.tmpdir)
|
53
|
+
dc_catalog = DeltaCATCatalog(catalog_props)
|
54
|
+
|
55
|
+
# Convert to DaftCatalog and attach to Daft
|
56
|
+
catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
|
57
|
+
daft_catalog = DaftCatalog(dc_catalog, catalog_name)
|
58
|
+
daft.attach_catalog(daft_catalog, catalog_name)
|
59
|
+
|
60
|
+
# Create a sample DataFrame and table
|
61
|
+
df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
|
62
|
+
table_name = "test_get_table"
|
63
|
+
daft_catalog.create_table(Identifier(table_name), df)
|
64
|
+
|
65
|
+
# Get the table using different forms of identifiers
|
66
|
+
table2 = daft_catalog.get_table(Identifier(table_name))
|
67
|
+
assert table2 is not None
|
68
|
+
assert table2.name == table_name
|
69
|
+
|
70
|
+
# 3. With namespace. DeltaCAT used the default namespace since it was not provided
|
71
|
+
table3 = daft_catalog.get_table(Identifier("default", table_name))
|
72
|
+
assert table3 is not None
|
73
|
+
assert table3.name == table_name
|
74
|
+
|
75
|
+
# Test non-existent table raises an appropriate error
|
76
|
+
with pytest.raises(ValueError, match="Table nonexistent_table not found"):
|
77
|
+
daft_catalog.get_table(Identifier("nonexistent_table"))
|
78
|
+
|
79
|
+
|
80
|
+
class TestIcebergCatalogIntegration:
|
81
|
+
@classmethod
|
82
|
+
def setup_method(cls):
|
83
|
+
cls.tmpdir = tempfile.mkdtemp()
|
84
|
+
|
85
|
+
@classmethod
|
86
|
+
def teardown_method(cls):
|
87
|
+
shutil.rmtree(cls.tmpdir)
|
88
|
+
|
89
|
+
def test_iceberg_catalog_integration(self):
|
90
|
+
# Create a unique warehouse path for this test
|
91
|
+
warehouse_path = self.tmpdir
|
92
|
+
|
93
|
+
# Configure an Iceberg catalog with the warehouse path
|
94
|
+
config = IcebergCatalogConfig(
|
95
|
+
type=CatalogType.SQL,
|
96
|
+
properties={
|
97
|
+
"warehouse": warehouse_path,
|
98
|
+
"uri": f"sqlite:////{warehouse_path}/sql-catalog.db",
|
99
|
+
},
|
100
|
+
)
|
101
|
+
dc_catalog = IcebergCatalog.from_config(config)
|
102
|
+
|
103
|
+
# Convert the DeltaCAT catalog to a Daft catalog
|
104
|
+
catalog_name = f"deltacat_iceberg_{uuid.uuid4().hex[:8]}"
|
105
|
+
daft_catalog = DaftCatalog(dc_catalog, catalog_name)
|
106
|
+
daft.attach_catalog(daft_catalog, catalog_name)
|
107
|
+
|
108
|
+
# Create a sample DataFrame
|
109
|
+
df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
|
110
|
+
|
111
|
+
# Create a table with the Daft catalog
|
112
|
+
table_name = "example_table"
|
113
|
+
namespace = "example_namespace"
|
114
|
+
daft_catalog.create_table(Identifier(namespace, table_name), df)
|
115
|
+
|
116
|
+
# Query that Iceberg table exists using PyIceberg
|
117
|
+
iceberg_catalog = dc_catalog.inner
|
118
|
+
|
119
|
+
# Verify the table exists in the Iceberg catalog
|
120
|
+
tables = iceberg_catalog.list_tables(namespace)
|
121
|
+
|
122
|
+
assert any(
|
123
|
+
t[0] == namespace and t[1] == table_name for t in tables
|
124
|
+
), f"Table {table_name} not found in Iceberg catalog"
|
125
|
+
|
126
|
+
# Load the table from Iceberg catalog and verify its properties
|
127
|
+
iceberg_table = iceberg_catalog.load_table(f"{namespace}.{table_name}")
|
128
|
+
|
129
|
+
# Check that the schema matches our DataFrame
|
130
|
+
schema = iceberg_table.schema()
|
131
|
+
assert (
|
132
|
+
schema.find_field("id") is not None
|
133
|
+
), "Field 'id' not fcound in table schema"
|
134
|
+
assert (
|
135
|
+
schema.find_field("value") is not None
|
136
|
+
), "Field 'value' not found in table schema"
|
File without changes
|
File without changes
|
@@ -3,9 +3,9 @@ import io
|
|
3
3
|
import pytest
|
4
4
|
from faker import Faker
|
5
5
|
|
6
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
7
|
-
from deltacat.storage.rivulet.mvp.Table import MvpTable
|
8
|
-
from deltacat.storage.rivulet.schema.schema import Schema
|
6
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
9
9
|
import random
|
10
10
|
import string
|
11
11
|
from PIL import Image
|
File without changes
|
@@ -2,9 +2,9 @@ import pytest
|
|
2
2
|
|
3
3
|
import pyarrow as pa
|
4
4
|
import pyarrow.parquet as pq
|
5
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
6
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
7
|
-
from deltacat.storage.rivulet import Schema, Field
|
5
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
6
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
8
8
|
from deltacat.utils.metafile_locator import _find_partition_path
|
9
9
|
|
10
10
|
|
File without changes
|
@@ -0,0 +1,80 @@
|
|
1
|
+
import pytest
|
2
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
3
|
+
QueryExpression,
|
4
|
+
)
|
5
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def sample_range_shard():
|
10
|
+
return RangeShard(min_key=5, max_key=15)
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def sample_string_shard():
|
15
|
+
return RangeShard(min_key="apple", max_key="zebra")
|
16
|
+
|
17
|
+
|
18
|
+
def test_with_key():
|
19
|
+
query = QueryExpression[int]()
|
20
|
+
query.with_key(5)
|
21
|
+
assert query.min_key == 5
|
22
|
+
assert query.max_key == 5
|
23
|
+
with pytest.raises(ValueError):
|
24
|
+
query.with_key(10)
|
25
|
+
|
26
|
+
|
27
|
+
def test_with_range():
|
28
|
+
query = QueryExpression[int]()
|
29
|
+
query.with_range(10, 5)
|
30
|
+
assert query.min_key == 5
|
31
|
+
assert query.max_key == 10
|
32
|
+
with pytest.raises(ValueError):
|
33
|
+
query.with_range(20, 25)
|
34
|
+
|
35
|
+
|
36
|
+
def test_matches_query():
|
37
|
+
query = QueryExpression[int]()
|
38
|
+
assert query.matches_query(5)
|
39
|
+
assert query.matches_query(-999)
|
40
|
+
query.with_range(10, 20)
|
41
|
+
assert query.matches_query(15)
|
42
|
+
assert not query.matches_query(25)
|
43
|
+
assert not query.matches_query(5)
|
44
|
+
|
45
|
+
|
46
|
+
def test_below_query_range():
|
47
|
+
query = QueryExpression[int]()
|
48
|
+
assert not query.below_query_range(5)
|
49
|
+
query.with_range(10, 20)
|
50
|
+
assert query.below_query_range(5)
|
51
|
+
assert not query.below_query_range(15)
|
52
|
+
assert not query.below_query_range(25)
|
53
|
+
|
54
|
+
|
55
|
+
def test_with_shard_existing_query(sample_range_shard):
|
56
|
+
query = QueryExpression[int]().with_range(10, 20)
|
57
|
+
new_query = QueryExpression.with_shard(query, sample_range_shard)
|
58
|
+
assert new_query.min_key == 5
|
59
|
+
assert new_query.max_key == 20
|
60
|
+
|
61
|
+
|
62
|
+
def test_with_shard_none_shard():
|
63
|
+
query = QueryExpression[int]().with_range(10, 20)
|
64
|
+
result = QueryExpression.with_shard(query, None)
|
65
|
+
assert result.min_key == 10
|
66
|
+
assert result.max_key == 20
|
67
|
+
|
68
|
+
|
69
|
+
def test_with_shard_existing_query_string(sample_string_shard):
|
70
|
+
query = QueryExpression[str]().with_range("banana", "yellow")
|
71
|
+
new_query = QueryExpression.with_shard(query, sample_string_shard)
|
72
|
+
assert new_query.min_key == "apple"
|
73
|
+
assert new_query.max_key == "zebra"
|
74
|
+
|
75
|
+
|
76
|
+
def test_query_expression_string_matches():
|
77
|
+
query = QueryExpression[str]().with_range("apple", "cat")
|
78
|
+
assert query.matches_query("apple")
|
79
|
+
assert query.matches_query("banana")
|
80
|
+
assert not query.matches_query("dog")
|
@@ -0,0 +1,119 @@
|
|
1
|
+
import pytest
|
2
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
|
3
|
+
import pyarrow as pa
|
4
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def combined_schema():
|
10
|
+
return Schema(
|
11
|
+
fields=[
|
12
|
+
Field("id", Datatype.int64(), is_merge_key=True),
|
13
|
+
Field("name", Datatype.string()),
|
14
|
+
Field("age", Datatype.int32()),
|
15
|
+
Field("height", Datatype.int64()),
|
16
|
+
Field("gender", Datatype.string()),
|
17
|
+
]
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def initial_schema():
|
23
|
+
return Schema(
|
24
|
+
fields=[
|
25
|
+
Field("id", Datatype.int32(), is_merge_key=True),
|
26
|
+
Field("name", Datatype.string()),
|
27
|
+
Field("age", Datatype.int32()),
|
28
|
+
]
|
29
|
+
)
|
30
|
+
|
31
|
+
|
32
|
+
@pytest.fixture
|
33
|
+
def extended_schema():
|
34
|
+
return Schema(
|
35
|
+
fields=[
|
36
|
+
Field("id", Datatype.int64(), is_merge_key=True),
|
37
|
+
Field("height", Datatype.int64()),
|
38
|
+
Field("gender", Datatype.string()),
|
39
|
+
]
|
40
|
+
)
|
41
|
+
|
42
|
+
|
43
|
+
@pytest.fixture
|
44
|
+
def sample_data():
|
45
|
+
return {
|
46
|
+
"id": [1, 2, 3],
|
47
|
+
"name": ["Alice", "Bob", "Charlie"],
|
48
|
+
"age": [25, 30, 35],
|
49
|
+
}
|
50
|
+
|
51
|
+
|
52
|
+
@pytest.fixture
|
53
|
+
def extended_data():
|
54
|
+
return {
|
55
|
+
"id": [1, 2, 3],
|
56
|
+
"height": [150, 160, 159],
|
57
|
+
"gender": ["male", "female", "male"],
|
58
|
+
}
|
59
|
+
|
60
|
+
|
61
|
+
@pytest.fixture
|
62
|
+
def combined_data(sample_data, extended_data):
|
63
|
+
data = sample_data.copy()
|
64
|
+
data.update(extended_data)
|
65
|
+
return data
|
66
|
+
|
67
|
+
|
68
|
+
@pytest.fixture
|
69
|
+
def parquet_data(tmp_path, sample_data):
|
70
|
+
parquet_path = tmp_path / "test.parquet"
|
71
|
+
table = pa.Table.from_pydict(sample_data)
|
72
|
+
pa.parquet.write_table(table, parquet_path)
|
73
|
+
return parquet_path
|
74
|
+
|
75
|
+
|
76
|
+
@pytest.fixture
|
77
|
+
def sample_dataset(parquet_data, tmp_path):
|
78
|
+
return Dataset.from_parquet(
|
79
|
+
name="test_dataset",
|
80
|
+
file_uri=str(parquet_data),
|
81
|
+
metadata_uri=str(tmp_path),
|
82
|
+
merge_keys="id",
|
83
|
+
)
|
84
|
+
|
85
|
+
|
86
|
+
def test_end_to_end_scan_with_multiple_schemas(
|
87
|
+
sample_dataset,
|
88
|
+
initial_schema,
|
89
|
+
extended_schema,
|
90
|
+
combined_schema,
|
91
|
+
sample_data,
|
92
|
+
extended_data,
|
93
|
+
combined_data,
|
94
|
+
):
|
95
|
+
# Verify initial scan.
|
96
|
+
verify_pyarrow_scan(sample_dataset.scan().to_arrow(), initial_schema, sample_data)
|
97
|
+
|
98
|
+
# Add a new schema to the dataset
|
99
|
+
sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")
|
100
|
+
new_data = [
|
101
|
+
{"id": 1, "height": 150, "gender": "male"},
|
102
|
+
{"id": 2, "height": 160, "gender": "female"},
|
103
|
+
{"id": 3, "height": 159, "gender": "male"},
|
104
|
+
]
|
105
|
+
writer = sample_dataset.writer(schema_name="schema2")
|
106
|
+
writer.write(new_data)
|
107
|
+
writer.flush()
|
108
|
+
|
109
|
+
# Verify scan with the extended schema retrieves only extended datfa
|
110
|
+
verify_pyarrow_scan(
|
111
|
+
sample_dataset.scan(schema_name="schema2").to_arrow(),
|
112
|
+
extended_schema,
|
113
|
+
extended_data,
|
114
|
+
)
|
115
|
+
|
116
|
+
# Verify a combined scan retrieves data matching the combined schema
|
117
|
+
verify_pyarrow_scan(
|
118
|
+
sample_dataset.scan().to_arrow(), combined_schema, combined_data
|
119
|
+
)
|
@@ -0,0 +1,71 @@
|
|
1
|
+
import pytest
|
2
|
+
import os
|
3
|
+
|
4
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
5
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
6
|
+
DatasetMetastore,
|
7
|
+
)
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
9
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
10
|
+
from deltacat.experimental.storage.rivulet import Schema
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
def sample_schema():
    """Schema built from an int32 ``id`` field and a string ``name`` field.

    NOTE(review): the second positional argument ``"id"`` is presumably the
    merge key -- confirm against ``Schema``'s signature.
    """
    field_specs = {("id", Datatype.int32()), ("name", Datatype.string())}
    return Schema(field_specs, "id")
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
def sample_pydict():
    """Column-oriented sample data: three ids and their matching names."""
    ids = [1, 2, 3]
    names = ["Alice", "Bob", "Charlie"]
    return {"id": ids, "name": names}
|
24
|
+
|
25
|
+
|
26
|
+
def test_dataset_metastore_e2e(sample_schema, tmp_path):
    """Write two manifests and read them back through ``DatasetMetastore``.

    For each manifest spec the listed SST files are created under the file
    provider's URI and a manifest is written with ``DeltacatManifestIO``.
    The test then checks that ``generate_manifests`` yields one accessor per
    spec and that each accessor's schema, level and SST file list match the
    spec with the same level.
    """
    # Setup
    dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
    provider = dataset._file_provider
    manifest_io = DeltacatManifestIO(provider.uri, dataset._locator)

    # One spec per manifest to be written.
    level_one = {"sst_files": ["sst1.sst", "sst2.sst"], "level": 1}
    level_two = {"sst_files": ["sst3.sst", "sst4.sst"], "level": 2}
    manifests_data = [level_one, level_two]

    # Create the SST files on disk, then the manifest that references them.
    manifest_paths = []
    for spec in manifests_data:
        sst_names = spec["sst_files"]
        for sst_name in sst_names:
            sst_path = os.path.join(provider.uri, sst_name)
            with open(sst_path, "w") as handle:
                handle.write("test data")

        written = manifest_io.write(sst_names, sample_schema, spec["level"])
        manifest_paths.append(written)

    # Initialize DatasetMetastore
    metastore = DatasetMetastore(
        provider.uri,
        provider,
        provider._locator,
        manifest_io=manifest_io,
    )

    # Test manifest generation
    accessors = list(metastore.generate_manifests())
    assert len(accessors) == len(manifests_data)

    # Verify each accessor against its spec: level 1 maps to the first spec,
    # any other level is compared against the second.
    for accessor in accessors:
        assert accessor.context.schema == sample_schema
        expected = level_one if accessor.context.level == 1 else level_two
        assert accessor.context.level == expected["level"]
        assert accessor.manifest.sst_files == expected["sst_files"]
|
File without changes
|
File without changes
|
@@ -0,0 +1,162 @@
|
|
1
|
+
import pytest
|
2
|
+
import pyarrow as pa
|
3
|
+
import pyarrow.parquet as pq
|
4
|
+
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import (
|
7
|
+
RangeShard,
|
8
|
+
RangeShardingStrategy,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def sample_numeric_dataset(tmp_path):
    """
    Write a three-row Parquet file (columns ``id``, ``name``, ``age``) under
    ``tmp_path`` and return a Dataset built from it with merge key 'id'.
    The ids are [1, 2, 3], so the smallest key is 1 and the largest is 3.
    """
    parquet_file = tmp_path / "numeric_data.parquet"
    columns = {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "age": [25, 30, 35],
    }
    pq.write_table(pa.Table.from_pydict(columns), parquet_file)

    return Dataset.from_parquet(
        name="numeric_dataset",
        file_uri=str(parquet_file),
        metadata_uri=tmp_path,
        merge_keys="id",
    )
|
35
|
+
|
36
|
+
|
37
|
+
@pytest.fixture
def sample_string_dataset(tmp_path):
    """
    Write a three-row Parquet file (columns ``name``, ``value``) under
    ``tmp_path`` and return a Dataset built from it with the string merge
    key 'name'. The names are 'Alice', 'Charlie' and 'Bob', so the smallest
    key is 'Alice' and the largest is 'Charlie'.
    """
    parquet_file = tmp_path / "string_data.parquet"
    columns = {
        "name": ["Alice", "Charlie", "Bob"],  # deliberately not sorted
        "value": [100, 200, 150],
    }
    pq.write_table(pa.Table.from_pydict(columns), parquet_file)

    return Dataset.from_parquet(
        name="string_dataset",
        file_uri=str(parquet_file),
        metadata_uri=tmp_path,
        merge_keys="name",
    )
|
59
|
+
|
60
|
+
|
61
|
+
def test_shards(sample_numeric_dataset, sample_string_dataset):
    """Split the numeric dataset into two shards and scan the first one.

    The first shard is expected to yield the records with ids 1 and 2
    (names "Alice" and "Bob"), in that order.

    ``sample_string_dataset`` is requested but not used in the body; it is
    kept so the test's fixture signature stays unchanged.
    """
    # Materialize the shards exactly once and use that list for both the
    # count and the index lookup. Counting via list(...) and then indexing
    # the original object only works when shards() returns a sequence; the
    # strategy tests in this file wrap shards() in list() as well.
    shards = list(sample_numeric_dataset.shards(num_shards=2))
    assert len(shards) == 2

    first_shard = shards[0]
    records = list(sample_numeric_dataset.scan(shard=first_shard).to_pydict())
    assert len(records) == 2

    assert records[0]["id"] == 1
    assert records[0]["name"] == "Alice"

    assert records[1]["id"] == 2
    assert records[1]["name"] == "Bob"
|
77
|
+
|
78
|
+
|
79
|
+
def test_range_shard_repr():
    """``repr`` of a RangeShard shows its type tag and both key bounds."""
    expected = "Shard(type=range, min_key=5, max_key=15)"
    actual = repr(RangeShard(min_key=5, max_key=15))
    assert actual == expected
|
82
|
+
|
83
|
+
|
84
|
+
def test_range_shard_split_integers():
    """Splitting the integer range 1..10 in two gives [1, 5] and [6, 10]."""
    shards = RangeShard.split(global_min=1, global_max=10, num_shards=2)
    assert len(shards) == 2

    lower, upper = shards[0], shards[1]
    assert lower.min_key == 1
    assert lower.max_key == 5
    assert upper.min_key == 6
    assert upper.max_key == 10
|
92
|
+
|
93
|
+
|
94
|
+
def test_range_shard_split_integers_single_shard():
    """Asking for one shard over 1..10 gives a single shard spanning [1, 10]."""
    shards = RangeShard.split(global_min=1, global_max=10, num_shards=1)
    assert len(shards) == 1

    only_shard = shards[0]
    assert only_shard.min_key == 1
    assert only_shard.max_key == 10
|
99
|
+
|
100
|
+
|
101
|
+
def test_range_shard_split_integers_same_value():
    """A range whose min and max are both 5 gives one shard, though 3 were requested."""
    requested = 3
    shards = RangeShard.split(global_min=5, global_max=5, num_shards=requested)
    assert len(shards) == 1
|
104
|
+
|
105
|
+
|
106
|
+
def test_range_sharding_strategy_integers(sample_numeric_dataset):
    """Two range shards over the numeric dataset cover [1, 2] and [3, 3]."""
    shard_iter = RangeShardingStrategy().shards(
        num_shards=2, metastore=sample_numeric_dataset._metastore
    )
    shards = list(shard_iter)

    assert len(shards) == 2, "Expected 2 shards for dataset with keys [1,2,3]"

    lower, upper = shards
    for shard in (lower, upper):
        assert isinstance(shard, RangeShard)

    assert lower.min_key == 1
    assert lower.max_key == 2
    assert upper.min_key == 3
    assert upper.max_key == 3
|
121
|
+
|
122
|
+
|
123
|
+
def test_range_sharding_strategy_integers_single_shard(sample_numeric_dataset):
    """A single range shard over the numeric dataset spans keys 1 through 3."""
    shard_iter = RangeShardingStrategy().shards(
        num_shards=1, metastore=sample_numeric_dataset._metastore
    )
    shards = list(shard_iter)
    assert len(shards) == 1

    only_shard = shards[0]
    assert only_shard.min_key == 1
    assert only_shard.max_key == 3
|
132
|
+
|
133
|
+
|
134
|
+
def test_range_sharding_strategy_strings(sample_string_dataset):
    """Two range shards over the string dataset run from "Alice" to "Charlie".

    The first shard starts at "Alice" and ends strictly before "Charlie";
    the second starts where the first ends and stops at "Charlie".
    """
    shard_iter = RangeShardingStrategy().shards(
        num_shards=2, metastore=sample_string_dataset._metastore
    )
    shards = list(shard_iter)

    assert len(shards) == 2, "Expected 2 shards for string-based dataset"
    lower, upper = shards
    for shard in (lower, upper):
        assert isinstance(shard, RangeShard)

    assert lower.min_key == "Alice"
    assert lower.max_key < "Charlie"

    # The two string shards share their boundary key.
    assert upper.min_key == lower.max_key
    assert upper.max_key == "Charlie"
|
150
|
+
|
151
|
+
|
152
|
+
def test_range_sharding_strategy_strings_single_shard(sample_string_dataset):
    """A single range shard over the string dataset spans "Alice" to "Charlie"."""
    shard_iter = RangeShardingStrategy().shards(
        num_shards=1, metastore=sample_string_dataset._metastore
    )
    shards = list(shard_iter)

    assert len(shards) == 1

    only_shard = shards[0]
    assert only_shard.min_key == "Alice"
    assert only_shard.max_key == "Charlie"
|
@@ -3,9 +3,11 @@ from deltacat.utils.metafile_locator import _find_partition_path
|
|
3
3
|
import pytest
|
4
4
|
|
5
5
|
import pyarrow as pa
|
6
|
-
from deltacat.storage.rivulet import Schema, Field, Datatype
|
7
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
8
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
9
11
|
|
10
12
|
|
11
13
|
@pytest.fixture
|
@@ -2,11 +2,11 @@ import os
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
6
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
7
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
8
|
-
from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
-
from deltacat.storage.rivulet import Schema, Field
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
7
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
8
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
10
10
|
import pyarrow as pa
|
11
11
|
import pyarrow.parquet
|
12
12
|
|
@@ -2,16 +2,16 @@ from typing import List, FrozenSet, Dict
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.metastore.delta import DeltaContext
|
6
|
-
from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
-
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
+
from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
|
8
8
|
BlockIntervalTree,
|
9
9
|
BlockGroup,
|
10
10
|
OrderedBlockGroups,
|
11
11
|
Block,
|
12
12
|
)
|
13
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
14
|
-
from deltacat.storage.rivulet import Schema
|
13
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
14
|
+
from deltacat.experimental.storage.rivulet import Schema
|
15
15
|
|
16
16
|
|
17
17
|
@pytest.fixture
|