deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
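The bulk of this release moves the Iceberg, Rivulet, and Daft integration modules under a new deltacat.experimental namespace, as the rename entries above show. A minimal sketch of the corresponding import update for downstream code, with module paths taken directly from the renames listed above (the aliased impl import mirrors the new test code shown below):

# 2.0.0b9 paths (removed in this release):
#   from deltacat.storage.rivulet.dataset import Dataset
#   from deltacat.storage.rivulet.schema.schema import Schema

# 2.0.0b11 equivalents under the experimental namespace:
from deltacat.experimental.storage.rivulet.dataset import Dataset
from deltacat.experimental.storage.rivulet.schema.schema import Schema
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig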
deltacat/tests/daft/test_model.py
@@ -0,0 +1,97 @@
+import pytest
+import pyarrow as pa
+from daft import DataType, TimeUnit
+from daft.logical.schema import Field as DaftField
+
+from deltacat.storage.model.transform import IdentityTransform
+from deltacat.storage.model.partition import PartitionKey
+from deltacat.utils.daft import DaftFieldMapper, DaftPartitionKeyMapper
+
+from deltacat.storage.model.schema import Field, Schema
+
+
+class TestDaftFieldMapper:
+    def test_field_mapper_basic_types(self):
+        """Test mapping basic data types between Daft and PyArrow fields"""
+        test_cases = [
+            (DataType.int32(), pa.int32()),
+            (DataType.int64(), pa.int64()),
+            (DataType.float32(), pa.float32()),
+            (DataType.float64(), pa.float64()),
+            (DataType.string(), pa.large_string()),
+            (DataType.bool(), pa.bool_()),
+            (DataType.binary(), pa.large_binary()),
+            (DataType.date(), pa.date32()),
+            (DataType.timestamp(TimeUnit.ns()), pa.timestamp("ns")),
+        ]
+
+        for daft_type, pa_type in test_cases:
+            # Create test fields
+            daft_field = DaftField.create(
+                name="test_field",
+                dtype=daft_type,
+            )
+
+            # Daft to PyArrow
+            pa_field = DaftFieldMapper.map(daft_field)
+            assert pa_field is not None
+            assert pa_field.name == "test_field"
+            assert pa_field.type == pa_type  # type: ignore
+            assert pa_field.nullable is True
+
+            # PyArrow to Daft
+            daft_field_back = DaftFieldMapper.unmap(pa_field)
+            assert daft_field_back is not None
+            assert daft_field_back.name == daft_field.name
+            assert daft_field_back.dtype == daft_field.dtype
+
+
+class TestDaftPartitionKeyMapper:
+    def test_unmap(self):
+        """
+        Test unmap method of DaftPartitionKeyMapper when obj is not None, schema is provided,
+        len(obj.key) is 1, and dc_field is found in the schema.
+
+        This test verifies that the method correctly converts a PartitionKey to a DaftPartitionField
+        when all conditions are met and the field exists in the schema.
+        """
+        # Create a mock schema
+        schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
+        # Create a PartitionKey object
+        partition_key = PartitionKey(
+            key=["test_field"], transform=IdentityTransform(), name="partition_field"
+        )
+
+        result = DaftPartitionKeyMapper.unmap(obj=partition_key, schema=schema)
+        assert result is not None
+        assert result.field.name() == "partition_field"
+        assert DataType._from_pydatatype(result.field.dtype()) == DataType.int32()
+
+    def test_unmap_no_field_locator(self):
+        schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
+        partition_key = PartitionKey(key=[], name="partition_field")
+
+        with pytest.raises(ValueError) as excinfo:
+            DaftPartitionKeyMapper.unmap(partition_key, schema)
+
+        assert "At least 1 PartitionKey FieldLocator is expected" in str(excinfo.value)
+
+    def test_unmap_partition_key_not_found(self):
+        schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
+        partition_key = PartitionKey(
+            key=["test_field_2"], transform=IdentityTransform(), name="partition_field"
+        )
+
+        with pytest.raises(KeyError) as excinfo:
+            DaftPartitionKeyMapper.unmap(partition_key, schema)
+
+        assert "Column test_field_2 does not exist in schema" in str(excinfo.value)
+
+    def test_unmap_partition_name_not_defined(self):
+        schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
+        partition_key = PartitionKey(key=[])
+
+        with pytest.raises(ValueError) as excinfo:
+            DaftPartitionKeyMapper.unmap(partition_key, schema)
+
+        assert "Name is required for PartitionKey conversion" in str(excinfo.value)
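As a usage note, the field mappers exercised above compose over whole schemas as well. A minimal sketch, assuming only the DaftFieldMapper.map/unmap calls shown in the test (pa.schema and schema iteration are standard PyArrow; the field names here are illustrative):

import pyarrow as pa
from daft import DataType
from daft.logical.schema import Field as DaftField
from deltacat.utils.daft import DaftFieldMapper

daft_fields = [
    DaftField.create(name="id", dtype=DataType.int64()),
    DaftField.create(name="name", dtype=DataType.string()),
]
# Daft -> PyArrow; per the test above, Daft strings map to pa.large_string()
pa_schema = pa.schema([DaftFieldMapper.map(f) for f in daft_fields])
# PyArrow -> Daft; unmap restores the original Daft dtypes
round_tripped = [DaftFieldMapper.unmap(f) for f in pa_schema]
assert [f.dtype for f in round_tripped] == [f.dtype for f in daft_fields]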
deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py
@@ -0,0 +1,71 @@
+import tempfile
+import shutil
+import uuid
+import deltacat
+import pytest
+from deltacat import Field, Schema
+from pyiceberg.catalog import CatalogType
+
+import pyarrow as pa
+
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+
+
+@pytest.fixture
+def schema_a():
+    return Schema.of(
+        [
+            Field.of(
+                field=pa.field("col1", pa.int32(), nullable=False),
+                field_id=1,
+                is_merge_key=True,
+            )
+        ]
+    )
+
+
+class TestIcebergCatalogInitialization:
+    temp_dir = None
+
+    @classmethod
+    def setup_class(cls):
+        cls.temp_dir = tempfile.mkdtemp()
+
+    @classmethod
+    def teardown_class(cls):
+        shutil.rmtree(cls.temp_dir)
+
+    def test_iceberg_catalog_and_table_create(self, schema_a):
+
+        # Register a random catalog name to avoid concurrent test conflicts
+        catalog_name = str(uuid.uuid4())
+
+        config = IcebergCatalogConfig(
+            type=CatalogType.SQL,
+            properties={
+                "warehouse": self.temp_dir,
+                "uri": f"sqlite:////{self.temp_dir}/sql-catalog.db",
+            },
+        )
+
+        # Initialize with the PyIceberg catalog
+        catalog = deltacat.IcebergCatalog.from_config(config)
+        deltacat.init(
+            {catalog_name: catalog},
+            force=True,
+        )
+
+        table_def = deltacat.create_table(
+            "test_table", catalog=catalog_name, schema=schema_a
+        )
+
+        # Fetch table we just created
+        fetched_table_def = deltacat.get_table("test_table", catalog=catalog_name)
+        assert table_def.table_version == fetched_table_def.table_version
+
+        # For now, just check that we created a table version with an equivalent schema
+        assert table_def.table_version.schema.equivalent_to(schema_a)
+
+        # Sanity check that list namespaces works
+        namespaces = deltacat.list_namespaces(catalog=catalog_name).all_items()
+        assert table_def.table.namespace in [n.namespace for n in namespaces]
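Condensing the test above into a standalone sketch of the new registration flow: build an IcebergCatalogConfig, wrap it via deltacat.IcebergCatalog.from_config, and register it under a name with deltacat.init. The warehouse path and catalog name below are placeholders; every call appears in the test above:

import deltacat
import pyarrow as pa
from deltacat import Field, Schema
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
from pyiceberg.catalog import CatalogType

warehouse = "/tmp/dc-warehouse"  # placeholder warehouse directory
config = IcebergCatalogConfig(
    type=CatalogType.SQL,
    properties={
        "warehouse": warehouse,
        "uri": f"sqlite:////{warehouse}/sql-catalog.db",
    },
)
# Register the catalog under a caller-chosen name
deltacat.init({"my_iceberg": deltacat.IcebergCatalog.from_config(config)}, force=True)

schema = Schema.of(
    [Field.of(field=pa.field("col1", pa.int32(), nullable=False), field_id=1, is_merge_key=True)]
)
table_def = deltacat.create_table("test_table", catalog="my_iceberg", schema=schema)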
deltacat/tests/experimental/daft/test_deltacat_daft_integration.py
@@ -0,0 +1,136 @@
+import daft
+from daft import Table, Identifier
+import pytest
+import uuid
+
+from deltacat.catalog import Catalog as DeltaCATCatalog
+from deltacat.catalog import CatalogProperties
+from deltacat.experimental.daft.daft_catalog import DaftCatalog
+import shutil
+import tempfile
+
+from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+
+from pyiceberg.catalog import CatalogType
+
+
+class TestCatalogIntegration:
+    @classmethod
+    def setup_method(cls):
+        cls.tmpdir = tempfile.mkdtemp()
+
+    @classmethod
+    def teardown_method(cls):
+        shutil.rmtree(cls.tmpdir)
+
+    def test_create_table(self):
+        """Demonstrate DeltaCAT-Daft integration."""
+        # Create a DeltaCAT catalog
+        catalog_props = CatalogProperties(root=self.tmpdir)
+        dc_catalog = DeltaCATCatalog(catalog_props)
+
+        # Use a random catalog name to prevent namespacing conflicts with other tests
+        # Convert the DeltaCAT catalog to a Daft catalog
+        catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
+
+        daft_catalog = DaftCatalog(catalog=dc_catalog, name=catalog_name)
+
+        # Register the catalog with Daft's catalog system
+        daft.attach_catalog(daft_catalog, catalog_name)
+
+        # Create a sample DataFrame
+        df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+        # Create then get table
+        daft_catalog.create_table(Identifier("example_table"), df)
+        table: Table = daft_catalog.get_table(Identifier("example_table"))
+        assert table.name == "example_table"
+
+    def test_get_table(self):
+        """Test getting a table from the DeltaCAT-Daft catalog."""
+        # Create a DeltaCAT catalog using the existing tmpdir
+        catalog_props = CatalogProperties(root=self.tmpdir)
+        dc_catalog = DeltaCATCatalog(catalog_props)
+
+        # Convert to DaftCatalog and attach to Daft
+        catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
+        daft_catalog = DaftCatalog(dc_catalog, catalog_name)
+        daft.attach_catalog(daft_catalog, catalog_name)
+
+        # Create a sample DataFrame and table
+        df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+        table_name = "test_get_table"
+        daft_catalog.create_table(Identifier(table_name), df)
+
+        # Get the table using different forms of identifiers
+        table2 = daft_catalog.get_table(Identifier(table_name))
+        assert table2 is not None
+        assert table2.name == table_name
+
+        # 3. With namespace. DeltaCAT used the default namespace since it was not provided
+        table3 = daft_catalog.get_table(Identifier("default", table_name))
+        assert table3 is not None
+        assert table3.name == table_name
+
+        # Test non-existent table raises an appropriate error
+        with pytest.raises(ValueError, match="Table nonexistent_table not found"):
+            daft_catalog.get_table(Identifier("nonexistent_table"))
+
+
+class TestIcebergCatalogIntegration:
+    @classmethod
+    def setup_method(cls):
+        cls.tmpdir = tempfile.mkdtemp()
+
+    @classmethod
+    def teardown_method(cls):
+        shutil.rmtree(cls.tmpdir)
+
+    def test_iceberg_catalog_integration(self):
+        # Create a unique warehouse path for this test
+        warehouse_path = self.tmpdir
+
+        # Configure an Iceberg catalog with the warehouse path
+        config = IcebergCatalogConfig(
+            type=CatalogType.SQL,
+            properties={
+                "warehouse": warehouse_path,
+                "uri": f"sqlite:////{warehouse_path}/sql-catalog.db",
+            },
+        )
+        dc_catalog = IcebergCatalog.from_config(config)
+
+        # Convert the DeltaCAT catalog to a Daft catalog
+        catalog_name = f"deltacat_iceberg_{uuid.uuid4().hex[:8]}"
+        daft_catalog = DaftCatalog(dc_catalog, catalog_name)
+        daft.attach_catalog(daft_catalog, catalog_name)
+
+        # Create a sample DataFrame
+        df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+
+        # Create a table with the Daft catalog
+        table_name = "example_table"
+        namespace = "example_namespace"
+        daft_catalog.create_table(Identifier(namespace, table_name), df)
+
+        # Query that the Iceberg table exists using PyIceberg
+        iceberg_catalog = dc_catalog.inner
+
+        # Verify the table exists in the Iceberg catalog
+        tables = iceberg_catalog.list_tables(namespace)
+
+        assert any(
+            t[0] == namespace and t[1] == table_name for t in tables
+        ), f"Table {table_name} not found in Iceberg catalog"
+
+        # Load the table from Iceberg catalog and verify its properties
+        iceberg_table = iceberg_catalog.load_table(f"{namespace}.{table_name}")
+
+        # Check that the schema matches our DataFrame
+        schema = iceberg_table.schema()
+        assert (
+            schema.find_field("id") is not None
+        ), "Field 'id' not found in table schema"
+        assert (
+            schema.find_field("value") is not None
+        ), "Field 'value' not found in table schema"
deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py
@@ -3,9 +3,9 @@ import io
 import pytest
 from faker import Faker
 
-from deltacat.storage.rivulet.schema.datatype import Datatype
-from deltacat.storage.rivulet.mvp.Table import MvpTable
-from deltacat.storage.rivulet.schema.schema import Schema
+from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
+from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
+from deltacat.experimental.storage.rivulet.schema.schema import Schema
 import random
 import string
 from PIL import Image
deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py
@@ -2,8 +2,9 @@ import pytest
 
 import pyarrow as pa
 import pyarrow.parquet as pq
-from deltacat import Datatype
-from deltacat.storage.rivulet import
+from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+from deltacat.experimental.storage.rivulet import Schema, Field
 from deltacat.utils.metafile_locator import _find_partition_path
 
 
deltacat/tests/experimental/storage/rivulet/reader/query_expression.py
@@ -0,0 +1,80 @@
+import pytest
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
+from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
+
+
+@pytest.fixture
+def sample_range_shard():
+    return RangeShard(min_key=5, max_key=15)
+
+
+@pytest.fixture
+def sample_string_shard():
+    return RangeShard(min_key="apple", max_key="zebra")
+
+
+def test_with_key():
+    query = QueryExpression[int]()
+    query.with_key(5)
+    assert query.min_key == 5
+    assert query.max_key == 5
+    with pytest.raises(ValueError):
+        query.with_key(10)
+
+
+def test_with_range():
+    query = QueryExpression[int]()
+    query.with_range(10, 5)
+    assert query.min_key == 5
+    assert query.max_key == 10
+    with pytest.raises(ValueError):
+        query.with_range(20, 25)
+
+
+def test_matches_query():
+    query = QueryExpression[int]()
+    assert query.matches_query(5)
+    assert query.matches_query(-999)
+    query.with_range(10, 20)
+    assert query.matches_query(15)
+    assert not query.matches_query(25)
+    assert not query.matches_query(5)
+
+
+def test_below_query_range():
+    query = QueryExpression[int]()
+    assert not query.below_query_range(5)
+    query.with_range(10, 20)
+    assert query.below_query_range(5)
+    assert not query.below_query_range(15)
+    assert not query.below_query_range(25)
+
+
+def test_with_shard_existing_query(sample_range_shard):
+    query = QueryExpression[int]().with_range(10, 20)
+    new_query = QueryExpression.with_shard(query, sample_range_shard)
+    assert new_query.min_key == 5
+    assert new_query.max_key == 20
+
+
+def test_with_shard_none_shard():
+    query = QueryExpression[int]().with_range(10, 20)
+    result = QueryExpression.with_shard(query, None)
+    assert result.min_key == 10
+    assert result.max_key == 20
+
+
+def test_with_shard_existing_query_string(sample_string_shard):
+    query = QueryExpression[str]().with_range("banana", "yellow")
+    new_query = QueryExpression.with_shard(query, sample_string_shard)
+    assert new_query.min_key == "apple"
+    assert new_query.max_key == "zebra"
+
+
+def test_query_expression_string_matches():
+    query = QueryExpression[str]().with_range("apple", "cat")
+    assert query.matches_query("apple")
+    assert query.matches_query("banana")
+    assert not query.matches_query("dog")
deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py
@@ -0,0 +1,119 @@
+import pytest
+from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
+import pyarrow as pa
+from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+
+
+@pytest.fixture
+def combined_schema():
+    return Schema(
+        fields=[
+            Field("id", Datatype.int64(), is_merge_key=True),
+            Field("name", Datatype.string()),
+            Field("age", Datatype.int32()),
+            Field("height", Datatype.int64()),
+            Field("gender", Datatype.string()),
+        ]
+    )
+
+
+@pytest.fixture
+def initial_schema():
+    return Schema(
+        fields=[
+            Field("id", Datatype.int32(), is_merge_key=True),
+            Field("name", Datatype.string()),
+            Field("age", Datatype.int32()),
+        ]
+    )
+
+
+@pytest.fixture
+def extended_schema():
+    return Schema(
+        fields=[
+            Field("id", Datatype.int64(), is_merge_key=True),
+            Field("height", Datatype.int64()),
+            Field("gender", Datatype.string()),
+        ]
+    )
+
+
+@pytest.fixture
+def sample_data():
+    return {
+        "id": [1, 2, 3],
+        "name": ["Alice", "Bob", "Charlie"],
+        "age": [25, 30, 35],
+    }
+
+
+@pytest.fixture
+def extended_data():
+    return {
+        "id": [1, 2, 3],
+        "height": [150, 160, 159],
+        "gender": ["male", "female", "male"],
+    }
+
+
+@pytest.fixture
+def combined_data(sample_data, extended_data):
+    data = sample_data.copy()
+    data.update(extended_data)
+    return data
+
+
+@pytest.fixture
+def parquet_data(tmp_path, sample_data):
+    parquet_path = tmp_path / "test.parquet"
+    table = pa.Table.from_pydict(sample_data)
+    pa.parquet.write_table(table, parquet_path)
+    return parquet_path
+
+
+@pytest.fixture
+def sample_dataset(parquet_data, tmp_path):
+    return Dataset.from_parquet(
+        name="test_dataset",
+        file_uri=str(parquet_data),
+        metadata_uri=str(tmp_path),
+        merge_keys="id",
+    )
+
+
+def test_end_to_end_scan_with_multiple_schemas(
+    sample_dataset,
+    initial_schema,
+    extended_schema,
+    combined_schema,
+    sample_data,
+    extended_data,
+    combined_data,
+):
+    # Verify initial scan.
+    verify_pyarrow_scan(sample_dataset.scan().to_arrow(), initial_schema, sample_data)
+
+    # Add a new schema to the dataset
+    sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")
+    new_data = [
+        {"id": 1, "height": 150, "gender": "male"},
+        {"id": 2, "height": 160, "gender": "female"},
+        {"id": 3, "height": 159, "gender": "male"},
+    ]
+    writer = sample_dataset.writer(schema_name="schema2")
+    writer.write(new_data)
+    writer.flush()
+
+    # Verify scan with the extended schema retrieves only extended data
+    verify_pyarrow_scan(
+        sample_dataset.scan(schema_name="schema2").to_arrow(),
+        extended_schema,
+        extended_data,
+    )
+
+    # Verify a combined scan retrieves data matching the combined schema
+    verify_pyarrow_scan(
+        sample_dataset.scan().to_arrow(), combined_schema, combined_data
+    )
deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py
@@ -0,0 +1,71 @@
+import pytest
+import os
+
+from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
+from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+    DatasetMetastore,
+)
+from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+from deltacat.experimental.storage.rivulet import Schema
+
+
+@pytest.fixture
+def sample_schema():
+    return Schema(
+        {("id", Datatype.int32()), ("name", Datatype.string())},
+        "id",
+    )
+
+
+@pytest.fixture
+def sample_pydict():
+    return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
+
+
+def test_dataset_metastore_e2e(sample_schema, tmp_path):
+    # Setup
+    dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
+    file_provider = dataset._file_provider
+    manifest_io = DeltacatManifestIO(file_provider.uri, dataset._locator)
+
+    # Create multiple manifests
+    manifests_data = [
+        {"sst_files": ["sst1.sst", "sst2.sst"], "level": 1},
+        {"sst_files": ["sst3.sst", "sst4.sst"], "level": 2},
+    ]
+
+    # Create SST files and manifests
+    manifest_paths = []
+    for manifest_data in manifests_data:
+        sst_files = manifest_data["sst_files"]
+        for sst in sst_files:
+            with open(os.path.join(file_provider.uri, sst), "w") as f:
+                f.write("test data")
+
+        manifest_path = manifest_io.write(
+            sst_files, sample_schema, manifest_data["level"]
+        )
+        manifest_paths.append(manifest_path)
+
+    # Initialize DatasetMetastore
+    metastore = DatasetMetastore(
+        file_provider.uri,
+        file_provider,
+        file_provider._locator,
+        manifest_io=manifest_io,
+    )
+
+    # Test manifest generation
+    manifest_accessors = list(metastore.generate_manifests())
+    assert len(manifest_accessors) == len(manifests_data)
+
+    # Verify each manifest accessor
+    for accessor in manifest_accessors:
+        assert accessor.context.schema == sample_schema
+        manifests_data_index = 0 if accessor.context.level == 1 else 1
+        assert accessor.context.level == manifests_data[manifests_data_index]["level"]
+        assert (
+            accessor.manifest.sst_files
+            == manifests_data[manifests_data_index]["sst_files"]
+        )