deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +19 -15
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +18 -8
- deltacat/catalog/model/catalog.py +111 -73
- deltacat/catalog/model/properties.py +25 -22
- deltacat/compute/jobs/client.py +7 -5
- deltacat/constants.py +1 -2
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/model/shard.py +6 -2
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/types/media.py +3 -3
- deltacat/utils/daft.py +530 -4
- deltacat/utils/export.py +3 -1
- deltacat/utils/url.py +1 -1
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → examples/experimental}/__init__.py +0 -0
- /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
- /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,9 @@ import pytest
|
|
2
2
|
|
3
3
|
import pyarrow as pa
|
4
4
|
import pyarrow.parquet as pq
|
5
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
6
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
7
|
-
from deltacat.storage.rivulet import Schema, Field
|
5
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
6
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
8
8
|
from deltacat.utils.metafile_locator import _find_partition_path
|
9
9
|
|
10
10
|
|
File without changes
|
@@ -0,0 +1,80 @@
|
|
1
|
+
import pytest
|
2
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
3
|
+
QueryExpression,
|
4
|
+
)
|
5
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def sample_range_shard():
|
10
|
+
return RangeShard(min_key=5, max_key=15)
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def sample_string_shard():
|
15
|
+
return RangeShard(min_key="apple", max_key="zebra")
|
16
|
+
|
17
|
+
|
18
|
+
def test_with_key():
|
19
|
+
query = QueryExpression[int]()
|
20
|
+
query.with_key(5)
|
21
|
+
assert query.min_key == 5
|
22
|
+
assert query.max_key == 5
|
23
|
+
with pytest.raises(ValueError):
|
24
|
+
query.with_key(10)
|
25
|
+
|
26
|
+
|
27
|
+
def test_with_range():
|
28
|
+
query = QueryExpression[int]()
|
29
|
+
query.with_range(10, 5)
|
30
|
+
assert query.min_key == 5
|
31
|
+
assert query.max_key == 10
|
32
|
+
with pytest.raises(ValueError):
|
33
|
+
query.with_range(20, 25)
|
34
|
+
|
35
|
+
|
36
|
+
def test_matches_query():
|
37
|
+
query = QueryExpression[int]()
|
38
|
+
assert query.matches_query(5)
|
39
|
+
assert query.matches_query(-999)
|
40
|
+
query.with_range(10, 20)
|
41
|
+
assert query.matches_query(15)
|
42
|
+
assert not query.matches_query(25)
|
43
|
+
assert not query.matches_query(5)
|
44
|
+
|
45
|
+
|
46
|
+
def test_below_query_range():
|
47
|
+
query = QueryExpression[int]()
|
48
|
+
assert not query.below_query_range(5)
|
49
|
+
query.with_range(10, 20)
|
50
|
+
assert query.below_query_range(5)
|
51
|
+
assert not query.below_query_range(15)
|
52
|
+
assert not query.below_query_range(25)
|
53
|
+
|
54
|
+
|
55
|
+
def test_with_shard_existing_query(sample_range_shard):
|
56
|
+
query = QueryExpression[int]().with_range(10, 20)
|
57
|
+
new_query = QueryExpression.with_shard(query, sample_range_shard)
|
58
|
+
assert new_query.min_key == 5
|
59
|
+
assert new_query.max_key == 20
|
60
|
+
|
61
|
+
|
62
|
+
def test_with_shard_none_shard():
|
63
|
+
query = QueryExpression[int]().with_range(10, 20)
|
64
|
+
result = QueryExpression.with_shard(query, None)
|
65
|
+
assert result.min_key == 10
|
66
|
+
assert result.max_key == 20
|
67
|
+
|
68
|
+
|
69
|
+
def test_with_shard_existing_query_string(sample_string_shard):
|
70
|
+
query = QueryExpression[str]().with_range("banana", "yellow")
|
71
|
+
new_query = QueryExpression.with_shard(query, sample_string_shard)
|
72
|
+
assert new_query.min_key == "apple"
|
73
|
+
assert new_query.max_key == "zebra"
|
74
|
+
|
75
|
+
|
76
|
+
def test_query_expression_string_matches():
|
77
|
+
query = QueryExpression[str]().with_range("apple", "cat")
|
78
|
+
assert query.matches_query("apple")
|
79
|
+
assert query.matches_query("banana")
|
80
|
+
assert not query.matches_query("dog")
|
@@ -0,0 +1,119 @@
|
|
1
|
+
import pytest
|
2
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
|
3
|
+
import pyarrow as pa
|
4
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
|
9
|
+
def combined_schema():
|
10
|
+
return Schema(
|
11
|
+
fields=[
|
12
|
+
Field("id", Datatype.int64(), is_merge_key=True),
|
13
|
+
Field("name", Datatype.string()),
|
14
|
+
Field("age", Datatype.int32()),
|
15
|
+
Field("height", Datatype.int64()),
|
16
|
+
Field("gender", Datatype.string()),
|
17
|
+
]
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def initial_schema():
|
23
|
+
return Schema(
|
24
|
+
fields=[
|
25
|
+
Field("id", Datatype.int32(), is_merge_key=True),
|
26
|
+
Field("name", Datatype.string()),
|
27
|
+
Field("age", Datatype.int32()),
|
28
|
+
]
|
29
|
+
)
|
30
|
+
|
31
|
+
|
32
|
+
@pytest.fixture
|
33
|
+
def extended_schema():
|
34
|
+
return Schema(
|
35
|
+
fields=[
|
36
|
+
Field("id", Datatype.int64(), is_merge_key=True),
|
37
|
+
Field("height", Datatype.int64()),
|
38
|
+
Field("gender", Datatype.string()),
|
39
|
+
]
|
40
|
+
)
|
41
|
+
|
42
|
+
|
43
|
+
@pytest.fixture
|
44
|
+
def sample_data():
|
45
|
+
return {
|
46
|
+
"id": [1, 2, 3],
|
47
|
+
"name": ["Alice", "Bob", "Charlie"],
|
48
|
+
"age": [25, 30, 35],
|
49
|
+
}
|
50
|
+
|
51
|
+
|
52
|
+
@pytest.fixture
|
53
|
+
def extended_data():
|
54
|
+
return {
|
55
|
+
"id": [1, 2, 3],
|
56
|
+
"height": [150, 160, 159],
|
57
|
+
"gender": ["male", "female", "male"],
|
58
|
+
}
|
59
|
+
|
60
|
+
|
61
|
+
@pytest.fixture
|
62
|
+
def combined_data(sample_data, extended_data):
|
63
|
+
data = sample_data.copy()
|
64
|
+
data.update(extended_data)
|
65
|
+
return data
|
66
|
+
|
67
|
+
|
68
|
+
@pytest.fixture
|
69
|
+
def parquet_data(tmp_path, sample_data):
|
70
|
+
parquet_path = tmp_path / "test.parquet"
|
71
|
+
table = pa.Table.from_pydict(sample_data)
|
72
|
+
pa.parquet.write_table(table, parquet_path)
|
73
|
+
return parquet_path
|
74
|
+
|
75
|
+
|
76
|
+
@pytest.fixture
|
77
|
+
def sample_dataset(parquet_data, tmp_path):
|
78
|
+
return Dataset.from_parquet(
|
79
|
+
name="test_dataset",
|
80
|
+
file_uri=str(parquet_data),
|
81
|
+
metadata_uri=str(tmp_path),
|
82
|
+
merge_keys="id",
|
83
|
+
)
|
84
|
+
|
85
|
+
|
86
|
+
def test_end_to_end_scan_with_multiple_schemas(
|
87
|
+
sample_dataset,
|
88
|
+
initial_schema,
|
89
|
+
extended_schema,
|
90
|
+
combined_schema,
|
91
|
+
sample_data,
|
92
|
+
extended_data,
|
93
|
+
combined_data,
|
94
|
+
):
|
95
|
+
# Verify initial scan.
|
96
|
+
verify_pyarrow_scan(sample_dataset.scan().to_arrow(), initial_schema, sample_data)
|
97
|
+
|
98
|
+
# Add a new schema to the dataset
|
99
|
+
sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")
|
100
|
+
new_data = [
|
101
|
+
{"id": 1, "height": 150, "gender": "male"},
|
102
|
+
{"id": 2, "height": 160, "gender": "female"},
|
103
|
+
{"id": 3, "height": 159, "gender": "male"},
|
104
|
+
]
|
105
|
+
writer = sample_dataset.writer(schema_name="schema2")
|
106
|
+
writer.write(new_data)
|
107
|
+
writer.flush()
|
108
|
+
|
109
|
+
# Verify scan with the extended schema retrieves only extended data
|
110
|
+
verify_pyarrow_scan(
|
111
|
+
sample_dataset.scan(schema_name="schema2").to_arrow(),
|
112
|
+
extended_schema,
|
113
|
+
extended_data,
|
114
|
+
)
|
115
|
+
|
116
|
+
# Verify a combined scan retrieves data matching the combined schema
|
117
|
+
verify_pyarrow_scan(
|
118
|
+
sample_dataset.scan().to_arrow(), combined_schema, combined_data
|
119
|
+
)
|
@@ -0,0 +1,71 @@
|
|
1
|
+
import pytest
|
2
|
+
import os
|
3
|
+
|
4
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
5
|
+
from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
|
6
|
+
DatasetMetastore,
|
7
|
+
)
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
9
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
10
|
+
from deltacat.experimental.storage.rivulet import Schema
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def sample_schema():
|
15
|
+
return Schema(
|
16
|
+
{("id", Datatype.int32()), ("name", Datatype.string())},
|
17
|
+
"id",
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def sample_pydict():
|
23
|
+
return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
|
24
|
+
|
25
|
+
|
26
|
+
def test_dataset_metastore_e2e(sample_schema, tmp_path):
|
27
|
+
# Setup
|
28
|
+
dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
|
29
|
+
file_provider = dataset._file_provider
|
30
|
+
manifest_io = DeltacatManifestIO(file_provider.uri, dataset._locator)
|
31
|
+
|
32
|
+
# Create multiple manifests
|
33
|
+
manifests_data = [
|
34
|
+
{"sst_files": ["sst1.sst", "sst2.sst"], "level": 1},
|
35
|
+
{"sst_files": ["sst3.sst", "sst4.sst"], "level": 2},
|
36
|
+
]
|
37
|
+
|
38
|
+
# Create SST files and manifests
|
39
|
+
manifest_paths = []
|
40
|
+
for manifest_data in manifests_data:
|
41
|
+
sst_files = manifest_data["sst_files"]
|
42
|
+
for sst in sst_files:
|
43
|
+
with open(os.path.join(file_provider.uri, sst), "w") as f:
|
44
|
+
f.write("test data")
|
45
|
+
|
46
|
+
manifest_path = manifest_io.write(
|
47
|
+
sst_files, sample_schema, manifest_data["level"]
|
48
|
+
)
|
49
|
+
manifest_paths.append(manifest_path)
|
50
|
+
|
51
|
+
# Initialize DatasetMetastore
|
52
|
+
metastore = DatasetMetastore(
|
53
|
+
file_provider.uri,
|
54
|
+
file_provider,
|
55
|
+
file_provider._locator,
|
56
|
+
manifest_io=manifest_io,
|
57
|
+
)
|
58
|
+
|
59
|
+
# Test manifest generation
|
60
|
+
manifest_accessors = list(metastore.generate_manifests())
|
61
|
+
assert len(manifest_accessors) == len(manifests_data)
|
62
|
+
|
63
|
+
# Verify each manifest accessor
|
64
|
+
for accessor in manifest_accessors:
|
65
|
+
assert accessor.context.schema == sample_schema
|
66
|
+
manifests_data_index = 0 if accessor.context.level == 1 else 1
|
67
|
+
assert accessor.context.level == manifests_data[manifests_data_index]["level"]
|
68
|
+
assert (
|
69
|
+
accessor.manifest.sst_files
|
70
|
+
== manifests_data[manifests_data_index]["sst_files"]
|
71
|
+
)
|
File without changes
|
File without changes
|
@@ -0,0 +1,162 @@
|
|
1
|
+
import pytest
|
2
|
+
import pyarrow as pa
|
3
|
+
import pyarrow.parquet as pq
|
4
|
+
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import (
|
7
|
+
RangeShard,
|
8
|
+
RangeShardingStrategy,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
|
13
|
+
def sample_numeric_dataset(tmp_path):
|
14
|
+
"""
|
15
|
+
Creates a small Parquet file with integer-based min/max keys and
|
16
|
+
initializes a Dataset from it. Merge key is 'id' with values [1,2,3].
|
17
|
+
So min_key=1, max_key=3.
|
18
|
+
"""
|
19
|
+
data = {
|
20
|
+
"id": [1, 2, 3],
|
21
|
+
"name": ["Alice", "Bob", "Charlie"],
|
22
|
+
"age": [25, 30, 35],
|
23
|
+
}
|
24
|
+
table = pa.Table.from_pydict(data)
|
25
|
+
parquet_file = tmp_path / "numeric_data.parquet"
|
26
|
+
pq.write_table(table, parquet_file)
|
27
|
+
|
28
|
+
ds = Dataset.from_parquet(
|
29
|
+
name="numeric_dataset",
|
30
|
+
file_uri=str(parquet_file),
|
31
|
+
metadata_uri=tmp_path,
|
32
|
+
merge_keys="id",
|
33
|
+
)
|
34
|
+
return ds
|
35
|
+
|
36
|
+
|
37
|
+
@pytest.fixture
|
38
|
+
def sample_string_dataset(tmp_path):
|
39
|
+
"""
|
40
|
+
Creates a small Parquet file with a string-based merge key ('name')
|
41
|
+
and initializes a Dataset from it. Merge key has values
|
42
|
+
['Alice', 'Bob', 'Charlie'] => min_key='Alice', max_key='Charlie'.
|
43
|
+
"""
|
44
|
+
data = {
|
45
|
+
"name": ["Alice", "Charlie", "Bob"], # random order
|
46
|
+
"value": [100, 200, 150],
|
47
|
+
}
|
48
|
+
table = pa.Table.from_pydict(data)
|
49
|
+
parquet_file = tmp_path / "string_data.parquet"
|
50
|
+
pq.write_table(table, parquet_file)
|
51
|
+
|
52
|
+
ds = Dataset.from_parquet(
|
53
|
+
name="string_dataset",
|
54
|
+
file_uri=str(parquet_file),
|
55
|
+
metadata_uri=tmp_path,
|
56
|
+
merge_keys="name",
|
57
|
+
)
|
58
|
+
return ds
|
59
|
+
|
60
|
+
|
61
|
+
def test_shards(sample_numeric_dataset, sample_string_dataset):
|
62
|
+
shards = sample_numeric_dataset.shards(num_shards=2)
|
63
|
+
|
64
|
+
num_shards = len(list(shards))
|
65
|
+
assert num_shards == 2
|
66
|
+
|
67
|
+
shard = shards[0]
|
68
|
+
records = list(sample_numeric_dataset.scan(shard=shard).to_pydict())
|
69
|
+
num_records = len(records)
|
70
|
+
assert num_records == 2
|
71
|
+
|
72
|
+
assert records[0]["id"] == 1
|
73
|
+
assert records[0]["name"] == "Alice"
|
74
|
+
|
75
|
+
assert records[1]["id"] == 2
|
76
|
+
assert records[1]["name"] == "Bob"
|
77
|
+
|
78
|
+
|
79
|
+
def test_range_shard_repr():
|
80
|
+
shard = RangeShard(min_key=5, max_key=15)
|
81
|
+
assert repr(shard) == "Shard(type=range, min_key=5, max_key=15)"
|
82
|
+
|
83
|
+
|
84
|
+
def test_range_shard_split_integers():
|
85
|
+
shards = RangeShard.split(global_min=1, global_max=10, num_shards=2)
|
86
|
+
assert len(shards) == 2
|
87
|
+
|
88
|
+
assert shards[0].min_key == 1
|
89
|
+
assert shards[0].max_key == 5
|
90
|
+
assert shards[1].min_key == 6
|
91
|
+
assert shards[1].max_key == 10
|
92
|
+
|
93
|
+
|
94
|
+
def test_range_shard_split_integers_single_shard():
|
95
|
+
shards = RangeShard.split(global_min=1, global_max=10, num_shards=1)
|
96
|
+
assert len(shards) == 1
|
97
|
+
assert shards[0].min_key == 1
|
98
|
+
assert shards[0].max_key == 10
|
99
|
+
|
100
|
+
|
101
|
+
def test_range_shard_split_integers_same_value():
|
102
|
+
shards = RangeShard.split(global_min=5, global_max=5, num_shards=3)
|
103
|
+
assert len(shards) == 1
|
104
|
+
|
105
|
+
|
106
|
+
def test_range_sharding_strategy_integers(sample_numeric_dataset):
|
107
|
+
strategy = RangeShardingStrategy()
|
108
|
+
shards = list(
|
109
|
+
strategy.shards(num_shards=2, metastore=sample_numeric_dataset._metastore)
|
110
|
+
)
|
111
|
+
|
112
|
+
assert len(shards) == 2, "Expected 2 shards for dataset with keys [1,2,3]"
|
113
|
+
|
114
|
+
shard1, shard2 = shards
|
115
|
+
assert isinstance(shard1, RangeShard)
|
116
|
+
assert isinstance(shard2, RangeShard)
|
117
|
+
assert shard1.min_key == 1
|
118
|
+
assert shard1.max_key == 2
|
119
|
+
assert shard2.min_key == 3
|
120
|
+
assert shard2.max_key == 3
|
121
|
+
|
122
|
+
|
123
|
+
def test_range_sharding_strategy_integers_single_shard(sample_numeric_dataset):
|
124
|
+
strategy = RangeShardingStrategy()
|
125
|
+
shards = list(
|
126
|
+
strategy.shards(num_shards=1, metastore=sample_numeric_dataset._metastore)
|
127
|
+
)
|
128
|
+
assert len(shards) == 1
|
129
|
+
shard = shards[0]
|
130
|
+
assert shard.min_key == 1
|
131
|
+
assert shard.max_key == 3
|
132
|
+
|
133
|
+
|
134
|
+
def test_range_sharding_strategy_strings(sample_string_dataset):
|
135
|
+
strategy = RangeShardingStrategy()
|
136
|
+
shards = list(
|
137
|
+
strategy.shards(num_shards=2, metastore=sample_string_dataset._metastore)
|
138
|
+
)
|
139
|
+
|
140
|
+
assert len(shards) == 2, "Expected 2 shards for string-based dataset"
|
141
|
+
shard1, shard2 = shards
|
142
|
+
assert isinstance(shard1, RangeShard)
|
143
|
+
assert isinstance(shard2, RangeShard)
|
144
|
+
|
145
|
+
assert shard1.min_key == "Alice"
|
146
|
+
assert shard1.max_key < "Charlie"
|
147
|
+
|
148
|
+
assert shard2.min_key == shard1.max_key
|
149
|
+
assert shard2.max_key == "Charlie"
|
150
|
+
|
151
|
+
|
152
|
+
def test_range_sharding_strategy_strings_single_shard(sample_string_dataset):
|
153
|
+
strategy = RangeShardingStrategy()
|
154
|
+
shards = list(
|
155
|
+
strategy.shards(num_shards=1, metastore=sample_string_dataset._metastore)
|
156
|
+
)
|
157
|
+
|
158
|
+
assert len(shards) == 1
|
159
|
+
|
160
|
+
shard = shards[0]
|
161
|
+
assert shard.min_key == "Alice"
|
162
|
+
assert shard.max_key == "Charlie"
|
@@ -3,9 +3,11 @@ from deltacat.utils.metafile_locator import _find_partition_path
|
|
3
3
|
import pytest
|
4
4
|
|
5
5
|
import pyarrow as pa
|
6
|
-
from deltacat.storage.rivulet import Schema, Field, Datatype
|
7
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
8
|
-
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
9
11
|
|
10
12
|
|
11
13
|
@pytest.fixture
|
@@ -2,11 +2,11 @@ import os
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
6
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
7
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
8
|
-
from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
-
from deltacat.storage.rivulet import Schema, Field
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
7
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
8
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
10
10
|
import pyarrow as pa
|
11
11
|
import pyarrow.parquet
|
12
12
|
|
@@ -2,16 +2,16 @@ from typing import List, FrozenSet, Dict
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.metastore.delta import DeltaContext
|
6
|
-
from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
-
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
+
from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
|
8
8
|
BlockIntervalTree,
|
9
9
|
BlockGroup,
|
10
10
|
OrderedBlockGroups,
|
11
11
|
Block,
|
12
12
|
)
|
13
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
14
|
-
from deltacat.storage.rivulet import Schema
|
13
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
14
|
+
from deltacat.experimental.storage.rivulet import Schema
|
15
15
|
|
16
16
|
|
17
17
|
@pytest.fixture
|
@@ -3,12 +3,14 @@ import os
|
|
3
3
|
|
4
4
|
from pyarrow import RecordBatch, Table
|
5
5
|
|
6
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
7
|
-
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
|
8
|
-
|
9
|
-
|
10
|
-
from deltacat.storage.rivulet.
|
11
|
-
|
6
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
7
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
8
|
+
QueryExpression,
|
9
|
+
)
|
10
|
+
from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
|
11
|
+
|
12
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
13
|
+
from deltacat.experimental.storage.rivulet import Schema
|
12
14
|
from typing import Dict, List, Generator, Set
|
13
15
|
|
14
16
|
FIXTURE_ROW_COUNT = 10000
|
File without changes
|
deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py
RENAMED
@@ -7,23 +7,25 @@ import msgpack
|
|
7
7
|
import pytest
|
8
8
|
from pyarrow import RecordBatch
|
9
9
|
|
10
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
11
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
12
|
-
from deltacat.storage.rivulet.metastore.delta import (
|
10
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
11
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
12
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import (
|
13
13
|
ManifestIO,
|
14
14
|
TreeLevel,
|
15
15
|
DeltacatManifestIO,
|
16
16
|
)
|
17
17
|
|
18
|
-
from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
19
|
-
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
|
20
|
-
|
21
|
-
|
18
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
19
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
20
|
+
QueryExpression,
|
21
|
+
)
|
22
|
+
from deltacat.experimental.storage.rivulet import Schema
|
23
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
22
24
|
MemtableDatasetWriter,
|
23
25
|
)
|
24
26
|
|
25
|
-
from deltacat.tests.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
|
26
|
-
from deltacat.tests.storage.rivulet.test_utils import (
|
27
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
|
28
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import (
|
27
29
|
write_mvp_table,
|
28
30
|
compare_mvp_table_to_scan_results,
|
29
31
|
mvp_table_to_record_batches,
|
@@ -2,8 +2,8 @@ import pytest
|
|
2
2
|
import shutil
|
3
3
|
import tempfile
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
6
|
-
from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
|
5
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
6
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
7
7
|
MemtableDatasetWriter,
|
8
8
|
)
|
9
9
|
from ..test_utils import (
|
deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py
RENAMED
@@ -1,12 +1,12 @@
|
|
1
1
|
import pytest
|
2
2
|
|
3
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
4
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
5
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
6
|
-
from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
|
7
|
-
from deltacat.storage.rivulet import Schema
|
8
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
9
|
-
from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
|
3
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
4
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
5
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
9
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
10
10
|
MemtableDatasetWriter,
|
11
11
|
)
|
12
12
|
|
@@ -7,7 +7,9 @@ def test_sharding_strategy_from_string_range():
|
|
7
7
|
"""
|
8
8
|
Tests that from_string('range') returns an instance of RangeShardingStrategy.
|
9
9
|
"""
|
10
|
-
from deltacat.storage.rivulet.shard.range_shard import RangeShardingStrategy
|
10
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import (
|
11
|
+
RangeShardingStrategy,
|
12
|
+
)
|
11
13
|
|
12
14
|
strategy = ShardingStrategy.from_string("range")
|
13
15
|
assert isinstance(strategy, RangeShardingStrategy)
|
deltacat/types/media.py
CHANGED
@@ -148,9 +148,9 @@ class DatastoreType(str, Enum):
|
|
148
148
|
writer for that data store. Note that, although some overlap exists between
|
149
149
|
enum values here and in :class:`deltacat.types.media.ContentType`, each
|
150
150
|
enum serves a different purpose. The purpose of
|
151
|
-
:class:`deltacat.types.media.ContentType` is to resolve
|
152
|
-
|
153
|
-
|
151
|
+
:class:`deltacat.types.media.ContentType` is to resolve a file's MIME type,
|
152
|
+
and may be used together with datastores that support storing different
|
153
|
+
file types to describe the specific file type read/written from/to that
|
154
154
|
datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
|
155
155
|
"""
|
156
156
|
|