deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,162 @@
|
|
1
|
+
import pytest
|
2
|
+
import pyarrow as pa
|
3
|
+
import pyarrow.parquet as pq
|
4
|
+
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import (
|
7
|
+
RangeShard,
|
8
|
+
RangeShardingStrategy,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
|
13
|
+
def sample_numeric_dataset(tmp_path):
|
14
|
+
"""
|
15
|
+
Creates a small Parquet file with integer-based min/max keys and
|
16
|
+
initializes a Dataset from it. Merge key is 'id' with values [1,2,3].
|
17
|
+
So min_key=1, max_key=3.
|
18
|
+
"""
|
19
|
+
data = {
|
20
|
+
"id": [1, 2, 3],
|
21
|
+
"name": ["Alice", "Bob", "Charlie"],
|
22
|
+
"age": [25, 30, 35],
|
23
|
+
}
|
24
|
+
table = pa.Table.from_pydict(data)
|
25
|
+
parquet_file = tmp_path / "numeric_data.parquet"
|
26
|
+
pq.write_table(table, parquet_file)
|
27
|
+
|
28
|
+
ds = Dataset.from_parquet(
|
29
|
+
name="numeric_dataset",
|
30
|
+
file_uri=str(parquet_file),
|
31
|
+
metadata_uri=tmp_path,
|
32
|
+
merge_keys="id",
|
33
|
+
)
|
34
|
+
return ds
|
35
|
+
|
36
|
+
|
37
|
+
@pytest.fixture
|
38
|
+
def sample_string_dataset(tmp_path):
|
39
|
+
"""
|
40
|
+
Creates a small Parquet file with a string-based merge key ('name')
|
41
|
+
and initializes a Dataset from it. Merge key has values
|
42
|
+
['Alice', 'Bob', 'Charlie'] => min_key='Alice', max_key='Charlie'.
|
43
|
+
"""
|
44
|
+
data = {
|
45
|
+
"name": ["Alice", "Charlie", "Bob"], # random order
|
46
|
+
"value": [100, 200, 150],
|
47
|
+
}
|
48
|
+
table = pa.Table.from_pydict(data)
|
49
|
+
parquet_file = tmp_path / "string_data.parquet"
|
50
|
+
pq.write_table(table, parquet_file)
|
51
|
+
|
52
|
+
ds = Dataset.from_parquet(
|
53
|
+
name="string_dataset",
|
54
|
+
file_uri=str(parquet_file),
|
55
|
+
metadata_uri=tmp_path,
|
56
|
+
merge_keys="name",
|
57
|
+
)
|
58
|
+
return ds
|
59
|
+
|
60
|
+
|
61
|
+
def test_shards(sample_numeric_dataset, sample_string_dataset):
|
62
|
+
shards = sample_numeric_dataset.shards(num_shards=2)
|
63
|
+
|
64
|
+
num_shards = len(list(shards))
|
65
|
+
assert num_shards == 2
|
66
|
+
|
67
|
+
shard = shards[0]
|
68
|
+
records = list(sample_numeric_dataset.scan(shard=shard).to_pydict())
|
69
|
+
num_records = len(records)
|
70
|
+
assert num_records == 2
|
71
|
+
|
72
|
+
assert records[0]["id"] == 1
|
73
|
+
assert records[0]["name"] == "Alice"
|
74
|
+
|
75
|
+
assert records[1]["id"] == 2
|
76
|
+
assert records[1]["name"] == "Bob"
|
77
|
+
|
78
|
+
|
79
|
+
def test_range_shard_repr():
|
80
|
+
shard = RangeShard(min_key=5, max_key=15)
|
81
|
+
assert repr(shard) == "Shard(type=range, min_key=5, max_key=15)"
|
82
|
+
|
83
|
+
|
84
|
+
def test_range_shard_split_integers():
|
85
|
+
shards = RangeShard.split(global_min=1, global_max=10, num_shards=2)
|
86
|
+
assert len(shards) == 2
|
87
|
+
|
88
|
+
assert shards[0].min_key == 1
|
89
|
+
assert shards[0].max_key == 5
|
90
|
+
assert shards[1].min_key == 6
|
91
|
+
assert shards[1].max_key == 10
|
92
|
+
|
93
|
+
|
94
|
+
def test_range_shard_split_integers_single_shard():
|
95
|
+
shards = RangeShard.split(global_min=1, global_max=10, num_shards=1)
|
96
|
+
assert len(shards) == 1
|
97
|
+
assert shards[0].min_key == 1
|
98
|
+
assert shards[0].max_key == 10
|
99
|
+
|
100
|
+
|
101
|
+
def test_range_shard_split_integers_same_value():
|
102
|
+
shards = RangeShard.split(global_min=5, global_max=5, num_shards=3)
|
103
|
+
assert len(shards) == 1
|
104
|
+
|
105
|
+
|
106
|
+
def test_range_sharding_strategy_integers(sample_numeric_dataset):
|
107
|
+
strategy = RangeShardingStrategy()
|
108
|
+
shards = list(
|
109
|
+
strategy.shards(num_shards=2, metastore=sample_numeric_dataset._metastore)
|
110
|
+
)
|
111
|
+
|
112
|
+
assert len(shards) == 2, "Expected 2 shards for dataset with keys [1,2,3]"
|
113
|
+
|
114
|
+
shard1, shard2 = shards
|
115
|
+
assert isinstance(shard1, RangeShard)
|
116
|
+
assert isinstance(shard2, RangeShard)
|
117
|
+
assert shard1.min_key == 1
|
118
|
+
assert shard1.max_key == 2
|
119
|
+
assert shard2.min_key == 3
|
120
|
+
assert shard2.max_key == 3
|
121
|
+
|
122
|
+
|
123
|
+
def test_range_sharding_strategy_integers_single_shard(sample_numeric_dataset):
|
124
|
+
strategy = RangeShardingStrategy()
|
125
|
+
shards = list(
|
126
|
+
strategy.shards(num_shards=1, metastore=sample_numeric_dataset._metastore)
|
127
|
+
)
|
128
|
+
assert len(shards) == 1
|
129
|
+
shard = shards[0]
|
130
|
+
assert shard.min_key == 1
|
131
|
+
assert shard.max_key == 3
|
132
|
+
|
133
|
+
|
134
|
+
def test_range_sharding_strategy_strings(sample_string_dataset):
|
135
|
+
strategy = RangeShardingStrategy()
|
136
|
+
shards = list(
|
137
|
+
strategy.shards(num_shards=2, metastore=sample_string_dataset._metastore)
|
138
|
+
)
|
139
|
+
|
140
|
+
assert len(shards) == 2, "Expected 2 shards for string-based dataset"
|
141
|
+
shard1, shard2 = shards
|
142
|
+
assert isinstance(shard1, RangeShard)
|
143
|
+
assert isinstance(shard2, RangeShard)
|
144
|
+
|
145
|
+
assert shard1.min_key == "Alice"
|
146
|
+
assert shard1.max_key < "Charlie"
|
147
|
+
|
148
|
+
assert shard2.min_key == shard1.max_key
|
149
|
+
assert shard2.max_key == "Charlie"
|
150
|
+
|
151
|
+
|
152
|
+
def test_range_sharding_strategy_strings_single_shard(sample_string_dataset):
|
153
|
+
strategy = RangeShardingStrategy()
|
154
|
+
shards = list(
|
155
|
+
strategy.shards(num_shards=1, metastore=sample_string_dataset._metastore)
|
156
|
+
)
|
157
|
+
|
158
|
+
assert len(shards) == 1
|
159
|
+
|
160
|
+
shard = shards[0]
|
161
|
+
assert shard.min_key == "Alice"
|
162
|
+
assert shard.max_key == "Charlie"
|
@@ -3,9 +3,11 @@ from deltacat.utils.metafile_locator import _find_partition_path
|
|
3
3
|
import pytest
|
4
4
|
|
5
5
|
import pyarrow as pa
|
6
|
-
from deltacat.storage.rivulet import Schema, Field, Datatype
|
7
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
8
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
6
|
+
from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
9
11
|
|
10
12
|
|
11
13
|
@pytest.fixture
|
@@ -57,7 +59,7 @@ def test_dataset_creation_metadata_structure(tmp_path):
|
|
57
59
|
dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
|
58
60
|
|
59
61
|
assert dataset._metadata_folder.startswith(".riv-meta")
|
60
|
-
assert dataset._namespace == "
|
62
|
+
assert dataset._namespace == "default"
|
61
63
|
assert dataset.dataset_name == "test_dataset"
|
62
64
|
assert dataset._metadata_path == str(tmp_path / ".riv-meta-test_dataset")
|
63
65
|
|
@@ -2,11 +2,11 @@ import os
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from deltacat import Dataset
|
6
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
7
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
8
|
-
from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
-
from deltacat.storage.rivulet import Schema, Field
|
5
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
6
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
7
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
8
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
+
from deltacat.experimental.storage.rivulet import Schema, Field
|
10
10
|
import pyarrow as pa
|
11
11
|
import pyarrow.parquet
|
12
12
|
|
@@ -2,16 +2,16 @@ from typing import List, FrozenSet, Dict
|
|
2
2
|
|
3
3
|
import pytest
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.metastore.delta import DeltaContext
|
6
|
-
from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
-
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
|
5
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
+
from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
|
8
8
|
BlockIntervalTree,
|
9
9
|
BlockGroup,
|
10
10
|
OrderedBlockGroups,
|
11
11
|
Block,
|
12
12
|
)
|
13
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
14
|
-
from deltacat.storage.rivulet import Schema
|
13
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
14
|
+
from deltacat.experimental.storage.rivulet import Schema
|
15
15
|
|
16
16
|
|
17
17
|
@pytest.fixture
|
@@ -3,12 +3,14 @@ import os
|
|
3
3
|
|
4
4
|
from pyarrow import RecordBatch, Table
|
5
5
|
|
6
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
7
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
8
|
-
|
9
|
-
|
10
|
-
from deltacat.storage.rivulet.
|
11
|
-
|
6
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
7
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
8
|
+
QueryExpression,
|
9
|
+
)
|
10
|
+
from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
|
11
|
+
|
12
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
13
|
+
from deltacat.experimental.storage.rivulet import Schema
|
12
14
|
from typing import Dict, List, Generator, Set
|
13
15
|
|
14
16
|
FIXTURE_ROW_COUNT = 10000
|
File without changes
|
deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py
RENAMED
@@ -7,23 +7,25 @@ import msgpack
|
|
7
7
|
import pytest
|
8
8
|
from pyarrow import RecordBatch
|
9
9
|
|
10
|
-
from deltacat.storage.rivulet.dataset import Dataset
|
11
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
12
|
-
from deltacat.storage.rivulet.metastore.delta import (
|
10
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
11
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
12
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import (
|
13
13
|
ManifestIO,
|
14
14
|
TreeLevel,
|
15
15
|
DeltacatManifestIO,
|
16
16
|
)
|
17
17
|
|
18
|
-
from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
19
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
20
|
-
|
21
|
-
|
18
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
19
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
20
|
+
QueryExpression,
|
21
|
+
)
|
22
|
+
from deltacat.experimental.storage.rivulet import Schema
|
23
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
22
24
|
MemtableDatasetWriter,
|
23
25
|
)
|
24
26
|
|
25
|
-
from deltacat.tests.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
|
26
|
-
from deltacat.tests.storage.rivulet.test_utils import (
|
27
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
|
28
|
+
from deltacat.tests.experimental.storage.rivulet.test_utils import (
|
27
29
|
write_mvp_table,
|
28
30
|
compare_mvp_table_to_scan_results,
|
29
31
|
mvp_table_to_record_batches,
|
@@ -2,8 +2,8 @@ import pytest
|
|
2
2
|
import shutil
|
3
3
|
import tempfile
|
4
4
|
|
5
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
6
|
-
from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
|
5
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
6
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
7
7
|
MemtableDatasetWriter,
|
8
8
|
)
|
9
9
|
from ..test_utils import (
|
deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py
RENAMED
@@ -1,12 +1,12 @@
|
|
1
1
|
import pytest
|
2
2
|
|
3
|
-
from deltacat import Dataset
|
4
|
-
from deltacat.storage.rivulet.fs.file_provider import FileProvider
|
5
|
-
from deltacat.storage.rivulet.fs.file_store import FileStore
|
6
|
-
from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
|
7
|
-
from deltacat.storage.rivulet import Schema
|
8
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
9
|
-
from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
|
3
|
+
from deltacat.experimental.storage.rivulet.dataset import Dataset
|
4
|
+
from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
|
5
|
+
from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
|
6
|
+
from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
|
7
|
+
from deltacat.experimental.storage.rivulet import Schema
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
9
|
+
from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
|
10
10
|
MemtableDatasetWriter,
|
11
11
|
)
|
12
12
|
|
@@ -0,0 +1,327 @@
|
|
1
|
+
import pytest
|
2
|
+
import pyarrow as pa
|
3
|
+
|
4
|
+
from deltacat.storage.model.expression import (
|
5
|
+
Reference,
|
6
|
+
Literal,
|
7
|
+
Equal,
|
8
|
+
NotEqual,
|
9
|
+
GreaterThan,
|
10
|
+
LessThan,
|
11
|
+
GreaterThanEqual,
|
12
|
+
LessThanEqual,
|
13
|
+
And,
|
14
|
+
Or,
|
15
|
+
Not,
|
16
|
+
In,
|
17
|
+
Between,
|
18
|
+
Like,
|
19
|
+
IsNull,
|
20
|
+
)
|
21
|
+
from deltacat.storage.model.expression.visitor import DisplayVisitor, ExpressionVisitor
|
22
|
+
|
23
|
+
|
24
|
+
@pytest.fixture
|
25
|
+
def field_ref():
|
26
|
+
return Reference("field1")
|
27
|
+
|
28
|
+
|
29
|
+
@pytest.fixture
|
30
|
+
def field_ref2():
|
31
|
+
return Reference("field2")
|
32
|
+
|
33
|
+
|
34
|
+
@pytest.fixture
|
35
|
+
def literal_int():
|
36
|
+
return Literal(pa.scalar(42))
|
37
|
+
|
38
|
+
|
39
|
+
@pytest.fixture
|
40
|
+
def literal_str():
|
41
|
+
return Literal(pa.scalar("test"))
|
42
|
+
|
43
|
+
|
44
|
+
@pytest.fixture
|
45
|
+
def display_visitor():
|
46
|
+
return DisplayVisitor()
|
47
|
+
|
48
|
+
|
49
|
+
class TestExpressionLibrary:
|
50
|
+
"""Test suite for the Deltacat expression library."""
|
51
|
+
|
52
|
+
def test_reference_creation(self):
|
53
|
+
ref = Reference("field1")
|
54
|
+
assert ref.field == "field1"
|
55
|
+
assert ref.index is None
|
56
|
+
|
57
|
+
def test_reference_with_index(self):
|
58
|
+
ref = Reference("field1", 0)
|
59
|
+
assert ref.field == "field1"
|
60
|
+
assert ref.index == 0
|
61
|
+
|
62
|
+
def test_literal_creation(self):
|
63
|
+
lit = Literal(pa.scalar(42))
|
64
|
+
assert lit.value.as_py() == 42
|
65
|
+
|
66
|
+
# Test the factory methods (.of)
|
67
|
+
def test_factory_methods(self):
|
68
|
+
# Reference.of
|
69
|
+
ref = Reference.of("field1")
|
70
|
+
assert ref.field == "field1"
|
71
|
+
|
72
|
+
# Literal.of
|
73
|
+
lit = Literal.of(42)
|
74
|
+
assert lit.value.as_py() == 42
|
75
|
+
|
76
|
+
# Equal.of with mixed types
|
77
|
+
eq = Equal.of("field1", 42)
|
78
|
+
assert isinstance(eq.left, Literal)
|
79
|
+
assert isinstance(eq.right, Literal)
|
80
|
+
assert eq.left.value.as_py() == "field1"
|
81
|
+
assert eq.right.value.as_py() == 42
|
82
|
+
|
83
|
+
# Not.of
|
84
|
+
not_expr = Not.of(Equal.of("field1", 42))
|
85
|
+
assert isinstance(not_expr.operand, Equal)
|
86
|
+
|
87
|
+
# In.of
|
88
|
+
in_expr = In.of("field1", [1, 2, 3])
|
89
|
+
assert isinstance(in_expr.value, Literal)
|
90
|
+
assert len(in_expr.values) == 3
|
91
|
+
assert all(isinstance(v, Literal) for v in in_expr.values)
|
92
|
+
|
93
|
+
# Between.of
|
94
|
+
between_expr = Between.of("field1", 10, 20)
|
95
|
+
assert isinstance(between_expr.value, Literal)
|
96
|
+
assert between_expr.lower.value.as_py() == 10
|
97
|
+
assert between_expr.upper.value.as_py() == 20
|
98
|
+
|
99
|
+
# Like.of
|
100
|
+
like_expr = Like.of("field1", "%test%")
|
101
|
+
assert isinstance(like_expr.value, Literal)
|
102
|
+
assert like_expr.pattern.value.as_py() == "%test%"
|
103
|
+
|
104
|
+
# Test reference comparison helper methods
|
105
|
+
def test_reference_comparison_helpers(self, field_ref):
|
106
|
+
# Test eq, ne, gt, lt, ge, le methods
|
107
|
+
eq_expr = field_ref.eq(42)
|
108
|
+
assert isinstance(eq_expr, Equal)
|
109
|
+
assert eq_expr.left == field_ref
|
110
|
+
assert eq_expr.right.value.as_py() == 42
|
111
|
+
|
112
|
+
ne_expr = field_ref.ne(42)
|
113
|
+
assert isinstance(ne_expr, NotEqual)
|
114
|
+
|
115
|
+
gt_expr = field_ref.gt(42)
|
116
|
+
assert isinstance(gt_expr, GreaterThan)
|
117
|
+
|
118
|
+
lt_expr = field_ref.lt(42)
|
119
|
+
assert isinstance(lt_expr, LessThan)
|
120
|
+
|
121
|
+
ge_expr = field_ref.ge(42)
|
122
|
+
assert isinstance(ge_expr, GreaterThanEqual)
|
123
|
+
|
124
|
+
le_expr = field_ref.le(42)
|
125
|
+
assert isinstance(le_expr, LessThanEqual)
|
126
|
+
|
127
|
+
# Test reference special operation helpers
|
128
|
+
def test_reference_special_helpers(self, field_ref):
|
129
|
+
# Test is_null, in_, between, like methods
|
130
|
+
is_null_expr = field_ref.is_null()
|
131
|
+
assert isinstance(is_null_expr, IsNull)
|
132
|
+
assert is_null_expr.operand == field_ref
|
133
|
+
|
134
|
+
in_expr = field_ref.in_([1, 2, 3])
|
135
|
+
assert isinstance(in_expr, In)
|
136
|
+
assert in_expr.value == field_ref
|
137
|
+
assert len(in_expr.values) == 3
|
138
|
+
assert in_expr.values[0].value.as_py() == 1
|
139
|
+
|
140
|
+
between_expr = field_ref.between(10, 20)
|
141
|
+
assert isinstance(between_expr, Between)
|
142
|
+
assert between_expr.value == field_ref
|
143
|
+
assert between_expr.lower.value.as_py() == 10
|
144
|
+
assert between_expr.upper.value.as_py() == 20
|
145
|
+
|
146
|
+
like_expr = field_ref.like("%test%")
|
147
|
+
assert isinstance(like_expr, Like)
|
148
|
+
assert like_expr.value == field_ref
|
149
|
+
assert like_expr.pattern.value.as_py() == "%test%"
|
150
|
+
|
151
|
+
# Test boolean expression helper methods
|
152
|
+
def test_boolean_expression_helpers(self, field_ref):
|
153
|
+
# Test and_, or_, not_ methods
|
154
|
+
expr1 = field_ref.eq(42)
|
155
|
+
expr2 = field_ref.gt(10)
|
156
|
+
|
157
|
+
and_expr = expr1.and_(expr2)
|
158
|
+
assert isinstance(and_expr, And)
|
159
|
+
assert and_expr.left == expr1
|
160
|
+
assert and_expr.right == expr2
|
161
|
+
|
162
|
+
or_expr = expr1.or_(expr2)
|
163
|
+
assert isinstance(or_expr, Or)
|
164
|
+
assert or_expr.left == expr1
|
165
|
+
assert or_expr.right == expr2
|
166
|
+
|
167
|
+
not_expr = expr1.not_()
|
168
|
+
assert isinstance(not_expr, Not)
|
169
|
+
assert not_expr.operand == expr1
|
170
|
+
|
171
|
+
# Test building complex expressions
|
172
|
+
def test_complex_expression_building(self, field_ref, field_ref2):
|
173
|
+
# Test building more complex expressions using method chaining
|
174
|
+
expr = field_ref.eq(42).and_(field_ref2.gt(10)).or_(field_ref.is_null()).not_()
|
175
|
+
|
176
|
+
assert isinstance(expr, Not)
|
177
|
+
assert isinstance(expr.operand, Or)
|
178
|
+
assert isinstance(expr.operand.left, And)
|
179
|
+
assert isinstance(expr.operand.right, IsNull)
|
180
|
+
|
181
|
+
# Test DisplayVisitor for different expression types
|
182
|
+
def test_reference_display(self, field_ref, display_visitor):
|
183
|
+
assert display_visitor.visit(field_ref) == "field1"
|
184
|
+
|
185
|
+
def test_literal_display(self, literal_int, literal_str, display_visitor):
|
186
|
+
assert display_visitor.visit(literal_int) == "42"
|
187
|
+
assert display_visitor.visit(literal_str) == "test"
|
188
|
+
|
189
|
+
def test_comparison_display(self, field_ref, literal_int, display_visitor):
|
190
|
+
assert display_visitor.visit(Equal(field_ref, literal_int)) == "field1 = 42"
|
191
|
+
assert display_visitor.visit(NotEqual(field_ref, literal_int)) == "field1 <> 42"
|
192
|
+
assert (
|
193
|
+
display_visitor.visit(GreaterThan(field_ref, literal_int)) == "field1 > 42"
|
194
|
+
)
|
195
|
+
assert display_visitor.visit(LessThan(field_ref, literal_int)) == "field1 < 42"
|
196
|
+
assert (
|
197
|
+
display_visitor.visit(GreaterThanEqual(field_ref, literal_int))
|
198
|
+
== "field1 >= 42"
|
199
|
+
)
|
200
|
+
assert (
|
201
|
+
display_visitor.visit(LessThanEqual(field_ref, literal_int))
|
202
|
+
== "field1 <= 42"
|
203
|
+
)
|
204
|
+
|
205
|
+
def test_logical_operator_display(self, field_ref, literal_int, display_visitor):
    """Check the expected display text for And, Or and Not nodes."""
    equals_node = Equal(field_ref, literal_int)
    greater_node = GreaterThan(field_ref, literal_int)

    # And / Or: both operands inside a single pair of parentheses.
    conjunction_text = display_visitor.visit(And(equals_node, greater_node))
    assert conjunction_text == "(field1 = 42 AND field1 > 42)"

    disjunction_text = display_visitor.visit(Or(equals_node, greater_node))
    assert disjunction_text == "(field1 = 42 OR field1 > 42)"

    # Not: keyword followed by the parenthesized operand.
    negation_text = display_visitor.visit(Not(equals_node))
    assert negation_text == "NOT (field1 = 42)"
|
218
|
+
|
219
|
+
def test_special_operator_display(self, field_ref, display_visitor):
    """Check the expected display text for IsNull, In, Between and Like nodes."""
    null_check = IsNull(field_ref)
    assert display_visitor.visit(null_check) == "(field1) IS NULL"

    members = [Literal(pa.scalar(number)) for number in (1, 2, 3)]
    membership = In(field_ref, members)
    assert display_visitor.visit(membership) == "field1 IN (1, 2, 3)"

    low_bound = Literal(pa.scalar(10))
    high_bound = Literal(pa.scalar(20))
    range_check = Between(field_ref, low_bound, high_bound)
    assert display_visitor.visit(range_check) == "field1 BETWEEN 10 AND 20"

    # The expected text shows the LIKE pattern without surrounding quotes.
    like_pattern = Literal(pa.scalar("%test%"))
    like_check = Like(field_ref, like_pattern)
    assert display_visitor.visit(like_check) == "field1 LIKE %test%"
|
234
|
+
|
235
|
+
def test_complex_expression_display(self, field_ref, field_ref2, display_visitor):
    """A chained eq/and_/or_/not_ expression is expected to render with nested parentheses."""
    conjunction = field_ref.eq(42).and_(field_ref2.gt(10))
    negated = conjunction.or_(field_ref.is_null()).not_()

    # Expected formatting of the whole tree by the display visitor.
    expected_text = "NOT (((field1 = 42 AND field2 > 10) OR (field1) IS NULL))"
    assert display_visitor.visit(negated) == expected_text
|
243
|
+
|
244
|
+
# Test BinaryExpression with_ methods
|
245
|
+
def test_binary_expression_with_methods(self, field_ref, field_ref2, literal_int):
    """Exercise with_left and with_right on an Equal expression."""
    original = Equal(field_ref, literal_int)

    # with_left: result is still an Equal, new left operand, same right operand.
    left_swapped = original.with_left(field_ref2)
    assert isinstance(left_swapped, Equal)
    assert left_swapped.left == field_ref2
    assert left_swapped.right == literal_int

    # with_right: called on the starting expression, so left is still field_ref.
    replacement = Literal(pa.scalar(100))
    right_swapped = original.with_right(replacement)
    assert right_swapped.left == field_ref
    assert right_swapped.right == replacement
|
259
|
+
|
260
|
+
# Test __str__ method which uses DisplayVisitor
|
261
|
+
def test_expression_str_method(self, field_ref, literal_int):
    """str() of an Equal node is expected to produce "field1 = 42"."""
    rendered = str(Equal(field_ref, literal_int))
    assert rendered == "field1 = 42"
|
264
|
+
|
265
|
+
# Test proper parenthesization in complex expressions
|
266
|
+
def test_nested_parentheses(self, field_ref, field_ref2, display_visitor):
    """An And nested as the left side of an Or is expected to keep its own parentheses."""
    # Target shape: (field1 = 1 AND field2 = 2) OR field2 = 3
    first_eq = Equal(field_ref, Literal(pa.scalar(1)))
    second_eq = Equal(field_ref2, Literal(pa.scalar(2)))
    third_eq = Equal(field_ref2, Literal(pa.scalar(3)))

    tree = Or(And(first_eq, second_eq), third_eq)

    rendered = display_visitor.visit(tree)
    assert rendered == "((field1 = 1 AND field2 = 2) OR field2 = 3)"
|
279
|
+
|
280
|
+
# Test Literal comparison methods
|
281
|
+
def test_literal_comparison_methods(self, literal_int):
    """Literal.eq / Literal.ne are expected to build Equal / NotEqual nodes.

    The raw Python argument should come back from the right operand via
    ``.value.as_py()``.
    """
    equal_node = literal_int.eq("test")
    assert isinstance(equal_node, Equal)
    assert equal_node.left == literal_int
    assert equal_node.right.value.as_py() == "test"

    not_equal_node = literal_int.ne("test")
    assert isinstance(not_equal_node, NotEqual)
    assert not_equal_node.left == literal_int
    assert not_equal_node.right.value.as_py() == "test"
|
291
|
+
|
292
|
+
# Test a custom ExpressionVisitor implementation
|
293
|
+
def test_custom_visitor(self, field_ref, literal_int):
    """Define a node-counting ExpressionVisitor subclass and check its totals."""

    class CountingVisitor(ExpressionVisitor[None, int]):
        """Visitor whose result is a count of expression nodes."""

        def visit_reference(self, expr, context=None):
            # A reference counts as one node.
            return 1

        def visit_literal(self, expr, context=None):
            # A literal counts as one node.
            return 1

        def visit_binary_expression(self, expr, left, right, context=None):
            # Both operand counts, plus one for the operator itself.
            return left + right + 1

        def visit_unary_expression(self, expr, operand, context=None):
            # Operand count, plus one for the operator itself.
            return operand + 1

        def visit_in(self, expr, context=None):
            # Tested value + every candidate value + the In operator.
            return len(expr.values) + 2

        def visit_between(self, expr, context=None):
            # value + lower + upper
            return 3

        def visit_like(self, expr, context=None):
            # value + pattern
            return 2

    counter = CountingVisitor()

    # Leaf expressions count as a single node each.
    assert counter.visit(field_ref) == 1
    assert counter.visit(literal_int) == 1
    # left + right + Equal
    assert counter.visit(Equal(field_ref, literal_int)) == 3

    # And over two comparisons: (1+1+1) + (1+1+1) + 1
    combined = field_ref.eq(42).and_(field_ref.gt(10))
    assert counter.visit(combined) == 7
|
@@ -7,7 +7,9 @@ def test_sharding_strategy_from_string_range():
|
|
7
7
|
"""
|
8
8
|
Tests that from_string('range') returns an instance of RangeShardingStrategy.
|
9
9
|
"""
|
10
|
-
from deltacat.storage.rivulet.shard.range_shard import
|
10
|
+
from deltacat.experimental.storage.rivulet.shard.range_shard import (
|
11
|
+
RangeShardingStrategy,
|
12
|
+
)
|
11
13
|
|
12
14
|
strategy = ShardingStrategy.from_string("range")
|
13
15
|
assert isinstance(strategy, RangeShardingStrategy)
|