deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +19 -15
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +18 -8
- deltacat/catalog/model/catalog.py +111 -73
- deltacat/catalog/model/properties.py +25 -22
- deltacat/compute/jobs/client.py +7 -5
- deltacat/constants.py +1 -2
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/model/shard.py +6 -2
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/types/media.py +3 -3
- deltacat/utils/daft.py +530 -4
- deltacat/utils/export.py +3 -1
- deltacat/utils/url.py +1 -1
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → examples/experimental}/__init__.py +0 -0
- /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
- /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,24 @@
|
|
1
1
|
import unittest
|
2
2
|
import pytest
|
3
|
-
import ray
|
4
3
|
import tempfile
|
5
4
|
import shutil
|
6
5
|
import uuid
|
7
6
|
from unittest import mock
|
8
7
|
import os
|
9
8
|
|
10
|
-
from deltacat.catalog import
|
11
|
-
|
12
|
-
|
13
|
-
from deltacat.catalog.model.catalog import (
|
9
|
+
from deltacat.catalog import (
|
10
|
+
CatalogProperties,
|
14
11
|
Catalog,
|
15
|
-
|
12
|
+
clear_catalogs,
|
16
13
|
get_catalog,
|
17
|
-
|
14
|
+
init,
|
18
15
|
is_initialized,
|
16
|
+
put_catalog,
|
19
17
|
)
|
20
|
-
from deltacat.catalog.iceberg
|
18
|
+
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
|
19
|
+
from pyiceberg.catalog import Catalog as PyIcebergCatalog
|
20
|
+
|
21
|
+
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
|
21
22
|
|
22
23
|
from pyiceberg.catalog import CatalogType
|
23
24
|
|
@@ -25,41 +26,19 @@ from pyiceberg.catalog import CatalogType
|
|
25
26
|
# Test module to mock a catalog implementation
|
26
27
|
class MockCatalogImpl:
|
27
28
|
@staticmethod
|
28
|
-
def initialize(*args, **kwargs):
|
29
|
+
def initialize(config, *args, **kwargs):
|
29
30
|
# Return some state that the catalog would normally maintain
|
30
|
-
return {
|
31
|
+
return {
|
32
|
+
"initialized": True,
|
33
|
+
"config": config,
|
34
|
+
"args": args,
|
35
|
+
"kwargs": kwargs,
|
36
|
+
}
|
31
37
|
|
32
38
|
|
33
39
|
@pytest.fixture(scope="function")
|
34
|
-
def
|
35
|
-
|
36
|
-
Setup and teardown for Ray environment for tests.
|
37
|
-
|
38
|
-
This will kill the actor all_catalogs, essentially wiping global state for catalogs
|
39
|
-
|
40
|
-
NOTE: tests using this fixture must be run serially. As of April 7 2025, the unit test suite had various
|
41
|
-
failures if run in parallel, in part because the state of all_catalogs in ray is shared across tests.
|
42
|
-
|
43
|
-
NOTE: when using this fixture, ensure you pass ray_init_args={"ignore_reinit_error": True} into all
|
44
|
-
functions which may re-initialize ray. This is because the production code checks the all_catalogs actor
|
45
|
-
in order to determine whether it needs to initialize Ray
|
46
|
-
"""
|
47
|
-
# Reset the global catalog_actor state before each test
|
48
|
-
import deltacat.catalog.model.catalog as catalog_module
|
49
|
-
|
50
|
-
# Initialize Ray if not already initialized
|
51
|
-
if not ray.is_initialized():
|
52
|
-
ray.init(ignore_reinit_error=True)
|
53
|
-
yield
|
54
|
-
|
55
|
-
# Clean up the actor if it exists
|
56
|
-
if catalog_module.all_catalogs is not None:
|
57
|
-
try:
|
58
|
-
ray.kill(catalog_module.all_catalogs)
|
59
|
-
except Exception:
|
60
|
-
pass
|
61
|
-
finally:
|
62
|
-
catalog_module.all_catalogs = None
|
40
|
+
def reset_catalogs():
|
41
|
+
clear_catalogs()
|
63
42
|
|
64
43
|
|
65
44
|
class TestCatalog(unittest.TestCase):
|
@@ -74,6 +53,7 @@ class TestCatalog(unittest.TestCase):
|
|
74
53
|
# Check that inner state was correctly initialized
|
75
54
|
# This just asserts that kwargs were plumbed through from Catalog constructor
|
76
55
|
self.assertTrue(catalog.inner["initialized"])
|
56
|
+
self.assertIsNone(catalog.inner["config"])
|
77
57
|
self.assertEqual(catalog.inner["args"], ())
|
78
58
|
self.assertEqual(catalog.inner["kwargs"], {})
|
79
59
|
|
@@ -81,16 +61,18 @@ class TestCatalog(unittest.TestCase):
|
|
81
61
|
"""Test the iceberg factory method correctly creates an Iceberg catalog."""
|
82
62
|
# Create a mock for the Iceberg catalog module
|
83
63
|
with mock.patch(
|
84
|
-
"deltacat.catalog.
|
64
|
+
"deltacat.experimental.catalog.iceberg.impl.IcebergCatalog"
|
85
65
|
) as mock_iceberg_catalog:
|
86
66
|
# Configure the mock to return a known value when initialize is called
|
87
67
|
mock_iceberg_catalog.initialize.return_value = {"iceberg": True}
|
88
68
|
|
89
69
|
# Create an Iceberg catalog config and invoke iceberg factory method
|
90
70
|
config = IcebergCatalogConfig(type=CatalogType.IN_MEMORY, properties={})
|
91
|
-
catalog =
|
71
|
+
catalog = IcebergCatalog.from_config(config)
|
92
72
|
|
93
73
|
# Check that the implementation is set to iceberg_catalog
|
74
|
+
print(f"catalog.impl: {catalog.impl}")
|
75
|
+
print(f"mock_iceberg_catalog: {mock_iceberg_catalog}")
|
94
76
|
self.assertEqual(catalog.impl, mock_iceberg_catalog)
|
95
77
|
# Check that the inner state is set to the output of initialize
|
96
78
|
self.assertEqual(catalog.inner, {"iceberg": True})
|
@@ -109,8 +91,7 @@ class TestCatalogsIntegration:
|
|
109
91
|
catalog = Catalog(impl=MockCatalogImpl)
|
110
92
|
init(
|
111
93
|
catalog,
|
112
|
-
|
113
|
-
**{"force_reinitialize": True},
|
94
|
+
force=True,
|
114
95
|
)
|
115
96
|
|
116
97
|
@classmethod
|
@@ -118,17 +99,13 @@ class TestCatalogsIntegration:
|
|
118
99
|
if cls.temp_dir and os.path.exists(cls.temp_dir):
|
119
100
|
shutil.rmtree(cls.temp_dir)
|
120
101
|
|
121
|
-
def test_init_single_catalog(self,
|
102
|
+
def test_init_single_catalog(self, reset_catalogs):
|
122
103
|
"""Test initializing a single catalog."""
|
123
104
|
|
124
105
|
catalog = Catalog(impl=MockCatalogImpl)
|
125
106
|
|
126
107
|
# Initialize with a single catalog and Ray init args including the namespace
|
127
|
-
init(
|
128
|
-
catalog,
|
129
|
-
ray_init_args={"ignore_reinit_error": True},
|
130
|
-
**{"force_reinitialize": True},
|
131
|
-
)
|
108
|
+
init(catalog, force=True)
|
132
109
|
|
133
110
|
assert is_initialized()
|
134
111
|
|
@@ -137,7 +114,7 @@ class TestCatalogsIntegration:
|
|
137
114
|
assert retrieved_catalog.impl == MockCatalogImpl
|
138
115
|
assert retrieved_catalog.inner["initialized"]
|
139
116
|
|
140
|
-
def test_init_multiple_catalogs(self,
|
117
|
+
def test_init_multiple_catalogs(self, reset_catalogs):
|
141
118
|
"""Test initializing multiple catalogs."""
|
142
119
|
# Create catalogs
|
143
120
|
catalog1 = Catalog(impl=MockCatalogImpl, id=1)
|
@@ -145,11 +122,7 @@ class TestCatalogsIntegration:
|
|
145
122
|
|
146
123
|
# Initialize with multiple catalogs and Ray init args including the namespace
|
147
124
|
catalogs_dict = {"catalog1": catalog1, "catalog2": catalog2}
|
148
|
-
init(
|
149
|
-
catalogs_dict,
|
150
|
-
ray_init_args={"ignore_reinit_error": True},
|
151
|
-
**{"force_reinitialize": True},
|
152
|
-
)
|
125
|
+
init(catalogs_dict, force=True)
|
153
126
|
|
154
127
|
assert is_initialized()
|
155
128
|
|
@@ -162,7 +135,7 @@ class TestCatalogsIntegration:
|
|
162
135
|
assert retrieved_catalog2.impl == MockCatalogImpl
|
163
136
|
assert retrieved_catalog2.inner["kwargs"]["id"] == 2
|
164
137
|
|
165
|
-
def test_init_with_default_catalog_name(self,
|
138
|
+
def test_init_with_default_catalog_name(self, reset_catalogs):
|
166
139
|
"""Test initializing with a specified default catalog name."""
|
167
140
|
# Create catalogs
|
168
141
|
catalog1 = Catalog(impl=MockCatalogImpl, id=1)
|
@@ -173,8 +146,7 @@ class TestCatalogsIntegration:
|
|
173
146
|
init(
|
174
147
|
catalogs_dict,
|
175
148
|
default="catalog2",
|
176
|
-
|
177
|
-
**{"force_reinitialize": True},
|
149
|
+
force=True,
|
178
150
|
)
|
179
151
|
|
180
152
|
# Get the default catalog and check it's catalog2
|
@@ -182,16 +154,12 @@ class TestCatalogsIntegration:
|
|
182
154
|
assert default_catalog.impl == MockCatalogImpl
|
183
155
|
assert default_catalog.inner["kwargs"]["id"] == 2
|
184
156
|
|
185
|
-
def test_put_catalog(self,
|
157
|
+
def test_put_catalog(self, reset_catalogs):
|
186
158
|
"""Test adding a catalog after initialization."""
|
187
159
|
# Initialize with a single catalog
|
188
160
|
catalog1 = Catalog(impl=MockCatalogImpl, id=1)
|
189
161
|
catalog2 = Catalog(impl=MockCatalogImpl, id=2)
|
190
|
-
init(
|
191
|
-
{"catalog1": catalog1},
|
192
|
-
ray_init_args={"ignore_reinit_error": True},
|
193
|
-
**{"force_reinitialize": True},
|
194
|
-
)
|
162
|
+
init({"catalog1": catalog1}, force=True)
|
195
163
|
|
196
164
|
# Add a second catalog
|
197
165
|
put_catalog("catalog2", catalog2)
|
@@ -203,21 +171,19 @@ class TestCatalogsIntegration:
|
|
203
171
|
retrieved_catalog2 = get_catalog("catalog2")
|
204
172
|
assert retrieved_catalog2.inner["kwargs"]["id"] == 2
|
205
173
|
|
206
|
-
def test_put_catalog_that_already_exists(self,
|
174
|
+
def test_put_catalog_that_already_exists(self, reset_catalogs):
|
207
175
|
catalog = Catalog(impl=MockCatalogImpl, id=1)
|
208
176
|
catalog2 = Catalog(impl=MockCatalogImpl, id=2)
|
209
177
|
put_catalog(
|
210
178
|
"test_catalog",
|
211
179
|
catalog,
|
212
180
|
id=1,
|
213
|
-
ray_init_args={"ignore_reinit_error": True},
|
214
181
|
)
|
215
182
|
|
216
183
|
# Try to add another catalog with the same name. Should not error
|
217
184
|
put_catalog(
|
218
185
|
"test_catalog",
|
219
186
|
catalog2,
|
220
|
-
ray_init_args={"ignore_reinit_error": True},
|
221
187
|
)
|
222
188
|
|
223
189
|
retrieved_catalog = get_catalog("test_catalog")
|
@@ -228,40 +194,31 @@ class TestCatalogsIntegration:
|
|
228
194
|
put_catalog(
|
229
195
|
"test_catalog",
|
230
196
|
catalog,
|
231
|
-
ray_init_args={"ignore_reinit_error": True},
|
232
197
|
fail_if_exists=True,
|
233
198
|
)
|
234
199
|
|
235
|
-
def test_get_catalog_nonexistent(self,
|
200
|
+
def test_get_catalog_nonexistent(self, reset_catalogs):
|
236
201
|
"""Test that trying to get a nonexistent catalog raises an error."""
|
237
202
|
# Initialize with a catalog
|
238
203
|
catalog = Catalog(impl=MockCatalogImpl)
|
239
|
-
init(
|
240
|
-
{"test_catalog": catalog},
|
241
|
-
ray_init_args={"ignore_reinit_error": True},
|
242
|
-
**{"force_reinitialize": True},
|
243
|
-
)
|
204
|
+
init({"test_catalog": catalog}, force=True)
|
244
205
|
|
245
206
|
# Try to get a nonexistent catalog
|
246
207
|
with pytest.raises(ValueError):
|
247
208
|
get_catalog("nonexistent")
|
248
209
|
|
249
|
-
def test_get_catalog_no_default(self,
|
210
|
+
def test_get_catalog_no_default(self, reset_catalogs):
|
250
211
|
"""Test that trying to get the default catalog when none is set raises an error."""
|
251
212
|
# Initialize with multiple catalogs but no default
|
252
213
|
catalog1 = Catalog(impl=MockCatalogImpl, id=1)
|
253
214
|
catalog2 = Catalog(impl=MockCatalogImpl, id=2)
|
254
|
-
init(
|
255
|
-
{"catalog1": catalog1, "catalog2": catalog2},
|
256
|
-
ray_init_args={"ignore_reinit_error": True},
|
257
|
-
**{"force_reinitialize": True},
|
258
|
-
)
|
215
|
+
init({"catalog1": catalog1, "catalog2": catalog2}, force=True)
|
259
216
|
|
260
217
|
# Try to get the default catalog
|
261
218
|
with pytest.raises(ValueError):
|
262
219
|
get_catalog()
|
263
220
|
|
264
|
-
def test_default_catalog_initialization(self,
|
221
|
+
def test_default_catalog_initialization(self, reset_catalogs):
|
265
222
|
"""Test that a Default catalog can be initialized and accessed using the factory method."""
|
266
223
|
from deltacat.catalog.model.properties import CatalogProperties
|
267
224
|
|
@@ -270,15 +227,11 @@ class TestCatalogsIntegration:
|
|
270
227
|
# Create the catalog properties
|
271
228
|
config = CatalogProperties(root=self.temp_dir)
|
272
229
|
|
273
|
-
# Create the catalog
|
274
|
-
catalog = Catalog
|
230
|
+
# Create the catalog
|
231
|
+
catalog = Catalog(config)
|
275
232
|
|
276
233
|
# Initialize DeltaCAT with this catalog
|
277
|
-
init(
|
278
|
-
{catalog_name: catalog},
|
279
|
-
ray_init_args={"ignore_reinit_error": True},
|
280
|
-
**{"force_reinitialize": True},
|
281
|
-
)
|
234
|
+
init({catalog_name: catalog}, force=True)
|
282
235
|
|
283
236
|
# Retrieve the catalog and verify it's the same one
|
284
237
|
retrieved_catalog = get_catalog(catalog_name)
|
@@ -286,16 +239,14 @@ class TestCatalogsIntegration:
|
|
286
239
|
assert isinstance(retrieved_catalog.inner, CatalogProperties)
|
287
240
|
assert retrieved_catalog.inner.root == self.temp_dir
|
288
241
|
|
289
|
-
def test_default_catalog_initialization_from_kwargs(self,
|
242
|
+
def test_default_catalog_initialization_from_kwargs(self, reset_catalogs):
|
290
243
|
|
291
244
|
catalog_name = str(uuid.uuid4())
|
292
|
-
# Initialize DeltaCAT with this catalog
|
293
|
-
from deltacat.catalog.main import impl as DeltacatCatalog
|
294
245
|
|
246
|
+
# Initialize DeltaCAT with this catalog
|
295
247
|
put_catalog(
|
296
248
|
catalog_name,
|
297
|
-
Catalog(
|
298
|
-
ray_init_args={"ignore_reinit_error": True},
|
249
|
+
Catalog(root="test_root"),
|
299
250
|
)
|
300
251
|
|
301
252
|
# Retrieve the catalog and verify it's the same one
|
@@ -304,7 +255,7 @@ class TestCatalogsIntegration:
|
|
304
255
|
assert isinstance(retrieved_catalog.inner, CatalogProperties)
|
305
256
|
assert retrieved_catalog.inner.root == "test_root"
|
306
257
|
|
307
|
-
def test_iceberg_catalog_initialization(self,
|
258
|
+
def test_iceberg_catalog_initialization(self, reset_catalogs):
|
308
259
|
"""Test that an Iceberg catalog can be initialized and accessed."""
|
309
260
|
catalog_name = str(uuid.uuid4())
|
310
261
|
|
@@ -314,11 +265,14 @@ class TestCatalogsIntegration:
|
|
314
265
|
)
|
315
266
|
|
316
267
|
# Create the catalog using the factory method
|
317
|
-
catalog =
|
268
|
+
catalog = IcebergCatalog.from_config(config)
|
318
269
|
|
319
|
-
put_catalog(catalog_name, catalog
|
270
|
+
put_catalog(catalog_name, catalog)
|
320
271
|
|
321
272
|
# Retrieve the catalog and verify it's the same one
|
322
273
|
retrieved_catalog = get_catalog(catalog_name)
|
323
|
-
assert
|
324
|
-
|
274
|
+
assert (
|
275
|
+
retrieved_catalog.impl.__name__
|
276
|
+
== "deltacat.experimental.catalog.iceberg.impl"
|
277
|
+
)
|
278
|
+
assert isinstance(retrieved_catalog.inner, PyIcebergCatalog)
|
@@ -38,8 +38,7 @@ class TestReadTable(unittest.TestCase):
|
|
38
38
|
catalog_config = CatalogProperties(storage=ds)
|
39
39
|
dc.put_catalog(
|
40
40
|
cls.catalog_name,
|
41
|
-
catalog=Catalog
|
42
|
-
ray_init_args={"ignore_reinit_error": True},
|
41
|
+
catalog=Catalog(catalog_config),
|
43
42
|
)
|
44
43
|
super().setUpClass()
|
45
44
|
|
File without changes
|
@@ -0,0 +1,97 @@
|
|
1
|
+
import pytest
|
2
|
+
import pyarrow as pa
|
3
|
+
from daft import DataType, TimeUnit
|
4
|
+
from daft.logical.schema import Field as DaftField
|
5
|
+
|
6
|
+
from deltacat.storage.model.transform import IdentityTransform
|
7
|
+
from deltacat.storage.model.partition import PartitionKey
|
8
|
+
from deltacat.utils.daft import DaftFieldMapper, DaftPartitionKeyMapper
|
9
|
+
|
10
|
+
from deltacat.storage.model.schema import Field, Schema
|
11
|
+
|
12
|
+
|
13
|
+
class TestDaftFieldMapper:
|
14
|
+
def test_field_mapper_basic_types(self):
|
15
|
+
"""Test mapping basic data types between Daft and PyArrow fields"""
|
16
|
+
test_cases = [
|
17
|
+
(DataType.int32(), pa.int32()),
|
18
|
+
(DataType.int64(), pa.int64()),
|
19
|
+
(DataType.float32(), pa.float32()),
|
20
|
+
(DataType.float64(), pa.float64()),
|
21
|
+
(DataType.string(), pa.large_string()),
|
22
|
+
(DataType.bool(), pa.bool_()),
|
23
|
+
(DataType.binary(), pa.large_binary()),
|
24
|
+
(DataType.date(), pa.date32()),
|
25
|
+
(DataType.timestamp(TimeUnit.ns()), pa.timestamp("ns")),
|
26
|
+
]
|
27
|
+
|
28
|
+
for daft_type, pa_type in test_cases:
|
29
|
+
# Create test fields
|
30
|
+
daft_field = DaftField.create(
|
31
|
+
name="test_field",
|
32
|
+
dtype=daft_type,
|
33
|
+
)
|
34
|
+
|
35
|
+
# Daft to PyArrow
|
36
|
+
pa_field = DaftFieldMapper.map(daft_field)
|
37
|
+
assert pa_field is not None
|
38
|
+
assert pa_field.name == "test_field"
|
39
|
+
assert pa_field.type == pa_type # type: ignore
|
40
|
+
assert pa_field.nullable is True
|
41
|
+
|
42
|
+
# PyArrow to Daft
|
43
|
+
daft_field_back = DaftFieldMapper.unmap(pa_field)
|
44
|
+
assert daft_field_back is not None
|
45
|
+
assert daft_field_back.name == daft_field.name
|
46
|
+
assert daft_field_back.dtype == daft_field.dtype
|
47
|
+
|
48
|
+
|
49
|
+
class TestDaftPartitionKeyMapper:
|
50
|
+
def test_unmap(self):
|
51
|
+
"""
|
52
|
+
Test unmap method of DaftPartitionKeyMapper when obj is not None, schema is provided,
|
53
|
+
len(obj.key) is 1, and dc_field is found in the schema.
|
54
|
+
|
55
|
+
This test verifies that the method correctly converts a PartitionKey to a DaftPartitionField
|
56
|
+
when all conditions are met and the field exists in the schema.
|
57
|
+
"""
|
58
|
+
# Create a mock schema
|
59
|
+
schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
|
60
|
+
# Create a PartitionKey object
|
61
|
+
partition_key = PartitionKey(
|
62
|
+
key=["test_field"], transform=IdentityTransform(), name="partition_field"
|
63
|
+
)
|
64
|
+
|
65
|
+
result = DaftPartitionKeyMapper.unmap(obj=partition_key, schema=schema)
|
66
|
+
assert result is not None
|
67
|
+
assert result.field.name() == "partition_field"
|
68
|
+
assert DataType._from_pydatatype(result.field.dtype()) == DataType.int32()
|
69
|
+
|
70
|
+
def test_unmap_no_field_locator(self):
|
71
|
+
schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
|
72
|
+
partition_key = PartitionKey(key=[], name="partition_field")
|
73
|
+
|
74
|
+
with pytest.raises(ValueError) as excinfo:
|
75
|
+
DaftPartitionKeyMapper.unmap(partition_key, schema)
|
76
|
+
|
77
|
+
assert "At least 1 PartitionKey FieldLocator is expected" in str(excinfo.value)
|
78
|
+
|
79
|
+
def test_unmap_partition_key_not_found(self):
|
80
|
+
schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
|
81
|
+
partition_key = PartitionKey(
|
82
|
+
key=["test_field_2"], transform=IdentityTransform(), name="partition_field"
|
83
|
+
)
|
84
|
+
|
85
|
+
with pytest.raises(KeyError) as excinfo:
|
86
|
+
DaftPartitionKeyMapper.unmap(partition_key, schema)
|
87
|
+
|
88
|
+
assert "Column test_field_2 does not exist in schema" in str(excinfo.value)
|
89
|
+
|
90
|
+
def test_unmap_partition_name_not_defined(self):
|
91
|
+
schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
|
92
|
+
partition_key = PartitionKey(key=[])
|
93
|
+
|
94
|
+
with pytest.raises(ValueError) as excinfo:
|
95
|
+
DaftPartitionKeyMapper.unmap(partition_key, schema)
|
96
|
+
|
97
|
+
assert "Name is required for PartitionKey conversion" in str(excinfo.value)
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,71 @@
|
|
1
|
+
import tempfile
|
2
|
+
import shutil
|
3
|
+
import uuid
|
4
|
+
import deltacat
|
5
|
+
import pytest
|
6
|
+
from deltacat import Field, Schema
|
7
|
+
from pyiceberg.catalog import CatalogType
|
8
|
+
|
9
|
+
import pyarrow as pa
|
10
|
+
|
11
|
+
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
|
12
|
+
|
13
|
+
|
14
|
+
@pytest.fixture
|
15
|
+
def schema_a():
|
16
|
+
return Schema.of(
|
17
|
+
[
|
18
|
+
Field.of(
|
19
|
+
field=pa.field("col1", pa.int32(), nullable=False),
|
20
|
+
field_id=1,
|
21
|
+
is_merge_key=True,
|
22
|
+
)
|
23
|
+
]
|
24
|
+
)
|
25
|
+
|
26
|
+
|
27
|
+
class TestIcebergCatalogInitialization:
|
28
|
+
temp_dir = None
|
29
|
+
|
30
|
+
@classmethod
|
31
|
+
def setup_class(cls):
|
32
|
+
cls.temp_dir = tempfile.mkdtemp()
|
33
|
+
|
34
|
+
@classmethod
|
35
|
+
def teardown_class(cls):
|
36
|
+
shutil.rmtree(cls.temp_dir)
|
37
|
+
|
38
|
+
def test_iceberg_catalog_and_table_create(self, schema_a):
|
39
|
+
|
40
|
+
# Register a random catalog name to avoid concurrent test conflicts
|
41
|
+
catalog_name = str(uuid.uuid4())
|
42
|
+
|
43
|
+
config = IcebergCatalogConfig(
|
44
|
+
type=CatalogType.SQL,
|
45
|
+
properties={
|
46
|
+
"warehouse": self.temp_dir,
|
47
|
+
"uri": f"sqlite:////{self.temp_dir}/sql-catalog.db",
|
48
|
+
},
|
49
|
+
)
|
50
|
+
|
51
|
+
# Initialize with the PyIceberg catalog
|
52
|
+
catalog = deltacat.IcebergCatalog.from_config(config)
|
53
|
+
deltacat.init(
|
54
|
+
{catalog_name: catalog},
|
55
|
+
force=True,
|
56
|
+
)
|
57
|
+
|
58
|
+
table_def = deltacat.create_table(
|
59
|
+
"test_table", catalog=catalog_name, schema=schema_a
|
60
|
+
)
|
61
|
+
|
62
|
+
# Fetch table we just created
|
63
|
+
fetched_table_def = deltacat.get_table("test_table", catalog=catalog_name)
|
64
|
+
assert table_def.table_version == fetched_table_def.table_version
|
65
|
+
|
66
|
+
# For now, just check that we created a table version with an equivalent schema
|
67
|
+
assert table_def.table_version.schema.equivalent_to(schema_a)
|
68
|
+
|
69
|
+
# Sanity check that list namespaces works
|
70
|
+
namespaces = deltacat.list_namespaces(catalog=catalog_name).all_items()
|
71
|
+
assert table_def.table.namespace in [n.namespace for n in namespaces]
|
File without changes
|
@@ -0,0 +1,136 @@
|
|
1
|
+
import daft
|
2
|
+
from daft import Table, Identifier
|
3
|
+
import pytest
|
4
|
+
import uuid
|
5
|
+
|
6
|
+
from deltacat.catalog import Catalog as DeltaCATCatalog
|
7
|
+
from deltacat.catalog import CatalogProperties
|
8
|
+
from deltacat.experimental.daft.daft_catalog import DaftCatalog
|
9
|
+
import shutil
|
10
|
+
import tempfile
|
11
|
+
|
12
|
+
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
|
13
|
+
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
|
14
|
+
|
15
|
+
from pyiceberg.catalog import CatalogType
|
16
|
+
|
17
|
+
|
18
|
+
class TestCatalogIntegration:
|
19
|
+
@classmethod
|
20
|
+
def setup_method(cls):
|
21
|
+
cls.tmpdir = tempfile.mkdtemp()
|
22
|
+
|
23
|
+
@classmethod
|
24
|
+
def teardown_method(cls):
|
25
|
+
shutil.rmtree(cls.tmpdir)
|
26
|
+
|
27
|
+
def test_create_table(self):
|
28
|
+
"""Demonstrate DeltaCAT-Daft integration."""
|
29
|
+
# Create a DeltaCAT catalog
|
30
|
+
catalog_props = CatalogProperties(root=self.tmpdir)
|
31
|
+
dc_catalog = DeltaCATCatalog(catalog_props)
|
32
|
+
|
33
|
+
# Use a random catalog name to prevent namespacing conflicts with other tests
|
34
|
+
# Convert the DeltaCAT catalog to a Daft catalog
|
35
|
+
catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
|
36
|
+
|
37
|
+
daft_catalog = DaftCatalog(catalog=dc_catalog, name=catalog_name)
|
38
|
+
|
39
|
+
# Register the catalog with Daft's catalog system
|
40
|
+
daft.attach_catalog(daft_catalog, catalog_name)
|
41
|
+
|
42
|
+
# Create a sample DataFrame
|
43
|
+
df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
|
44
|
+
# Create then get table
|
45
|
+
daft_catalog.create_table(Identifier("example_table"), df)
|
46
|
+
table: Table = daft_catalog.get_table(Identifier("example_table"))
|
47
|
+
assert table.name == "example_table"
|
48
|
+
|
49
|
+
def test_get_table(self):
|
50
|
+
"""Test getting a table from the DeltaCAT-Daft catalog."""
|
51
|
+
# Create a DeltaCAT catalog using the existing tmpdir
|
52
|
+
catalog_props = CatalogProperties(root=self.tmpdir)
|
53
|
+
dc_catalog = DeltaCATCatalog(catalog_props)
|
54
|
+
|
55
|
+
# Convert to DaftCatalog and attach to Daft
|
56
|
+
catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
|
57
|
+
daft_catalog = DaftCatalog(dc_catalog, catalog_name)
|
58
|
+
daft.attach_catalog(daft_catalog, catalog_name)
|
59
|
+
|
60
|
+
# Create a sample DataFrame and table
|
61
|
+
df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
|
62
|
+
table_name = "test_get_table"
|
63
|
+
daft_catalog.create_table(Identifier(table_name), df)
|
64
|
+
|
65
|
+
# Get the table using different forms of identifiers
|
66
|
+
table2 = daft_catalog.get_table(Identifier(table_name))
|
67
|
+
assert table2 is not None
|
68
|
+
assert table2.name == table_name
|
69
|
+
|
70
|
+
# 3. With namespace. DeltaCAT used the default namespace since it was not provided
|
71
|
+
table3 = daft_catalog.get_table(Identifier("default", table_name))
|
72
|
+
assert table3 is not None
|
73
|
+
assert table3.name == table_name
|
74
|
+
|
75
|
+
# Test non-existent table raises an appropriate error
|
76
|
+
with pytest.raises(ValueError, match="Table nonexistent_table not found"):
|
77
|
+
daft_catalog.get_table(Identifier("nonexistent_table"))
|
78
|
+
|
79
|
+
|
80
|
+
class TestIcebergCatalogIntegration:
|
81
|
+
@classmethod
|
82
|
+
def setup_method(cls):
|
83
|
+
cls.tmpdir = tempfile.mkdtemp()
|
84
|
+
|
85
|
+
@classmethod
|
86
|
+
def teardown_method(cls):
|
87
|
+
shutil.rmtree(cls.tmpdir)
|
88
|
+
|
89
|
+
def test_iceberg_catalog_integration(self):
|
90
|
+
# Create a unique warehouse path for this test
|
91
|
+
warehouse_path = self.tmpdir
|
92
|
+
|
93
|
+
# Configure an Iceberg catalog with the warehouse path
|
94
|
+
config = IcebergCatalogConfig(
|
95
|
+
type=CatalogType.SQL,
|
96
|
+
properties={
|
97
|
+
"warehouse": warehouse_path,
|
98
|
+
"uri": f"sqlite:////{warehouse_path}/sql-catalog.db",
|
99
|
+
},
|
100
|
+
)
|
101
|
+
dc_catalog = IcebergCatalog.from_config(config)
|
102
|
+
|
103
|
+
# Convert the DeltaCAT catalog to a Daft catalog
|
104
|
+
catalog_name = f"deltacat_iceberg_{uuid.uuid4().hex[:8]}"
|
105
|
+
daft_catalog = DaftCatalog(dc_catalog, catalog_name)
|
106
|
+
daft.attach_catalog(daft_catalog, catalog_name)
|
107
|
+
|
108
|
+
# Create a sample DataFrame
|
109
|
+
df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
|
110
|
+
|
111
|
+
# Create a table with the Daft catalog
|
112
|
+
table_name = "example_table"
|
113
|
+
namespace = "example_namespace"
|
114
|
+
daft_catalog.create_table(Identifier(namespace, table_name), df)
|
115
|
+
|
116
|
+
# Query that Iceberg table exists using PyIceberg
|
117
|
+
iceberg_catalog = dc_catalog.inner
|
118
|
+
|
119
|
+
# Verify the table exists in the Iceberg catalog
|
120
|
+
tables = iceberg_catalog.list_tables(namespace)
|
121
|
+
|
122
|
+
assert any(
|
123
|
+
t[0] == namespace and t[1] == table_name for t in tables
|
124
|
+
), f"Table {table_name} not found in Iceberg catalog"
|
125
|
+
|
126
|
+
# Load the table from Iceberg catalog and verify its properties
|
127
|
+
iceberg_table = iceberg_catalog.load_table(f"{namespace}.{table_name}")
|
128
|
+
|
129
|
+
# Check that the schema matches our DataFrame
|
130
|
+
schema = iceberg_table.schema()
|
131
|
+
assert (
|
132
|
+
schema.find_field("id") is not None
|
133
|
+
), "Field 'id' not fcound in table schema"
|
134
|
+
assert (
|
135
|
+
schema.find_field("value") is not None
|
136
|
+
), "Field 'value' not found in table schema"
|
File without changes
|
File without changes
|
@@ -3,9 +3,9 @@ import io
|
|
3
3
|
import pytest
|
4
4
|
from faker import Faker
|
5
5
|
|
6
|
-
from deltacat.storage.rivulet.schema.datatype import Datatype
|
7
|
-
from deltacat.storage.rivulet.mvp.Table import MvpTable
|
8
|
-
from deltacat.storage.rivulet.schema.schema import Schema
|
6
|
+
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
|
7
|
+
from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
|
8
|
+
from deltacat.experimental.storage.rivulet.schema.schema import Schema
|
9
9
|
import random
|
10
10
|
import string
|
11
11
|
from PIL import Image
|
File without changes
|