deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,24 @@
 import unittest
 import pytest
-import ray
 import tempfile
 import shutil
 import uuid
 from unittest import mock
 import os
 
-from deltacat.catalog import
-
-
-from deltacat.catalog.model.catalog import (
+from deltacat.catalog import (
+    CatalogProperties,
     Catalog,
-
+    clear_catalogs,
     get_catalog,
-
+    init,
     is_initialized,
+    put_catalog,
 )
-from deltacat.catalog.iceberg
+from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
+from pyiceberg.catalog import Catalog as PyIcebergCatalog
+
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
 
 from pyiceberg.catalog import CatalogType
 
@@ -25,41 +26,19 @@ from pyiceberg.catalog import CatalogType
 # Test module to mock a catalog implementation
 class MockCatalogImpl:
     @staticmethod
-    def initialize(*args, **kwargs):
+    def initialize(config, *args, **kwargs):
         # Return some state that the catalog would normally maintain
-        return {
+        return {
+            "initialized": True,
+            "config": config,
+            "args": args,
+            "kwargs": kwargs,
+        }
 
 
 @pytest.fixture(scope="function")
-def
-
-    Setup and teardown for Ray environment for tests.
-
-    This will kill the actor all_catalogs, essentially wiping global state for catalogs
-
-    NOTE: tests using this fixture must be run serially. As of April 7 2025, the unit test suite had various
-    failures if run in parallel, in part because the state of all_catalogs in ray is shared across tests.
-
-    NOTE: when using this fixture, ensure you pass ray_init_args={"ignore_reinit_error": True} into all
-    functions which may re-initialize ray. This is because the production code checks the all_catalogs actor
-    in order to determine whether it needs to initialize Ray
-    """
-    # Reset the global catalog_actor state before each test
-    import deltacat.catalog.model.catalog as catalog_module
-
-    # Initialize Ray if not already initialized
-    if not ray.is_initialized():
-        ray.init(ignore_reinit_error=True)
-    yield
-
-    # Clean up the actor if it exists
-    if catalog_module.all_catalogs is not None:
-        try:
-            ray.kill(catalog_module.all_catalogs)
-        except Exception:
-            pass
-        finally:
-            catalog_module.all_catalogs = None
+def reset_catalogs():
+    clear_catalogs()
 
 
 class TestCatalog(unittest.TestCase):
@@ -74,6 +53,7 @@ class TestCatalog(unittest.TestCase):
         # Check that inner state was correctly initialized
         # This just asserts that kwargs were plumbed through from Catalog constructor
         self.assertTrue(catalog.inner["initialized"])
+        self.assertIsNone(catalog.inner["config"])
         self.assertEqual(catalog.inner["args"], ())
         self.assertEqual(catalog.inner["kwargs"], {})
 
@@ -81,16 +61,18 @@
         """Test the iceberg factory method correctly creates an Iceberg catalog."""
         # Create a mock for the Iceberg catalog module
         with mock.patch(
-            "deltacat.catalog.
+            "deltacat.experimental.catalog.iceberg.impl.IcebergCatalog"
         ) as mock_iceberg_catalog:
             # Configure the mock to return a known value when initialize is called
             mock_iceberg_catalog.initialize.return_value = {"iceberg": True}
 
             # Create an Iceberg catalog config and invoke iceberg factory method
             config = IcebergCatalogConfig(type=CatalogType.IN_MEMORY, properties={})
-            catalog =
+            catalog = IcebergCatalog.from_config(config)
 
             # Check that the implementation is set to iceberg_catalog
+            print(f"catalog.impl: {catalog.impl}")
+            print(f"mock_iceberg_catalog: {mock_iceberg_catalog}")
             self.assertEqual(catalog.impl, mock_iceberg_catalog)
             # Check that the inner state is set to the output of initialize
             self.assertEqual(catalog.inner, {"iceberg": True})
@@ -109,8 +91,7 @@ class TestCatalogsIntegration:
         catalog = Catalog(impl=MockCatalogImpl)
         init(
             catalog,
-
-            **{"force_reinitialize": True},
+            force=True,
         )
 
     @classmethod
@@ -118,17 +99,13 @@
         if cls.temp_dir and os.path.exists(cls.temp_dir):
             shutil.rmtree(cls.temp_dir)
 
-    def test_init_single_catalog(self,
+    def test_init_single_catalog(self, reset_catalogs):
         """Test initializing a single catalog."""
 
         catalog = Catalog(impl=MockCatalogImpl)
 
         # Initialize with a single catalog and Ray init args including the namespace
-        init(
-            catalog,
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init(catalog, force=True)
 
         assert is_initialized()
 
@@ -137,7 +114,7 @@
         assert retrieved_catalog.impl == MockCatalogImpl
         assert retrieved_catalog.inner["initialized"]
 
-    def test_init_multiple_catalogs(self,
+    def test_init_multiple_catalogs(self, reset_catalogs):
         """Test initializing multiple catalogs."""
         # Create catalogs
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
@@ -145,11 +122,7 @@
 
         # Initialize with multiple catalogs and Ray init args including the namespace
         catalogs_dict = {"catalog1": catalog1, "catalog2": catalog2}
-        init(
-            catalogs_dict,
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init(catalogs_dict, force=True)
 
         assert is_initialized()
 
@@ -162,7 +135,7 @@
         assert retrieved_catalog2.impl == MockCatalogImpl
         assert retrieved_catalog2.inner["kwargs"]["id"] == 2
 
-    def test_init_with_default_catalog_name(self,
+    def test_init_with_default_catalog_name(self, reset_catalogs):
         """Test initializing with a specified default catalog name."""
         # Create catalogs
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
@@ -173,8 +146,7 @@
         init(
             catalogs_dict,
             default="catalog2",
-
-            **{"force_reinitialize": True},
+            force=True,
         )
 
         # Get the default catalog and check it's catalog2
@@ -182,16 +154,12 @@
         assert default_catalog.impl == MockCatalogImpl
         assert default_catalog.inner["kwargs"]["id"] == 2
 
-    def test_put_catalog(self,
+    def test_put_catalog(self, reset_catalogs):
         """Test adding a catalog after initialization."""
         # Initialize with a single catalog
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
         catalog2 = Catalog(impl=MockCatalogImpl, id=2)
-        init(
-            {"catalog1": catalog1},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({"catalog1": catalog1}, force=True)
 
         # Add a second catalog
         put_catalog("catalog2", catalog2)
@@ -203,21 +171,19 @@
         retrieved_catalog2 = get_catalog("catalog2")
         assert retrieved_catalog2.inner["kwargs"]["id"] == 2
 
-    def test_put_catalog_that_already_exists(self,
+    def test_put_catalog_that_already_exists(self, reset_catalogs):
         catalog = Catalog(impl=MockCatalogImpl, id=1)
         catalog2 = Catalog(impl=MockCatalogImpl, id=2)
         put_catalog(
             "test_catalog",
             catalog,
             id=1,
-            ray_init_args={"ignore_reinit_error": True},
         )
 
         # Try to add another catalog with the same name. Should not error
         put_catalog(
             "test_catalog",
             catalog2,
-            ray_init_args={"ignore_reinit_error": True},
         )
 
         retrieved_catalog = get_catalog("test_catalog")
@@ -228,40 +194,31 @@
             put_catalog(
                 "test_catalog",
                 catalog,
-                ray_init_args={"ignore_reinit_error": True},
                 fail_if_exists=True,
             )
 
-    def test_get_catalog_nonexistent(self,
+    def test_get_catalog_nonexistent(self, reset_catalogs):
        """Test that trying to get a nonexistent catalog raises an error."""
        # Initialize with a catalog
        catalog = Catalog(impl=MockCatalogImpl)
-        init(
-            {"test_catalog": catalog},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({"test_catalog": catalog}, force=True)
 
         # Try to get a nonexistent catalog
         with pytest.raises(ValueError):
             get_catalog("nonexistent")
 
-    def test_get_catalog_no_default(self,
+    def test_get_catalog_no_default(self, reset_catalogs):
         """Test that trying to get the default catalog when none is set raises an error."""
         # Initialize with multiple catalogs but no default
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
         catalog2 = Catalog(impl=MockCatalogImpl, id=2)
-        init(
-            {"catalog1": catalog1, "catalog2": catalog2},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({"catalog1": catalog1, "catalog2": catalog2}, force=True)
 
         # Try to get the default catalog
         with pytest.raises(ValueError):
             get_catalog()
 
-    def test_default_catalog_initialization(self,
+    def test_default_catalog_initialization(self, reset_catalogs):
         """Test that a Default catalog can be initialized and accessed using the factory method."""
         from deltacat.catalog.model.properties import CatalogProperties
 
@@ -270,15 +227,11 @@
         # Create the catalog properties
         config = CatalogProperties(root=self.temp_dir)
 
-        # Create the catalog
-        catalog = Catalog
+        # Create the catalog
+        catalog = Catalog(config)
 
         # Initialize DeltaCAT with this catalog
-        init(
-            {catalog_name: catalog},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({catalog_name: catalog}, force=True)
 
         # Retrieve the catalog and verify it's the same one
         retrieved_catalog = get_catalog(catalog_name)
@@ -286,16 +239,14 @@
         assert isinstance(retrieved_catalog.inner, CatalogProperties)
         assert retrieved_catalog.inner.root == self.temp_dir
 
-    def test_default_catalog_initialization_from_kwargs(self,
+    def test_default_catalog_initialization_from_kwargs(self, reset_catalogs):
 
         catalog_name = str(uuid.uuid4())
-        # Initialize DeltaCAT with this catalog
-        from deltacat.catalog.main import impl as DeltacatCatalog
 
+        # Initialize DeltaCAT with this catalog
         put_catalog(
             catalog_name,
-            Catalog(
-                ray_init_args={"ignore_reinit_error": True},
+            Catalog(root="test_root"),
         )
 
         # Retrieve the catalog and verify it's the same one
@@ -304,7 +255,7 @@
         assert isinstance(retrieved_catalog.inner, CatalogProperties)
         assert retrieved_catalog.inner.root == "test_root"
 
-    def test_iceberg_catalog_initialization(self,
+    def test_iceberg_catalog_initialization(self, reset_catalogs):
         """Test that an Iceberg catalog can be initialized and accessed."""
         catalog_name = str(uuid.uuid4())
 
@@ -314,11 +265,14 @@
         )
 
         # Create the catalog using the factory method
-        catalog =
+        catalog = IcebergCatalog.from_config(config)
 
-        put_catalog(catalog_name, catalog
+        put_catalog(catalog_name, catalog)
 
         # Retrieve the catalog and verify it's the same one
         retrieved_catalog = get_catalog(catalog_name)
-        assert
-
+        assert (
+            retrieved_catalog.impl.__name__
+            == "deltacat.experimental.catalog.iceberg.impl"
+        )
+        assert isinstance(retrieved_catalog.inner, PyIcebergCatalog)
@@ -38,8 +38,7 @@ class TestReadTable(unittest.TestCase):
         catalog_config = CatalogProperties(storage=ds)
         dc.put_catalog(
             cls.catalog_name,
-            catalog=Catalog
-            ray_init_args={"ignore_reinit_error": True},
+            catalog=Catalog(catalog_config),
         )
         super().setUpClass()
 
@@ -12,14 +12,13 @@ from pyiceberg.types import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import IdentityTransform
 import pyarrow as pa
+import daft
 
 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
-from collections import defaultdict
 from deltacat.compute.converter.utils.converter_session_utils import (
     group_all_files_to_each_bucket,
 )
@@ -244,11 +243,14 @@ def test_converter_drop_duplicates_success(
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
             identifier_fields=["primary_key"],
-
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
             s3_file_system=s3_file_system,
+            s3_client_kwargs={},
         )
 
     number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
@@ -272,38 +274,31 @@
         [number_partitioned_array_3, primary_key_array_3], names=names
     )
 
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
     download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
     )
-    download_data_mock.side_effect = (
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
 
     convert_ref = convert.remote(convert_input)
 
     to_be_deleted_files_list = []
-    to_be_added_files_dict_list = []
-    convert_result = ray.get(convert_ref)
-
-    partition_value = convert_input.convert_input_files.partition_value
 
-
-    to_be_deleted_files_list.extend(convert_result[0].values())
-
-    file_location = convert_result[1][partition_value][0]
-    to_be_added_files = f"s3://{file_location}"
+    convert_result = ray.get(convert_ref)
 
-
-
-
+    to_be_added_files_list = []
+    # Check if there're files to delete
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
 
-    # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
     commit_append_snapshot(
         iceberg_table=tbl,
-        new_position_delete_files=
+        new_position_delete_files=to_be_added_files_list,
     )
     tbl.refresh()
 
@@ -413,11 +408,14 @@ def test_converter_pos_delete_read_by_spark_success(
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
             identifier_fields=["primary_key"],
-
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
             s3_file_system=s3_file_system,
+            s3_client_kwargs={},
         )
 
     primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
@@ -432,39 +430,30 @@
     names = ["primary_key"]
     data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)
 
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
     download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
    )
-    download_data_mock.side_effect = (
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
 
     convert_ref = convert.remote(convert_input)
 
     to_be_deleted_files_list = []
-
+    to_be_added_files_list = []
     convert_result = ray.get(convert_ref)
 
-
-
-    if convert_result
-
-
-    file_location = convert_result[1][partition_value][0]
-    to_be_added_files = f"s3://{file_location}"
-
-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = [to_be_added_files]
-    to_be_added_files_dict_list.append(to_be_added_files_dict)
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
 
     # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
-
     commit_append_snapshot(
         iceberg_table=tbl,
-        new_position_delete_files=
+        new_position_delete_files=to_be_added_files_list,
     )
     tbl.refresh()
 
@@ -476,3 +465,177 @@ def test_converter_pos_delete_read_by_spark_success(
     ]
     all_pk_sorted = sorted(all_pk)
     assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
+
+
+@pytest.mark.integration
+def test_converter_pos_delete_multiple_identifier_fields_success(
+    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+) -> None:
+    """
+    Test for convert compute remote function happy case. Download file results are mocked.
+    """
+
+    # 1. Create Iceberg table
+    namespace = "default"
+    table_name = "table_converter_ray_pos_delete_multiple_identifier_fields"
+
+    identifier = f"{namespace}.{table_name}"
+
+    schema = Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    partition_spec = PartitionSpec(partition_field_identity)
+
+    properties = dict()
+    properties["write.format.default"] = "parquet"
+    properties["write.delete.mode"] = "merge-on-read"
+    properties["write.update.mode"] = "merge-on-read"
+    properties["write.merge.mode"] = "merge-on-read"
+    properties["format-version"] = "2"
+
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+
+    # 2. Use Spark to generate initial data files
+    tbl = session_catalog.load_table(identifier)
+
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk4", 1), (0, "pk2", 3), (0, "pk3", 4)
+            """
+        ],
+    )
+    tbl.refresh()
+
+    # 3. Use convert.remote() function to compute position deletes
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=["primary_key1", "primary_key2"],
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            s3_file_system=s3_file_system,
+            s3_client_kwargs={},
+        )
+
+    names = ["primary_key1", "primary_key2"]
+
+    primary_key1_array_1 = pa.array(["pk1", "pk2", "pk3"])
+    primary_key2_array_1 = pa.array([1, 2, 3])
+    data_table_1 = pa.Table.from_arrays(
+        [primary_key1_array_1, primary_key2_array_1], names=names
+    )
+
+    primary_key1_array_2 = pa.array(["pk1", "pk2", "pk3"])
+    primary_key2_array_2 = pa.array([1, 2, 3])
+    data_table_2 = pa.Table.from_arrays(
+        [primary_key1_array_2, primary_key2_array_2], names=names
+    )
+
+    primary_key1_array_3 = pa.array(["pk4", "pk2", "pk3"])
+    primary_key2_array_3 = pa.array([1, 3, 4])
+    data_table_3 = pa.Table.from_arrays(
+        [primary_key1_array_3, primary_key2_array_3], names=names
+    )
+
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
+    )
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+
+    convert_ref = convert.remote(convert_input)
+
+    to_be_deleted_files_list = []
+    to_be_added_files_list = []
+    convert_result = ray.get(convert_ref)
+
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
+
+    # 4. Commit position delete, delete equality deletes from table
+
+    commit_append_snapshot(
+        iceberg_table=tbl,
+        new_position_delete_files=to_be_added_files_list,
+    )
+    tbl.refresh()
+
+    # 5. Result assertion: Expected unique primary keys to be kept
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+    expected_result_tuple_list = [
+        ("pk1", 1),
+        ("pk2", 2),
+        ("pk2", 3),
+        ("pk3", 3),
+        ("pk3", 4),
+        ("pk4", 1),
+    ]
+    pk_combined_res = []
+    for pk1, pk2 in zip(
+        pyiceberg_scan_table_rows["primary_key1"],
+        pyiceberg_scan_table_rows["primary_key2"],
+    ):
+        pk_combined_res.append((pk1, pk2))
+
+    # Assert elements are same disregard ordering in list
+    assert sorted(pk_combined_res) == sorted(expected_result_tuple_list)
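The converter test hunks above swap the mocked download target to deltacat.compute.converter.utils.io.daft_read_parquet, feed it Daft DataFrames built with daft.from_arrow, and read results from the new ConvertResult object via its to_be_deleted_files and to_be_added_files fields. A small illustrative sketch of constructing such a mock return value from Arrow data, assuming daft and pyarrow are installed (this mirrors the tests and is not the converter's own code path):

import daft
import pyarrow as pa

# Arrow data shaped like the multiple-identifier-fields test above.
data_table = pa.Table.from_arrays(
    [pa.array(["pk1", "pk2", "pk3"]), pa.array([1, 2, 3])],
    names=["primary_key1", "primary_key2"],
)

# The mocked daft_read_parquet in the tests returns Daft DataFrames like this one.
daft_df = daft.from_arrow(data_table)
print(daft_df.to_pydict())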
File without changes