deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
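Most of the moves in this list relocate the Iceberg catalog and Rivulet storage modules under a new deltacat.experimental package. A minimal before/after import sketch, assembled only from the paths in this list and the test diff below; the module paths are taken from the list and no other public symbols are assumed:

# deltacat 2.0.0b9
from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
import deltacat.storage.rivulet.dataset as rivulet_dataset

# deltacat 2.0.0b11
from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
import deltacat.experimental.storage.rivulet.dataset as rivulet_dataset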
@@ -1,23 +1,24 @@
 import unittest
 import pytest
-import ray
 import tempfile
 import shutil
 import uuid
 from unittest import mock
 import os

-from deltacat.catalog import CatalogProperties
-from pyiceberg.catalog import Catalog as IcebergCatalog
-
-from deltacat.catalog.model.catalog import (
+from deltacat.catalog import (
+    CatalogProperties,
     Catalog,
-    init,
+    clear_catalogs,
     get_catalog,
-    put_catalog,
+    init,
     is_initialized,
+    put_catalog,
 )
-from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
+from pyiceberg.catalog import Catalog as PyIcebergCatalog
+
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig

 from pyiceberg.catalog import CatalogType

@@ -25,41 +26,19 @@ from pyiceberg.catalog import CatalogType
 # Test module to mock a catalog implementation
 class MockCatalogImpl:
     @staticmethod
-    def initialize(*args, **kwargs):
+    def initialize(config, *args, **kwargs):
         # Return some state that the catalog would normally maintain
-        return {"initialized": True, "args": args, "kwargs": kwargs}
+        return {
+            "initialized": True,
+            "config": config,
+            "args": args,
+            "kwargs": kwargs,
+        }


 @pytest.fixture(scope="function")
-def reset_catalogs_ray_actor():
-    """
-    Setup and teardown for Ray environment for tests.
-
-    This will kill the actor all_catalogs, essentially wiping global state for catalogs
-
-    NOTE: tests using this fixture must be run serially. As of April 7 2025, the unit test suite had various
-    failures if run in parallel, in part because the state of all_catalogs in ray is shared across tests.
-
-    NOTE: when using this fixture, ensure you pass ray_init_args={"ignore_reinit_error": True} into all
-    functions which may re-initialize ray. This is because the production code checks the all_catalogs actor
-    in order to determine whether it needs to initialize Ray
-    """
-    # Reset the global catalog_actor state before each test
-    import deltacat.catalog.model.catalog as catalog_module
-
-    # Initialize Ray if not already initialized
-    if not ray.is_initialized():
-        ray.init(ignore_reinit_error=True)
-    yield
-
-    # Clean up the actor if it exists
-    if catalog_module.all_catalogs is not None:
-        try:
-            ray.kill(catalog_module.all_catalogs)
-        except Exception:
-            pass
-        finally:
-            catalog_module.all_catalogs = None
+def reset_catalogs():
+    clear_catalogs()


 class TestCatalog(unittest.TestCase):
@@ -74,6 +53,7 @@ class TestCatalog(unittest.TestCase):
         # Check that inner state was correctly initialized
         # This just asserts that kwargs were plumbed through from Catalog constructor
         self.assertTrue(catalog.inner["initialized"])
+        self.assertIsNone(catalog.inner["config"])
         self.assertEqual(catalog.inner["args"], ())
         self.assertEqual(catalog.inner["kwargs"], {})

@@ -81,16 +61,18 @@
         """Test the iceberg factory method correctly creates an Iceberg catalog."""
         # Create a mock for the Iceberg catalog module
         with mock.patch(
-            "deltacat.catalog.model.catalog.IcebergCatalog"
+            "deltacat.experimental.catalog.iceberg.impl.IcebergCatalog"
         ) as mock_iceberg_catalog:
             # Configure the mock to return a known value when initialize is called
             mock_iceberg_catalog.initialize.return_value = {"iceberg": True}

             # Create an Iceberg catalog config and invoke iceberg factory method
             config = IcebergCatalogConfig(type=CatalogType.IN_MEMORY, properties={})
-            catalog = Catalog.iceberg(config)
+            catalog = IcebergCatalog.from_config(config)

             # Check that the implementation is set to iceberg_catalog
+            print(f"catalog.impl: {catalog.impl}")
+            print(f"mock_iceberg_catalog: {mock_iceberg_catalog}")
             self.assertEqual(catalog.impl, mock_iceberg_catalog)
             # Check that the inner state is set to the output of initialize
             self.assertEqual(catalog.inner, {"iceberg": True})
@@ -109,8 +91,7 @@ class TestCatalogsIntegration:
         catalog = Catalog(impl=MockCatalogImpl)
         init(
             catalog,
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
+            force=True,
         )

     @classmethod
@@ -118,17 +99,13 @@
         if cls.temp_dir and os.path.exists(cls.temp_dir):
             shutil.rmtree(cls.temp_dir)

-    def test_init_single_catalog(self, reset_catalogs_ray_actor):
+    def test_init_single_catalog(self, reset_catalogs):
         """Test initializing a single catalog."""

         catalog = Catalog(impl=MockCatalogImpl)

         # Initialize with a single catalog and Ray init args including the namespace
-        init(
-            catalog,
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init(catalog, force=True)

         assert is_initialized()

@@ -137,7 +114,7 @@
         assert retrieved_catalog.impl == MockCatalogImpl
         assert retrieved_catalog.inner["initialized"]

-    def test_init_multiple_catalogs(self, reset_catalogs_ray_actor):
+    def test_init_multiple_catalogs(self, reset_catalogs):
         """Test initializing multiple catalogs."""
         # Create catalogs
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
@@ -145,11 +122,7 @@

         # Initialize with multiple catalogs and Ray init args including the namespace
         catalogs_dict = {"catalog1": catalog1, "catalog2": catalog2}
-        init(
-            catalogs_dict,
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init(catalogs_dict, force=True)

         assert is_initialized()

@@ -162,7 +135,7 @@
         assert retrieved_catalog2.impl == MockCatalogImpl
         assert retrieved_catalog2.inner["kwargs"]["id"] == 2

-    def test_init_with_default_catalog_name(self, reset_catalogs_ray_actor):
+    def test_init_with_default_catalog_name(self, reset_catalogs):
         """Test initializing with a specified default catalog name."""
         # Create catalogs
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
@@ -173,8 +146,7 @@
         init(
             catalogs_dict,
             default="catalog2",
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
+            force=True,
         )

         # Get the default catalog and check it's catalog2
@@ -182,16 +154,12 @@
         assert default_catalog.impl == MockCatalogImpl
         assert default_catalog.inner["kwargs"]["id"] == 2

-    def test_put_catalog(self, reset_catalogs_ray_actor):
+    def test_put_catalog(self, reset_catalogs):
         """Test adding a catalog after initialization."""
         # Initialize with a single catalog
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
         catalog2 = Catalog(impl=MockCatalogImpl, id=2)
-        init(
-            {"catalog1": catalog1},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({"catalog1": catalog1}, force=True)

         # Add a second catalog
         put_catalog("catalog2", catalog2)
@@ -203,21 +171,19 @@
         retrieved_catalog2 = get_catalog("catalog2")
         assert retrieved_catalog2.inner["kwargs"]["id"] == 2

-    def test_put_catalog_that_already_exists(self, reset_catalogs_ray_actor):
+    def test_put_catalog_that_already_exists(self, reset_catalogs):
         catalog = Catalog(impl=MockCatalogImpl, id=1)
         catalog2 = Catalog(impl=MockCatalogImpl, id=2)
         put_catalog(
             "test_catalog",
             catalog,
             id=1,
-            ray_init_args={"ignore_reinit_error": True},
         )

         # Try to add another catalog with the same name. Should not error
         put_catalog(
             "test_catalog",
             catalog2,
-            ray_init_args={"ignore_reinit_error": True},
         )

         retrieved_catalog = get_catalog("test_catalog")
@@ -228,40 +194,31 @@
         put_catalog(
             "test_catalog",
             catalog,
-            ray_init_args={"ignore_reinit_error": True},
             fail_if_exists=True,
         )

-    def test_get_catalog_nonexistent(self, reset_catalogs_ray_actor):
+    def test_get_catalog_nonexistent(self, reset_catalogs):
         """Test that trying to get a nonexistent catalog raises an error."""
         # Initialize with a catalog
         catalog = Catalog(impl=MockCatalogImpl)
-        init(
-            {"test_catalog": catalog},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({"test_catalog": catalog}, force=True)

         # Try to get a nonexistent catalog
         with pytest.raises(ValueError):
             get_catalog("nonexistent")

-    def test_get_catalog_no_default(self, reset_catalogs_ray_actor):
+    def test_get_catalog_no_default(self, reset_catalogs):
         """Test that trying to get the default catalog when none is set raises an error."""
         # Initialize with multiple catalogs but no default
         catalog1 = Catalog(impl=MockCatalogImpl, id=1)
         catalog2 = Catalog(impl=MockCatalogImpl, id=2)
-        init(
-            {"catalog1": catalog1, "catalog2": catalog2},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({"catalog1": catalog1, "catalog2": catalog2}, force=True)

         # Try to get the default catalog
         with pytest.raises(ValueError):
             get_catalog()

-    def test_default_catalog_initialization(self, reset_catalogs_ray_actor):
+    def test_default_catalog_initialization(self, reset_catalogs):
         """Test that a Default catalog can be initialized and accessed using the factory method."""
         from deltacat.catalog.model.properties import CatalogProperties

@@ -270,15 +227,11 @@
         # Create the catalog properties
         config = CatalogProperties(root=self.temp_dir)

-        # Create the catalog using the factory method
-        catalog = Catalog.default(config)
+        # Create the catalog
+        catalog = Catalog(config)

         # Initialize DeltaCAT with this catalog
-        init(
-            {catalog_name: catalog},
-            ray_init_args={"ignore_reinit_error": True},
-            **{"force_reinitialize": True},
-        )
+        init({catalog_name: catalog}, force=True)

         # Retrieve the catalog and verify it's the same one
         retrieved_catalog = get_catalog(catalog_name)
@@ -286,16 +239,14 @@
         assert isinstance(retrieved_catalog.inner, CatalogProperties)
         assert retrieved_catalog.inner.root == self.temp_dir

-    def test_default_catalog_initialization_from_kwargs(self, reset_catalogs_ray_actor):
+    def test_default_catalog_initialization_from_kwargs(self, reset_catalogs):

         catalog_name = str(uuid.uuid4())
-        # Initialize DeltaCAT with this catalog
-        from deltacat.catalog.main import impl as DeltacatCatalog

+        # Initialize DeltaCAT with this catalog
         put_catalog(
             catalog_name,
-            Catalog(DeltacatCatalog, **{"root": "test_root"}),
-            ray_init_args={"ignore_reinit_error": True},
+            Catalog(root="test_root"),
         )

         # Retrieve the catalog and verify it's the same one
@@ -304,7 +255,7 @@
         assert isinstance(retrieved_catalog.inner, CatalogProperties)
         assert retrieved_catalog.inner.root == "test_root"

-    def test_iceberg_catalog_initialization(self, reset_catalogs_ray_actor):
+    def test_iceberg_catalog_initialization(self, reset_catalogs):
         """Test that an Iceberg catalog can be initialized and accessed."""
         catalog_name = str(uuid.uuid4())

@@ -314,11 +265,14 @@
         )

         # Create the catalog using the factory method
-        catalog = Catalog.iceberg(config)
+        catalog = IcebergCatalog.from_config(config)

-        put_catalog(catalog_name, catalog, ray_init_args={"ignore_reinit_error": True})
+        put_catalog(catalog_name, catalog)

         # Retrieve the catalog and verify it's the same one
         retrieved_catalog = get_catalog(catalog_name)
-        assert retrieved_catalog.impl.__name__ == "deltacat.catalog.iceberg.impl"
-        assert isinstance(retrieved_catalog.inner, IcebergCatalog)
+        assert (
+            retrieved_catalog.impl.__name__
+            == "deltacat.experimental.catalog.iceberg.impl"
+        )
+        assert isinstance(retrieved_catalog.inner, PyIcebergCatalog)
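The hunks above drop the Ray-actor-backed catalog registry fixture in favor of a clear_catalogs() reset, and init() now takes a force=True flag instead of ray_init_args/force_reinitialize. A minimal usage sketch assembled only from the calls exercised in these tests; the catalog names and root paths are illustrative:

from deltacat.catalog import (
    Catalog,
    CatalogProperties,
    clear_catalogs,
    get_catalog,
    init,
    is_initialized,
    put_catalog,
)

# Register a filesystem-backed catalog and mark it as the default.
config = CatalogProperties(root="/tmp/deltacat-root")  # illustrative path
init({"my_catalog": Catalog(config)}, default="my_catalog", force=True)
assert is_initialized()

# Catalogs can also be added after init(); keyword args such as root are
# plumbed through to CatalogProperties, as the kwargs test above asserts.
put_catalog("scratch", Catalog(root="/tmp/deltacat-scratch"))
retrieved = get_catalog("scratch")
assert isinstance(retrieved.inner, CatalogProperties)

# Wipe registered catalogs between tests, as the reset_catalogs fixture does.
clear_catalogs()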
@@ -38,8 +38,7 @@ class TestReadTable(unittest.TestCase):
         catalog_config = CatalogProperties(storage=ds)
         dc.put_catalog(
             cls.catalog_name,
-            catalog=Catalog.default(config=catalog_config),
-            ray_init_args={"ignore_reinit_error": True},
+            catalog=Catalog(catalog_config),
         )
         super().setUpClass()

@@ -12,14 +12,13 @@ from pyiceberg.types import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import IdentityTransform
 import pyarrow as pa
+import daft

 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
-from collections import defaultdict
 from deltacat.compute.converter.utils.converter_session_utils import (
     group_all_files_to_each_bucket,
 )
@@ -244,11 +243,14 @@ def test_converter_drop_duplicates_success(
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
             identifier_fields=["primary_key"],
-            compact_small_files=False,
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
             s3_file_system=s3_file_system,
+            s3_client_kwargs={},
         )

     number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
@@ -272,38 +274,31 @@
         [number_partitioned_array_3, primary_key_array_3], names=names
     )

+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
     download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
     )
-    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)

     convert_ref = convert.remote(convert_input)

     to_be_deleted_files_list = []
-    to_be_added_files_dict_list = []
-    convert_result = ray.get(convert_ref)
-
-    partition_value = convert_input.convert_input_files.partition_value

-    if convert_result[0]:
-        to_be_deleted_files_list.extend(convert_result[0].values())
-
-    file_location = convert_result[1][partition_value][0]
-    to_be_added_files = f"s3://{file_location}"
+    convert_result = ray.get(convert_ref)

-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = [to_be_added_files]
-    to_be_added_files_dict_list.append(to_be_added_files_dict)
+    to_be_added_files_list = []
+    # Check if there're files to delete
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)

-    # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
     commit_append_snapshot(
         iceberg_table=tbl,
-        new_position_delete_files=new_position_delete_files,
+        new_position_delete_files=to_be_added_files_list,
     )
     tbl.refresh()

@@ -413,11 +408,14 @@ def test_converter_pos_delete_read_by_spark_success(
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
             identifier_fields=["primary_key"],
-            compact_small_files=False,
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
             s3_file_system=s3_file_system,
+            s3_client_kwargs={},
         )

     primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
@@ -432,39 +430,30 @@
     names = ["primary_key"]
     data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)

+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
     download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
     )
-    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)

     convert_ref = convert.remote(convert_input)

     to_be_deleted_files_list = []
-    to_be_added_files_dict_list = []
+    to_be_added_files_list = []
     convert_result = ray.get(convert_ref)

-    partition_value = convert_input.convert_input_files.partition_value
-
-    if convert_result[0]:
-        to_be_deleted_files_list.extend(convert_result[0].values())
-
-    file_location = convert_result[1][partition_value][0]
-    to_be_added_files = f"s3://{file_location}"
-
-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = [to_be_added_files]
-    to_be_added_files_dict_list.append(to_be_added_files_dict)
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)

     # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
-
     commit_append_snapshot(
         iceberg_table=tbl,
-        new_position_delete_files=new_position_delete_files,
+        new_position_delete_files=to_be_added_files_list,
     )
     tbl.refresh()

@@ -476,3 +465,177 @@
     ]
     all_pk_sorted = sorted(all_pk)
     assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
+
+
+@pytest.mark.integration
+def test_converter_pos_delete_multiple_identifier_fields_success(
+    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+) -> None:
+    """
+    Test for convert compute remote function happy case. Download file results are mocked.
+    """
+
+    # 1. Create Iceberg table
+    namespace = "default"
+    table_name = "table_converter_ray_pos_delete_multiple_identifier_fields"
+
+    identifier = f"{namespace}.{table_name}"
+
+    schema = Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    partition_spec = PartitionSpec(partition_field_identity)
+
+    properties = dict()
+    properties["write.format.default"] = "parquet"
+    properties["write.delete.mode"] = "merge-on-read"
+    properties["write.update.mode"] = "merge-on-read"
+    properties["write.merge.mode"] = "merge-on-read"
+    properties["format-version"] = "2"
+
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+
+    # 2. Use Spark to generate initial data files
+    tbl = session_catalog.load_table(identifier)
+
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk4", 1), (0, "pk2", 3), (0, "pk3", 4)
+            """
+        ],
+    )
+    tbl.refresh()
+
+    # 3. Use convert.remote() function to compute position deletes
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=["primary_key1", "primary_key2"],
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            s3_file_system=s3_file_system,
+            s3_client_kwargs={},
+        )
+
+    names = ["primary_key1", "primary_key2"]
+
+    primary_key1_array_1 = pa.array(["pk1", "pk2", "pk3"])
+    primary_key2_array_1 = pa.array([1, 2, 3])
+    data_table_1 = pa.Table.from_arrays(
+        [primary_key1_array_1, primary_key2_array_1], names=names
+    )
+
+    primary_key1_array_2 = pa.array(["pk1", "pk2", "pk3"])
+    primary_key2_array_2 = pa.array([1, 2, 3])
+    data_table_2 = pa.Table.from_arrays(
+        [primary_key1_array_2, primary_key2_array_2], names=names
+    )
+
+    primary_key1_array_3 = pa.array(["pk4", "pk2", "pk3"])
+    primary_key2_array_3 = pa.array([1, 3, 4])
+    data_table_3 = pa.Table.from_arrays(
+        [primary_key1_array_3, primary_key2_array_3], names=names
+    )
+
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
+    )
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+
+    convert_ref = convert.remote(convert_input)
+
+    to_be_deleted_files_list = []
+    to_be_added_files_list = []
+    convert_result = ray.get(convert_ref)
+
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
+
+    # 4. Commit position delete, delete equality deletes from table
+
+    commit_append_snapshot(
+        iceberg_table=tbl,
+        new_position_delete_files=to_be_added_files_list,
+    )
+    tbl.refresh()
+
+    # 5. Result assertion: Expected unique primary keys to be kept
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+    expected_result_tuple_list = [
+        ("pk1", 1),
+        ("pk2", 2),
+        ("pk2", 3),
+        ("pk3", 3),
+        ("pk3", 4),
+        ("pk4", 1),
+    ]
+    pk_combined_res = []
+    for pk1, pk2 in zip(
+        pyiceberg_scan_table_rows["primary_key1"],
+        pyiceberg_scan_table_rows["primary_key2"],
+    ):
+        pk_combined_res.append((pk1, pk2))
+
+    # Assert elements are same disregard ordering in list
+    assert sorted(pk_combined_res) == sorted(expected_result_tuple_list)
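The converter tests above now mock deltacat.compute.converter.utils.io.daft_read_parquet with Daft DataFrames and read named fields from the returned ConvertResult instead of indexing a tuple. A small helper sketch that factors out the repeated result-handling pattern; only the to_be_deleted_files and to_be_added_files field names come from the diff, the helper itself is illustrative:

def collect_convert_output(convert_result):
    """Split a ConvertResult into files to delete and position-delete files to commit."""
    to_be_deleted_files_list = []
    to_be_added_files_list = []
    if convert_result.to_be_deleted_files:
        # Dict whose values are groups of files superseded by the new position deletes.
        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
    if convert_result.to_be_added_files:
        # New position-delete files to pass to commit_append_snapshot().
        to_be_added_files_list.extend(convert_result.to_be_added_files)
    return to_be_deleted_files_list, to_be_added_files_list

# Usage mirroring the tests:
#   convert_result = ray.get(convert.remote(convert_input))
#   deleted, added = collect_convert_output(convert_result)
#   commit_append_snapshot(iceberg_table=tbl, new_position_delete_files=added)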