deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,97 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+ from daft import DataType, TimeUnit
4
+ from daft.logical.schema import Field as DaftField
5
+
6
+ from deltacat.storage.model.transform import IdentityTransform
7
+ from deltacat.storage.model.partition import PartitionKey
8
+ from deltacat.utils.daft import DaftFieldMapper, DaftPartitionKeyMapper
9
+
10
+ from deltacat.storage.model.schema import Field, Schema
11
+
12
+
13
+ class TestDaftFieldMapper:
14
+ def test_field_mapper_basic_types(self):
15
+ """Test mapping basic data types between Daft and PyArrow fields"""
16
+ test_cases = [
17
+ (DataType.int32(), pa.int32()),
18
+ (DataType.int64(), pa.int64()),
19
+ (DataType.float32(), pa.float32()),
20
+ (DataType.float64(), pa.float64()),
21
+ (DataType.string(), pa.large_string()),
22
+ (DataType.bool(), pa.bool_()),
23
+ (DataType.binary(), pa.large_binary()),
24
+ (DataType.date(), pa.date32()),
25
+ (DataType.timestamp(TimeUnit.ns()), pa.timestamp("ns")),
26
+ ]
27
+
28
+ for daft_type, pa_type in test_cases:
29
+ # Create test fields
30
+ daft_field = DaftField.create(
31
+ name="test_field",
32
+ dtype=daft_type,
33
+ )
34
+
35
+ # Daft to PyArrow
36
+ pa_field = DaftFieldMapper.map(daft_field)
37
+ assert pa_field is not None
38
+ assert pa_field.name == "test_field"
39
+ assert pa_field.type == pa_type # type: ignore
40
+ assert pa_field.nullable is True
41
+
42
+ # PyArrow to Daft
43
+ daft_field_back = DaftFieldMapper.unmap(pa_field)
44
+ assert daft_field_back is not None
45
+ assert daft_field_back.name == daft_field.name
46
+ assert daft_field_back.dtype == daft_field.dtype
47
+
48
+
49
+ class TestDaftPartitionKeyMapper:
50
+ def test_unmap(self):
51
+ """
52
+ Test unmap method of DaftPartitionKeyMapper when obj is not None, schema is provided,
53
+ len(obj.key) is 1, and dc_field is found in the schema.
54
+
55
+ This test verifies that the method correctly converts a PartitionKey to a DaftPartitionField
56
+ when all conditions are met and the field exists in the schema.
57
+ """
58
+ # Create a mock schema
59
+ schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
60
+ # Create a PartitionKey object
61
+ partition_key = PartitionKey(
62
+ key=["test_field"], transform=IdentityTransform(), name="partition_field"
63
+ )
64
+
65
+ result = DaftPartitionKeyMapper.unmap(obj=partition_key, schema=schema)
66
+ assert result is not None
67
+ assert result.field.name() == "partition_field"
68
+ assert DataType._from_pydatatype(result.field.dtype()) == DataType.int32()
69
+
70
+ def test_unmap_no_field_locator(self):
71
+ schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
72
+ partition_key = PartitionKey(key=[], name="partition_field")
73
+
74
+ with pytest.raises(ValueError) as excinfo:
75
+ DaftPartitionKeyMapper.unmap(partition_key, schema)
76
+
77
+ assert "At least 1 PartitionKey FieldLocator is expected" in str(excinfo.value)
78
+
79
+ def test_unmap_partition_key_not_found(self):
80
+ schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
81
+ partition_key = PartitionKey(
82
+ key=["test_field_2"], transform=IdentityTransform(), name="partition_field"
83
+ )
84
+
85
+ with pytest.raises(KeyError) as excinfo:
86
+ DaftPartitionKeyMapper.unmap(partition_key, schema)
87
+
88
+ assert "Column test_field_2 does not exist in schema" in str(excinfo.value)
89
+
90
+ def test_unmap_partition_name_not_defined(self):
91
+ schema = Schema.of(schema=[Field.of(pa.field("test_field", pa.int32()))])
92
+ partition_key = PartitionKey(key=[])
93
+
94
+ with pytest.raises(ValueError) as excinfo:
95
+ DaftPartitionKeyMapper.unmap(partition_key, schema)
96
+
97
+ assert "Name is required for PartitionKey conversion" in str(excinfo.value)
File without changes
File without changes
@@ -0,0 +1,71 @@
1
+ import tempfile
2
+ import shutil
3
+ import uuid
4
+ import deltacat
5
+ import pytest
6
+ from deltacat import Field, Schema
7
+ from pyiceberg.catalog import CatalogType
8
+
9
+ import pyarrow as pa
10
+
11
+ from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
12
+
13
+
14
+ @pytest.fixture
15
+ def schema_a():
16
+ return Schema.of(
17
+ [
18
+ Field.of(
19
+ field=pa.field("col1", pa.int32(), nullable=False),
20
+ field_id=1,
21
+ is_merge_key=True,
22
+ )
23
+ ]
24
+ )
25
+
26
+
27
+ class TestIcebergCatalogInitialization:
28
+ temp_dir = None
29
+
30
+ @classmethod
31
+ def setup_class(cls):
32
+ cls.temp_dir = tempfile.mkdtemp()
33
+
34
+ @classmethod
35
+ def teardown_class(cls):
36
+ shutil.rmtree(cls.temp_dir)
37
+
38
+ def test_iceberg_catalog_and_table_create(self, schema_a):
39
+
40
+ # Register a random catalog name to avoid concurrent test conflicts
41
+ catalog_name = str(uuid.uuid4())
42
+
43
+ config = IcebergCatalogConfig(
44
+ type=CatalogType.SQL,
45
+ properties={
46
+ "warehouse": self.temp_dir,
47
+ "uri": f"sqlite:////{self.temp_dir}/sql-catalog.db",
48
+ },
49
+ )
50
+
51
+ # Initialize with the PyIceberg catalog
52
+ catalog = deltacat.IcebergCatalog.from_config(config)
53
+ deltacat.init(
54
+ {catalog_name: catalog},
55
+ force=True,
56
+ )
57
+
58
+ table_def = deltacat.create_table(
59
+ "test_table", catalog=catalog_name, schema=schema_a
60
+ )
61
+
62
+ # Fetch table we just created
63
+ fetched_table_def = deltacat.get_table("test_table", catalog=catalog_name)
64
+ assert table_def.table_version == fetched_table_def.table_version
65
+
66
+ # For now, just check that we created a table version with an equivalent schema
67
+ assert table_def.table_version.schema.equivalent_to(schema_a)
68
+
69
+ # Sanity check that list namespaces works
70
+ namespaces = deltacat.list_namespaces(catalog=catalog_name).all_items()
71
+ assert table_def.table.namespace in [n.namespace for n in namespaces]
File without changes
@@ -0,0 +1,136 @@
1
+ import daft
2
+ from daft import Table, Identifier
3
+ import pytest
4
+ import uuid
5
+
6
+ from deltacat.catalog import Catalog as DeltaCATCatalog
7
+ from deltacat.catalog import CatalogProperties
8
+ from deltacat.experimental.daft.daft_catalog import DaftCatalog
9
+ import shutil
10
+ import tempfile
11
+
12
+ from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
13
+ from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
14
+
15
+ from pyiceberg.catalog import CatalogType
16
+
17
+
18
+ class TestCatalogIntegration:
19
+ @classmethod
20
+ def setup_method(cls):
21
+ cls.tmpdir = tempfile.mkdtemp()
22
+
23
+ @classmethod
24
+ def teardown_method(cls):
25
+ shutil.rmtree(cls.tmpdir)
26
+
27
+ def test_create_table(self):
28
+ """Demonstrate DeltaCAT-Daft integration."""
29
+ # Create a DeltaCAT catalog
30
+ catalog_props = CatalogProperties(root=self.tmpdir)
31
+ dc_catalog = DeltaCATCatalog(catalog_props)
32
+
33
+ # Use a random catalog name to prevent namespacing conflicts with other tests
34
+ # Convert the DeltaCAT catalog to a Daft catalog
35
+ catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
36
+
37
+ daft_catalog = DaftCatalog(catalog=dc_catalog, name=catalog_name)
38
+
39
+ # Register the catalog with Daft's catalog system
40
+ daft.attach_catalog(daft_catalog, catalog_name)
41
+
42
+ # Create a sample DataFrame
43
+ df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
44
+ # Create then get table
45
+ daft_catalog.create_table(Identifier("example_table"), df)
46
+ table: Table = daft_catalog.get_table(Identifier("example_table"))
47
+ assert table.name == "example_table"
48
+
49
+ def test_get_table(self):
50
+ """Test getting a table from the DeltaCAT-Daft catalog."""
51
+ # Create a DeltaCAT catalog using the existing tmpdir
52
+ catalog_props = CatalogProperties(root=self.tmpdir)
53
+ dc_catalog = DeltaCATCatalog(catalog_props)
54
+
55
+ # Convert to DaftCatalog and attach to Daft
56
+ catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
57
+ daft_catalog = DaftCatalog(dc_catalog, catalog_name)
58
+ daft.attach_catalog(daft_catalog, catalog_name)
59
+
60
+ # Create a sample DataFrame and table
61
+ df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
62
+ table_name = "test_get_table"
63
+ daft_catalog.create_table(Identifier(table_name), df)
64
+
65
+ # Get the table using different forms of identifiers
66
+ table2 = daft_catalog.get_table(Identifier(table_name))
67
+ assert table2 is not None
68
+ assert table2.name == table_name
69
+
70
+ # With an explicit namespace. DeltaCAT used the default namespace at creation time since none was provided
71
+ table3 = daft_catalog.get_table(Identifier("default", table_name))
72
+ assert table3 is not None
73
+ assert table3.name == table_name
74
+
75
+ # Test non-existent table raises an appropriate error
76
+ with pytest.raises(ValueError, match="Table nonexistent_table not found"):
77
+ daft_catalog.get_table(Identifier("nonexistent_table"))
78
+
79
+
80
+ class TestIcebergCatalogIntegration:
81
+ @classmethod
82
+ def setup_method(cls):
83
+ cls.tmpdir = tempfile.mkdtemp()
84
+
85
+ @classmethod
86
+ def teardown_method(cls):
87
+ shutil.rmtree(cls.tmpdir)
88
+
89
+ def test_iceberg_catalog_integration(self):
90
+ # Create a unique warehouse path for this test
91
+ warehouse_path = self.tmpdir
92
+
93
+ # Configure an Iceberg catalog with the warehouse path
94
+ config = IcebergCatalogConfig(
95
+ type=CatalogType.SQL,
96
+ properties={
97
+ "warehouse": warehouse_path,
98
+ "uri": f"sqlite:////{warehouse_path}/sql-catalog.db",
99
+ },
100
+ )
101
+ dc_catalog = IcebergCatalog.from_config(config)
102
+
103
+ # Convert the DeltaCAT catalog to a Daft catalog
104
+ catalog_name = f"deltacat_iceberg_{uuid.uuid4().hex[:8]}"
105
+ daft_catalog = DaftCatalog(dc_catalog, catalog_name)
106
+ daft.attach_catalog(daft_catalog, catalog_name)
107
+
108
+ # Create a sample DataFrame
109
+ df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
110
+
111
+ # Create a table with the Daft catalog
112
+ table_name = "example_table"
113
+ namespace = "example_namespace"
114
+ daft_catalog.create_table(Identifier(namespace, table_name), df)
115
+
116
+ # Query that Iceberg table exists using PyIceberg
117
+ iceberg_catalog = dc_catalog.inner
118
+
119
+ # Verify the table exists in the Iceberg catalog
120
+ tables = iceberg_catalog.list_tables(namespace)
121
+
122
+ assert any(
123
+ t[0] == namespace and t[1] == table_name for t in tables
124
+ ), f"Table {table_name} not found in Iceberg catalog"
125
+
126
+ # Load the table from Iceberg catalog and verify its properties
127
+ iceberg_table = iceberg_catalog.load_table(f"{namespace}.{table_name}")
128
+
129
+ # Check that the schema matches our DataFrame
130
+ schema = iceberg_table.schema()
131
+ assert (
132
+ schema.find_field("id") is not None
133
+ ), "Field 'id' not fcound in table schema"
134
+ assert (
135
+ schema.find_field("value") is not None
136
+ ), "Field 'value' not found in table schema"
File without changes
@@ -3,9 +3,9 @@ import io
3
3
  import pytest
4
4
  from faker import Faker
5
5
 
6
- from deltacat.storage.rivulet.schema.datatype import Datatype
7
- from deltacat.storage.rivulet.mvp.Table import MvpTable
8
- from deltacat.storage.rivulet.schema.schema import Schema
6
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
7
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
8
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
9
9
  import random
10
10
  import string
11
11
  from PIL import Image
@@ -2,8 +2,9 @@ import pytest
2
2
 
3
3
  import pyarrow as pa
4
4
  import pyarrow.parquet as pq
5
- from deltacat import Datatype, Dataset
6
- from deltacat.storage.rivulet import Schema, Field
5
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
6
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
7
+ from deltacat.experimental.storage.rivulet import Schema, Field
7
8
  from deltacat.utils.metafile_locator import _find_partition_path
8
9
 
9
10
 
@@ -0,0 +1,80 @@
1
+ import pytest
2
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
3
+ QueryExpression,
4
+ )
5
+ from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
6
+
7
+
8
+ @pytest.fixture
9
+ def sample_range_shard():
10
+ return RangeShard(min_key=5, max_key=15)
11
+
12
+
13
+ @pytest.fixture
14
+ def sample_string_shard():
15
+ return RangeShard(min_key="apple", max_key="zebra")
16
+
17
+
18
+ def test_with_key():
19
+ query = QueryExpression[int]()
20
+ query.with_key(5)
21
+ assert query.min_key == 5
22
+ assert query.max_key == 5
23
+ with pytest.raises(ValueError):
24
+ query.with_key(10)
25
+
26
+
27
+ def test_with_range():
28
+ query = QueryExpression[int]()
29
+ query.with_range(10, 5)
30
+ assert query.min_key == 5
31
+ assert query.max_key == 10
32
+ with pytest.raises(ValueError):
33
+ query.with_range(20, 25)
34
+
35
+
36
+ def test_matches_query():
37
+ query = QueryExpression[int]()
38
+ assert query.matches_query(5)
39
+ assert query.matches_query(-999)
40
+ query.with_range(10, 20)
41
+ assert query.matches_query(15)
42
+ assert not query.matches_query(25)
43
+ assert not query.matches_query(5)
44
+
45
+
46
+ def test_below_query_range():
47
+ query = QueryExpression[int]()
48
+ assert not query.below_query_range(5)
49
+ query.with_range(10, 20)
50
+ assert query.below_query_range(5)
51
+ assert not query.below_query_range(15)
52
+ assert not query.below_query_range(25)
53
+
54
+
55
+ def test_with_shard_existing_query(sample_range_shard):
56
+ query = QueryExpression[int]().with_range(10, 20)
57
+ new_query = QueryExpression.with_shard(query, sample_range_shard)
58
+ assert new_query.min_key == 5
59
+ assert new_query.max_key == 20
60
+
61
+
62
+ def test_with_shard_none_shard():
63
+ query = QueryExpression[int]().with_range(10, 20)
64
+ result = QueryExpression.with_shard(query, None)
65
+ assert result.min_key == 10
66
+ assert result.max_key == 20
67
+
68
+
69
+ def test_with_shard_existing_query_string(sample_string_shard):
70
+ query = QueryExpression[str]().with_range("banana", "yellow")
71
+ new_query = QueryExpression.with_shard(query, sample_string_shard)
72
+ assert new_query.min_key == "apple"
73
+ assert new_query.max_key == "zebra"
74
+
75
+
76
+ def test_query_expression_string_matches():
77
+ query = QueryExpression[str]().with_range("apple", "cat")
78
+ assert query.matches_query("apple")
79
+ assert query.matches_query("banana")
80
+ assert not query.matches_query("dog")
@@ -0,0 +1,119 @@
1
+ import pytest
2
+ from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
3
+ import pyarrow as pa
4
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+
7
+
8
+ @pytest.fixture
9
+ def combined_schema():
10
+ return Schema(
11
+ fields=[
12
+ Field("id", Datatype.int64(), is_merge_key=True),
13
+ Field("name", Datatype.string()),
14
+ Field("age", Datatype.int32()),
15
+ Field("height", Datatype.int64()),
16
+ Field("gender", Datatype.string()),
17
+ ]
18
+ )
19
+
20
+
21
+ @pytest.fixture
22
+ def initial_schema():
23
+ return Schema(
24
+ fields=[
25
+ Field("id", Datatype.int32(), is_merge_key=True),
26
+ Field("name", Datatype.string()),
27
+ Field("age", Datatype.int32()),
28
+ ]
29
+ )
30
+
31
+
32
+ @pytest.fixture
33
+ def extended_schema():
34
+ return Schema(
35
+ fields=[
36
+ Field("id", Datatype.int64(), is_merge_key=True),
37
+ Field("height", Datatype.int64()),
38
+ Field("gender", Datatype.string()),
39
+ ]
40
+ )
41
+
42
+
43
+ @pytest.fixture
44
+ def sample_data():
45
+ return {
46
+ "id": [1, 2, 3],
47
+ "name": ["Alice", "Bob", "Charlie"],
48
+ "age": [25, 30, 35],
49
+ }
50
+
51
+
52
+ @pytest.fixture
53
+ def extended_data():
54
+ return {
55
+ "id": [1, 2, 3],
56
+ "height": [150, 160, 159],
57
+ "gender": ["male", "female", "male"],
58
+ }
59
+
60
+
61
+ @pytest.fixture
62
+ def combined_data(sample_data, extended_data):
63
+ data = sample_data.copy()
64
+ data.update(extended_data)
65
+ return data
66
+
67
+
68
+ @pytest.fixture
69
+ def parquet_data(tmp_path, sample_data):
70
+ parquet_path = tmp_path / "test.parquet"
71
+ table = pa.Table.from_pydict(sample_data)
72
+ pa.parquet.write_table(table, parquet_path)
73
+ return parquet_path
74
+
75
+
76
+ @pytest.fixture
77
+ def sample_dataset(parquet_data, tmp_path):
78
+ return Dataset.from_parquet(
79
+ name="test_dataset",
80
+ file_uri=str(parquet_data),
81
+ metadata_uri=str(tmp_path),
82
+ merge_keys="id",
83
+ )
84
+
85
+
86
+ def test_end_to_end_scan_with_multiple_schemas(
87
+ sample_dataset,
88
+ initial_schema,
89
+ extended_schema,
90
+ combined_schema,
91
+ sample_data,
92
+ extended_data,
93
+ combined_data,
94
+ ):
95
+ # Verify initial scan.
96
+ verify_pyarrow_scan(sample_dataset.scan().to_arrow(), initial_schema, sample_data)
97
+
98
+ # Add a new schema to the dataset
99
+ sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")
100
+ new_data = [
101
+ {"id": 1, "height": 150, "gender": "male"},
102
+ {"id": 2, "height": 160, "gender": "female"},
103
+ {"id": 3, "height": 159, "gender": "male"},
104
+ ]
105
+ writer = sample_dataset.writer(schema_name="schema2")
106
+ writer.write(new_data)
107
+ writer.flush()
108
+
109
+ # Verify scan with the extended schema retrieves only extended data
110
+ verify_pyarrow_scan(
111
+ sample_dataset.scan(schema_name="schema2").to_arrow(),
112
+ extended_schema,
113
+ extended_data,
114
+ )
115
+
116
+ # Verify a combined scan retrieves data matching the combined schema
117
+ verify_pyarrow_scan(
118
+ sample_dataset.scan().to_arrow(), combined_schema, combined_data
119
+ )
@@ -0,0 +1,71 @@
1
+ import pytest
2
+ import os
3
+
4
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
5
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
6
+ DatasetMetastore,
7
+ )
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
10
+ from deltacat.experimental.storage.rivulet import Schema
11
+
12
+
13
+ @pytest.fixture
14
+ def sample_schema():
15
+ return Schema(
16
+ {("id", Datatype.int32()), ("name", Datatype.string())},
17
+ "id",
18
+ )
19
+
20
+
21
+ @pytest.fixture
22
+ def sample_pydict():
23
+ return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
24
+
25
+
26
+ def test_dataset_metastore_e2e(sample_schema, tmp_path):
27
+ # Setup
28
+ dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
29
+ file_provider = dataset._file_provider
30
+ manifest_io = DeltacatManifestIO(file_provider.uri, dataset._locator)
31
+
32
+ # Create multiple manifests
33
+ manifests_data = [
34
+ {"sst_files": ["sst1.sst", "sst2.sst"], "level": 1},
35
+ {"sst_files": ["sst3.sst", "sst4.sst"], "level": 2},
36
+ ]
37
+
38
+ # Create SST files and manifests
39
+ manifest_paths = []
40
+ for manifest_data in manifests_data:
41
+ sst_files = manifest_data["sst_files"]
42
+ for sst in sst_files:
43
+ with open(os.path.join(file_provider.uri, sst), "w") as f:
44
+ f.write("test data")
45
+
46
+ manifest_path = manifest_io.write(
47
+ sst_files, sample_schema, manifest_data["level"]
48
+ )
49
+ manifest_paths.append(manifest_path)
50
+
51
+ # Initialize DatasetMetastore
52
+ metastore = DatasetMetastore(
53
+ file_provider.uri,
54
+ file_provider,
55
+ file_provider._locator,
56
+ manifest_io=manifest_io,
57
+ )
58
+
59
+ # Test manifest generation
60
+ manifest_accessors = list(metastore.generate_manifests())
61
+ assert len(manifest_accessors) == len(manifests_data)
62
+
63
+ # Verify each manifest accessor
64
+ for accessor in manifest_accessors:
65
+ assert accessor.context.schema == sample_schema
66
+ manifests_data_index = 0 if accessor.context.level == 1 else 1
67
+ assert accessor.context.level == manifests_data[manifests_data_index]["level"]
68
+ assert (
69
+ accessor.manifest.sst_files
70
+ == manifests_data[manifests_data_index]["sst_files"]
71
+ )
@@ -1,6 +1,6 @@
1
1
  import pytest
2
2
  import pyarrow as pa
3
- from deltacat.storage.rivulet import Schema, Field, Datatype
3
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
4
4
 
5
5
 
6
6
  def test_field_initialization():