deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. deltacat/__init__.py +19 -15
  2. deltacat/benchmarking/benchmark_engine.py +4 -2
  3. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  4. deltacat/catalog/__init__.py +62 -5
  5. deltacat/catalog/main/impl.py +18 -8
  6. deltacat/catalog/model/catalog.py +111 -73
  7. deltacat/catalog/model/properties.py +25 -22
  8. deltacat/compute/jobs/client.py +7 -5
  9. deltacat/constants.py +1 -2
  10. deltacat/env.py +10 -0
  11. deltacat/examples/basic_logging.py +1 -3
  12. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  13. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  14. deltacat/examples/indexer/indexer.py +2 -2
  15. deltacat/examples/indexer/job_runner.py +1 -2
  16. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  17. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  18. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  19. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  20. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  21. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  22. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  23. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  24. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  25. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  26. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  27. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  28. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  29. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  30. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  31. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  32. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  33. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  34. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  35. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  36. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  37. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  38. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  39. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  40. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  41. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  42. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  43. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  44. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  45. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  46. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  47. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  49. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  50. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  51. deltacat/io/reader/deltacat_read_api.py +1 -1
  52. deltacat/storage/model/shard.py +6 -2
  53. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  54. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  55. deltacat/tests/catalog/model/__init__.py +0 -0
  56. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  57. deltacat/tests/catalog/test_catalogs.py +52 -98
  58. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  59. deltacat/tests/daft/__init__.py +0 -0
  60. deltacat/tests/daft/test_model.py +97 -0
  61. deltacat/tests/experimental/__init__.py +0 -0
  62. deltacat/tests/experimental/catalog/__init__.py +0 -0
  63. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  64. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  65. deltacat/tests/experimental/daft/__init__.py +0 -0
  66. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  67. deltacat/tests/experimental/storage/__init__.py +0 -0
  68. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  69. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  70. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  71. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  72. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  73. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  74. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  75. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  76. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  77. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  78. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  79. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  80. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  81. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  82. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  83. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  84. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  85. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  86. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  87. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  88. deltacat/tests/storage/model/test_shard.py +3 -1
  89. deltacat/types/media.py +3 -3
  90. deltacat/utils/daft.py +530 -4
  91. deltacat/utils/export.py +3 -1
  92. deltacat/utils/url.py +1 -1
  93. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
  94. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
  95. deltacat/catalog/iceberg/__init__.py +0 -4
  96. deltacat/daft/daft_scan.py +0 -115
  97. deltacat/daft/model.py +0 -258
  98. deltacat/daft/translator.py +0 -126
  99. deltacat/examples/common/fixtures.py +0 -15
  100. deltacat/storage/rivulet/__init__.py +0 -11
  101. deltacat/storage/rivulet/feather/__init__.py +0 -5
  102. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  103. /deltacat/{daft → examples/experimental}/__init__.py +0 -0
  104. /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
  105. /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
  106. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  107. /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
  108. /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
  109. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  110. /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  111. /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
  112. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  113. /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  114. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  115. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  116. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  117. /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  118. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  119. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  120. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  121. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
  122. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
  123. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  124. /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
  125. /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
  126. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  127. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  128. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,9 @@ import pytest
2
2
 
3
3
  import pyarrow as pa
4
4
  import pyarrow.parquet as pq
5
- from deltacat.storage.rivulet.schema.datatype import Datatype
6
- from deltacat.storage.rivulet.dataset import Dataset
7
- from deltacat.storage.rivulet import Schema, Field
5
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
6
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
7
+ from deltacat.experimental.storage.rivulet import Schema, Field
8
8
  from deltacat.utils.metafile_locator import _find_partition_path
9
9
 
10
10
 
@@ -0,0 +1,80 @@
1
+ import pytest
2
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
3
+ QueryExpression,
4
+ )
5
+ from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
6
+
7
+
8
+ @pytest.fixture
9
+ def sample_range_shard():
10
+ return RangeShard(min_key=5, max_key=15)
11
+
12
+
13
+ @pytest.fixture
14
+ def sample_string_shard():
15
+ return RangeShard(min_key="apple", max_key="zebra")
16
+
17
+
18
+ def test_with_key():
19
+ query = QueryExpression[int]()
20
+ query.with_key(5)
21
+ assert query.min_key == 5
22
+ assert query.max_key == 5
23
+ with pytest.raises(ValueError):
24
+ query.with_key(10)
25
+
26
+
27
+ def test_with_range():
28
+ query = QueryExpression[int]()
29
+ query.with_range(10, 5)
30
+ assert query.min_key == 5
31
+ assert query.max_key == 10
32
+ with pytest.raises(ValueError):
33
+ query.with_range(20, 25)
34
+
35
+
36
+ def test_matches_query():
37
+ query = QueryExpression[int]()
38
+ assert query.matches_query(5)
39
+ assert query.matches_query(-999)
40
+ query.with_range(10, 20)
41
+ assert query.matches_query(15)
42
+ assert not query.matches_query(25)
43
+ assert not query.matches_query(5)
44
+
45
+
46
+ def test_below_query_range():
47
+ query = QueryExpression[int]()
48
+ assert not query.below_query_range(5)
49
+ query.with_range(10, 20)
50
+ assert query.below_query_range(5)
51
+ assert not query.below_query_range(15)
52
+ assert not query.below_query_range(25)
53
+
54
+
55
+ def test_with_shard_existing_query(sample_range_shard):
56
+ query = QueryExpression[int]().with_range(10, 20)
57
+ new_query = QueryExpression.with_shard(query, sample_range_shard)
58
+ assert new_query.min_key == 5
59
+ assert new_query.max_key == 20
60
+
61
+
62
+ def test_with_shard_none_shard():
63
+ query = QueryExpression[int]().with_range(10, 20)
64
+ result = QueryExpression.with_shard(query, None)
65
+ assert result.min_key == 10
66
+ assert result.max_key == 20
67
+
68
+
69
+ def test_with_shard_existing_query_string(sample_string_shard):
70
+ query = QueryExpression[str]().with_range("banana", "yellow")
71
+ new_query = QueryExpression.with_shard(query, sample_string_shard)
72
+ assert new_query.min_key == "apple"
73
+ assert new_query.max_key == "zebra"
74
+
75
+
76
+ def test_query_expression_string_matches():
77
+ query = QueryExpression[str]().with_range("apple", "cat")
78
+ assert query.matches_query("apple")
79
+ assert query.matches_query("banana")
80
+ assert not query.matches_query("dog")
@@ -0,0 +1,119 @@
1
+ import pytest
2
+ from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
3
+ import pyarrow as pa
4
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+
7
+
8
+ @pytest.fixture
9
+ def combined_schema():
10
+ return Schema(
11
+ fields=[
12
+ Field("id", Datatype.int64(), is_merge_key=True),
13
+ Field("name", Datatype.string()),
14
+ Field("age", Datatype.int32()),
15
+ Field("height", Datatype.int64()),
16
+ Field("gender", Datatype.string()),
17
+ ]
18
+ )
19
+
20
+
21
+ @pytest.fixture
22
+ def initial_schema():
23
+ return Schema(
24
+ fields=[
25
+ Field("id", Datatype.int32(), is_merge_key=True),
26
+ Field("name", Datatype.string()),
27
+ Field("age", Datatype.int32()),
28
+ ]
29
+ )
30
+
31
+
32
+ @pytest.fixture
33
+ def extended_schema():
34
+ return Schema(
35
+ fields=[
36
+ Field("id", Datatype.int64(), is_merge_key=True),
37
+ Field("height", Datatype.int64()),
38
+ Field("gender", Datatype.string()),
39
+ ]
40
+ )
41
+
42
+
43
+ @pytest.fixture
44
+ def sample_data():
45
+ return {
46
+ "id": [1, 2, 3],
47
+ "name": ["Alice", "Bob", "Charlie"],
48
+ "age": [25, 30, 35],
49
+ }
50
+
51
+
52
+ @pytest.fixture
53
+ def extended_data():
54
+ return {
55
+ "id": [1, 2, 3],
56
+ "height": [150, 160, 159],
57
+ "gender": ["male", "female", "male"],
58
+ }
59
+
60
+
61
+ @pytest.fixture
62
+ def combined_data(sample_data, extended_data):
63
+ data = sample_data.copy()
64
+ data.update(extended_data)
65
+ return data
66
+
67
+
68
+ @pytest.fixture
69
+ def parquet_data(tmp_path, sample_data):
70
+ parquet_path = tmp_path / "test.parquet"
71
+ table = pa.Table.from_pydict(sample_data)
72
+ pa.parquet.write_table(table, parquet_path)
73
+ return parquet_path
74
+
75
+
76
+ @pytest.fixture
77
+ def sample_dataset(parquet_data, tmp_path):
78
+ return Dataset.from_parquet(
79
+ name="test_dataset",
80
+ file_uri=str(parquet_data),
81
+ metadata_uri=str(tmp_path),
82
+ merge_keys="id",
83
+ )
84
+
85
+
86
+ def test_end_to_end_scan_with_multiple_schemas(
87
+ sample_dataset,
88
+ initial_schema,
89
+ extended_schema,
90
+ combined_schema,
91
+ sample_data,
92
+ extended_data,
93
+ combined_data,
94
+ ):
95
+ # Verify initial scan.
96
+ verify_pyarrow_scan(sample_dataset.scan().to_arrow(), initial_schema, sample_data)
97
+
98
+ # Add a new schema to the dataset
99
+ sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")
100
+ new_data = [
101
+ {"id": 1, "height": 150, "gender": "male"},
102
+ {"id": 2, "height": 160, "gender": "female"},
103
+ {"id": 3, "height": 159, "gender": "male"},
104
+ ]
105
+ writer = sample_dataset.writer(schema_name="schema2")
106
+ writer.write(new_data)
107
+ writer.flush()
108
+
109
+ # Verify scan with the extended schema retrieves only extended data
110
+ verify_pyarrow_scan(
111
+ sample_dataset.scan(schema_name="schema2").to_arrow(),
112
+ extended_schema,
113
+ extended_data,
114
+ )
115
+
116
+ # Verify a combined scan retrieves data matching the combined schema
117
+ verify_pyarrow_scan(
118
+ sample_dataset.scan().to_arrow(), combined_schema, combined_data
119
+ )
@@ -0,0 +1,71 @@
1
+ import pytest
2
+ import os
3
+
4
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
5
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
6
+ DatasetMetastore,
7
+ )
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
10
+ from deltacat.experimental.storage.rivulet import Schema
11
+
12
+
13
+ @pytest.fixture
14
+ def sample_schema():
15
+ return Schema(
16
+ {("id", Datatype.int32()), ("name", Datatype.string())},
17
+ "id",
18
+ )
19
+
20
+
21
+ @pytest.fixture
22
+ def sample_pydict():
23
+ return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
24
+
25
+
26
+ def test_dataset_metastore_e2e(sample_schema, tmp_path):
27
+ # Setup
28
+ dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
29
+ file_provider = dataset._file_provider
30
+ manifest_io = DeltacatManifestIO(file_provider.uri, dataset._locator)
31
+
32
+ # Create multiple manifests
33
+ manifests_data = [
34
+ {"sst_files": ["sst1.sst", "sst2.sst"], "level": 1},
35
+ {"sst_files": ["sst3.sst", "sst4.sst"], "level": 2},
36
+ ]
37
+
38
+ # Create SST files and manifests
39
+ manifest_paths = []
40
+ for manifest_data in manifests_data:
41
+ sst_files = manifest_data["sst_files"]
42
+ for sst in sst_files:
43
+ with open(os.path.join(file_provider.uri, sst), "w") as f:
44
+ f.write("test data")
45
+
46
+ manifest_path = manifest_io.write(
47
+ sst_files, sample_schema, manifest_data["level"]
48
+ )
49
+ manifest_paths.append(manifest_path)
50
+
51
+ # Initialize DatasetMetastore
52
+ metastore = DatasetMetastore(
53
+ file_provider.uri,
54
+ file_provider,
55
+ file_provider._locator,
56
+ manifest_io=manifest_io,
57
+ )
58
+
59
+ # Test manifest generation
60
+ manifest_accessors = list(metastore.generate_manifests())
61
+ assert len(manifest_accessors) == len(manifests_data)
62
+
63
+ # Verify each manifest accessor
64
+ for accessor in manifest_accessors:
65
+ assert accessor.context.schema == sample_schema
66
+ manifests_data_index = 0 if accessor.context.level == 1 else 1
67
+ assert accessor.context.level == manifests_data[manifests_data_index]["level"]
68
+ assert (
69
+ accessor.manifest.sst_files
70
+ == manifests_data[manifests_data_index]["sst_files"]
71
+ )
@@ -1,6 +1,6 @@
1
1
  import pytest
2
2
  import pyarrow as pa
3
- from deltacat.storage.rivulet import Schema, Field, Datatype
3
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
4
4
 
5
5
 
6
6
  def test_field_initialization():
@@ -0,0 +1,162 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+ import pyarrow.parquet as pq
4
+
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+ from deltacat.experimental.storage.rivulet.shard.range_shard import (
7
+ RangeShard,
8
+ RangeShardingStrategy,
9
+ )
10
+
11
+
12
+ @pytest.fixture
13
+ def sample_numeric_dataset(tmp_path):
14
+ """
15
+ Creates a small Parquet file with integer-based min/max keys and
16
+ initializes a Dataset from it. Merge key is 'id' with values [1,2,3].
17
+ So min_key=1, max_key=3.
18
+ """
19
+ data = {
20
+ "id": [1, 2, 3],
21
+ "name": ["Alice", "Bob", "Charlie"],
22
+ "age": [25, 30, 35],
23
+ }
24
+ table = pa.Table.from_pydict(data)
25
+ parquet_file = tmp_path / "numeric_data.parquet"
26
+ pq.write_table(table, parquet_file)
27
+
28
+ ds = Dataset.from_parquet(
29
+ name="numeric_dataset",
30
+ file_uri=str(parquet_file),
31
+ metadata_uri=tmp_path,
32
+ merge_keys="id",
33
+ )
34
+ return ds
35
+
36
+
37
+ @pytest.fixture
38
+ def sample_string_dataset(tmp_path):
39
+ """
40
+ Creates a small Parquet file with a string-based merge key ('name')
41
+ and initializes a Dataset from it. Merge key has values
42
+ ['Alice', 'Bob', 'Charlie'] => min_key='Alice', max_key='Charlie'.
43
+ """
44
+ data = {
45
+ "name": ["Alice", "Charlie", "Bob"], # random order
46
+ "value": [100, 200, 150],
47
+ }
48
+ table = pa.Table.from_pydict(data)
49
+ parquet_file = tmp_path / "string_data.parquet"
50
+ pq.write_table(table, parquet_file)
51
+
52
+ ds = Dataset.from_parquet(
53
+ name="string_dataset",
54
+ file_uri=str(parquet_file),
55
+ metadata_uri=tmp_path,
56
+ merge_keys="name",
57
+ )
58
+ return ds
59
+
60
+
61
+ def test_shards(sample_numeric_dataset, sample_string_dataset):
62
+ shards = sample_numeric_dataset.shards(num_shards=2)
63
+
64
+ num_shards = len(list(shards))
65
+ assert num_shards == 2
66
+
67
+ shard = shards[0]
68
+ records = list(sample_numeric_dataset.scan(shard=shard).to_pydict())
69
+ num_records = len(records)
70
+ assert num_records == 2
71
+
72
+ assert records[0]["id"] == 1
73
+ assert records[0]["name"] == "Alice"
74
+
75
+ assert records[1]["id"] == 2
76
+ assert records[1]["name"] == "Bob"
77
+
78
+
79
+ def test_range_shard_repr():
80
+ shard = RangeShard(min_key=5, max_key=15)
81
+ assert repr(shard) == "Shard(type=range, min_key=5, max_key=15)"
82
+
83
+
84
+ def test_range_shard_split_integers():
85
+ shards = RangeShard.split(global_min=1, global_max=10, num_shards=2)
86
+ assert len(shards) == 2
87
+
88
+ assert shards[0].min_key == 1
89
+ assert shards[0].max_key == 5
90
+ assert shards[1].min_key == 6
91
+ assert shards[1].max_key == 10
92
+
93
+
94
+ def test_range_shard_split_integers_single_shard():
95
+ shards = RangeShard.split(global_min=1, global_max=10, num_shards=1)
96
+ assert len(shards) == 1
97
+ assert shards[0].min_key == 1
98
+ assert shards[0].max_key == 10
99
+
100
+
101
+ def test_range_shard_split_integers_same_value():
102
+ shards = RangeShard.split(global_min=5, global_max=5, num_shards=3)
103
+ assert len(shards) == 1
104
+
105
+
106
+ def test_range_sharding_strategy_integers(sample_numeric_dataset):
107
+ strategy = RangeShardingStrategy()
108
+ shards = list(
109
+ strategy.shards(num_shards=2, metastore=sample_numeric_dataset._metastore)
110
+ )
111
+
112
+ assert len(shards) == 2, "Expected 2 shards for dataset with keys [1,2,3]"
113
+
114
+ shard1, shard2 = shards
115
+ assert isinstance(shard1, RangeShard)
116
+ assert isinstance(shard2, RangeShard)
117
+ assert shard1.min_key == 1
118
+ assert shard1.max_key == 2
119
+ assert shard2.min_key == 3
120
+ assert shard2.max_key == 3
121
+
122
+
123
+ def test_range_sharding_strategy_integers_single_shard(sample_numeric_dataset):
124
+ strategy = RangeShardingStrategy()
125
+ shards = list(
126
+ strategy.shards(num_shards=1, metastore=sample_numeric_dataset._metastore)
127
+ )
128
+ assert len(shards) == 1
129
+ shard = shards[0]
130
+ assert shard.min_key == 1
131
+ assert shard.max_key == 3
132
+
133
+
134
+ def test_range_sharding_strategy_strings(sample_string_dataset):
135
+ strategy = RangeShardingStrategy()
136
+ shards = list(
137
+ strategy.shards(num_shards=2, metastore=sample_string_dataset._metastore)
138
+ )
139
+
140
+ assert len(shards) == 2, "Expected 2 shards for string-based dataset"
141
+ shard1, shard2 = shards
142
+ assert isinstance(shard1, RangeShard)
143
+ assert isinstance(shard2, RangeShard)
144
+
145
+ assert shard1.min_key == "Alice"
146
+ assert shard1.max_key < "Charlie"
147
+
148
+ assert shard2.min_key == shard1.max_key
149
+ assert shard2.max_key == "Charlie"
150
+
151
+
152
+ def test_range_sharding_strategy_strings_single_shard(sample_string_dataset):
153
+ strategy = RangeShardingStrategy()
154
+ shards = list(
155
+ strategy.shards(num_shards=1, metastore=sample_string_dataset._metastore)
156
+ )
157
+
158
+ assert len(shards) == 1
159
+
160
+ shard = shards[0]
161
+ assert shard.min_key == "Alice"
162
+ assert shard.max_key == "Charlie"
@@ -3,9 +3,11 @@ from deltacat.utils.metafile_locator import _find_partition_path
3
3
  import pytest
4
4
 
5
5
  import pyarrow as pa
6
- from deltacat.storage.rivulet import Schema, Field, Datatype
7
- from deltacat.storage.rivulet.dataset import Dataset
8
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
6
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
7
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
8
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
9
+ QueryExpression,
10
+ )
9
11
 
10
12
 
11
13
  @pytest.fixture
@@ -2,11 +2,11 @@ import os
2
2
 
3
3
  import pytest
4
4
 
5
- from deltacat.storage.rivulet.dataset import Dataset
6
- from deltacat.storage.rivulet.fs.file_store import FileStore
7
- from deltacat.storage.rivulet.schema.datatype import Datatype
8
- from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
9
- from deltacat.storage.rivulet import Schema, Field
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
7
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
8
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
9
+ from deltacat.experimental.storage.rivulet import Schema, Field
10
10
  import pyarrow as pa
11
11
  import pyarrow.parquet
12
12
 
@@ -2,16 +2,16 @@ from typing import List, FrozenSet, Dict
2
2
 
3
3
  import pytest
4
4
 
5
- from deltacat.storage.rivulet.metastore.delta import DeltaContext
6
- from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
7
- from deltacat.storage.rivulet.metastore.sst_interval_tree import (
5
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
6
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
7
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
8
8
  BlockIntervalTree,
9
9
  BlockGroup,
10
10
  OrderedBlockGroups,
11
11
  Block,
12
12
  )
13
- from deltacat.storage.rivulet.schema.datatype import Datatype
14
- from deltacat.storage.rivulet import Schema
13
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
14
+ from deltacat.experimental.storage.rivulet import Schema
15
15
 
16
16
 
17
17
  @pytest.fixture
@@ -3,12 +3,14 @@ import os
3
3
 
4
4
  from pyarrow import RecordBatch, Table
5
5
 
6
- from deltacat.storage.rivulet.dataset import Dataset
7
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
8
- from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
9
-
10
- from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
11
- from deltacat.storage.rivulet import Schema
6
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
7
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
8
+ QueryExpression,
9
+ )
10
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
11
+
12
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
13
+ from deltacat.experimental.storage.rivulet import Schema
12
14
  from typing import Dict, List, Generator, Set
13
15
 
14
16
  FIXTURE_ROW_COUNT = 10000
@@ -7,23 +7,25 @@ import msgpack
7
7
  import pytest
8
8
  from pyarrow import RecordBatch
9
9
 
10
- from deltacat.storage.rivulet.dataset import Dataset
11
- from deltacat.storage.rivulet.fs.file_store import FileStore
12
- from deltacat.storage.rivulet.metastore.delta import (
10
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
11
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
12
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
13
13
  ManifestIO,
14
14
  TreeLevel,
15
15
  DeltacatManifestIO,
16
16
  )
17
17
 
18
- from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
19
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
20
- from deltacat.storage.rivulet import Schema
21
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
18
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
19
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
20
+ QueryExpression,
21
+ )
22
+ from deltacat.experimental.storage.rivulet import Schema
23
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
22
24
  MemtableDatasetWriter,
23
25
  )
24
26
 
25
- from deltacat.tests.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
26
- from deltacat.tests.storage.rivulet.test_utils import (
27
+ from deltacat.tests.experimental.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
28
+ from deltacat.tests.experimental.storage.rivulet.test_utils import (
27
29
  write_mvp_table,
28
30
  compare_mvp_table_to_scan_results,
29
31
  mvp_table_to_record_batches,
@@ -2,8 +2,8 @@ import pytest
2
2
  import shutil
3
3
  import tempfile
4
4
 
5
- from deltacat.storage.rivulet.fs.file_store import FileStore
6
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
5
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
7
7
  MemtableDatasetWriter,
8
8
  )
9
9
  from ..test_utils import (
@@ -1,12 +1,12 @@
1
1
  import pytest
2
2
 
3
- from deltacat.storage.rivulet.dataset import Dataset
4
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
5
- from deltacat.storage.rivulet.fs.file_store import FileStore
6
- from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
7
- from deltacat.storage.rivulet import Schema
8
- from deltacat.storage.rivulet.schema.datatype import Datatype
9
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
3
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
4
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
5
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
7
+ from deltacat.experimental.storage.rivulet import Schema
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
10
10
  MemtableDatasetWriter,
11
11
  )
12
12
 
@@ -7,7 +7,9 @@ def test_sharding_strategy_from_string_range():
7
7
  """
8
8
  Tests that from_string('range') returns an instance of RangeShardingStrategy.
9
9
  """
10
- from deltacat.storage.rivulet.shard.range_shard import RangeShardingStrategy
10
+ from deltacat.experimental.storage.rivulet.shard.range_shard import (
11
+ RangeShardingStrategy,
12
+ )
11
13
 
12
14
  strategy = ShardingStrategy.from_string("range")
13
15
  assert isinstance(strategy, RangeShardingStrategy)
deltacat/types/media.py CHANGED
@@ -148,9 +148,9 @@ class DatastoreType(str, Enum):
148
148
  writer for that data store. Note that, although some overlap exists between
149
149
  enum values here and in :class:`deltacat.types.media.ContentType`, each
150
150
  enum serves a different purpose. The purpose of
151
- :class:`deltacat.types.media.ContentType` is to resolve the MIME type for
152
- specific types of files, and may be used together with multi-content-type
153
- datastore types to describe the specific file types read/written to that
151
+ :class:`deltacat.types.media.ContentType` is to resolve a file's MIME type,
152
+ and may be used together with datastores that support storing different
153
+ file types to describe the specific file type read/written from/to that
154
154
  datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
155
155
  """
156
156