deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,162 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+ import pyarrow.parquet as pq
4
+
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+ from deltacat.experimental.storage.rivulet.shard.range_shard import (
7
+ RangeShard,
8
+ RangeShardingStrategy,
9
+ )
10
+
11
+
12
+ @pytest.fixture
13
+ def sample_numeric_dataset(tmp_path):
14
+ """
15
+ Creates a small Parquet file with integer-based min/max keys and
16
+ initializes a Dataset from it. Merge key is 'id' with values [1,2,3].
17
+ So min_key=1, max_key=3.
18
+ """
19
+ data = {
20
+ "id": [1, 2, 3],
21
+ "name": ["Alice", "Bob", "Charlie"],
22
+ "age": [25, 30, 35],
23
+ }
24
+ table = pa.Table.from_pydict(data)
25
+ parquet_file = tmp_path / "numeric_data.parquet"
26
+ pq.write_table(table, parquet_file)
27
+
28
+ ds = Dataset.from_parquet(
29
+ name="numeric_dataset",
30
+ file_uri=str(parquet_file),
31
+ metadata_uri=tmp_path,
32
+ merge_keys="id",
33
+ )
34
+ return ds
35
+
36
+
37
+ @pytest.fixture
38
+ def sample_string_dataset(tmp_path):
39
+ """
40
+ Creates a small Parquet file with a string-based merge key ('name')
41
+ and initializes a Dataset from it. Merge key has values
42
+ ['Alice', 'Bob', 'Charlie'] => min_key='Alice', max_key='Charlie'.
43
+ """
44
+ data = {
45
+ "name": ["Alice", "Charlie", "Bob"], # random order
46
+ "value": [100, 200, 150],
47
+ }
48
+ table = pa.Table.from_pydict(data)
49
+ parquet_file = tmp_path / "string_data.parquet"
50
+ pq.write_table(table, parquet_file)
51
+
52
+ ds = Dataset.from_parquet(
53
+ name="string_dataset",
54
+ file_uri=str(parquet_file),
55
+ metadata_uri=tmp_path,
56
+ merge_keys="name",
57
+ )
58
+ return ds
59
+
60
+
61
+ def test_shards(sample_numeric_dataset, sample_string_dataset):
62
+ shards = sample_numeric_dataset.shards(num_shards=2)
63
+
64
+ num_shards = len(list(shards))
65
+ assert num_shards == 2
66
+
67
+ shard = shards[0]
68
+ records = list(sample_numeric_dataset.scan(shard=shard).to_pydict())
69
+ num_records = len(records)
70
+ assert num_records == 2
71
+
72
+ assert records[0]["id"] == 1
73
+ assert records[0]["name"] == "Alice"
74
+
75
+ assert records[1]["id"] == 2
76
+ assert records[1]["name"] == "Bob"
77
+
78
+
79
+ def test_range_shard_repr():
80
+ shard = RangeShard(min_key=5, max_key=15)
81
+ assert repr(shard) == "Shard(type=range, min_key=5, max_key=15)"
82
+
83
+
84
+ def test_range_shard_split_integers():
85
+ shards = RangeShard.split(global_min=1, global_max=10, num_shards=2)
86
+ assert len(shards) == 2
87
+
88
+ assert shards[0].min_key == 1
89
+ assert shards[0].max_key == 5
90
+ assert shards[1].min_key == 6
91
+ assert shards[1].max_key == 10
92
+
93
+
94
+ def test_range_shard_split_integers_single_shard():
95
+ shards = RangeShard.split(global_min=1, global_max=10, num_shards=1)
96
+ assert len(shards) == 1
97
+ assert shards[0].min_key == 1
98
+ assert shards[0].max_key == 10
99
+
100
+
101
+ def test_range_shard_split_integers_same_value():
102
+ shards = RangeShard.split(global_min=5, global_max=5, num_shards=3)
103
+ assert len(shards) == 1
104
+
105
+
106
+ def test_range_sharding_strategy_integers(sample_numeric_dataset):
107
+ strategy = RangeShardingStrategy()
108
+ shards = list(
109
+ strategy.shards(num_shards=2, metastore=sample_numeric_dataset._metastore)
110
+ )
111
+
112
+ assert len(shards) == 2, "Expected 2 shards for dataset with keys [1,2,3]"
113
+
114
+ shard1, shard2 = shards
115
+ assert isinstance(shard1, RangeShard)
116
+ assert isinstance(shard2, RangeShard)
117
+ assert shard1.min_key == 1
118
+ assert shard1.max_key == 2
119
+ assert shard2.min_key == 3
120
+ assert shard2.max_key == 3
121
+
122
+
123
+ def test_range_sharding_strategy_integers_single_shard(sample_numeric_dataset):
124
+ strategy = RangeShardingStrategy()
125
+ shards = list(
126
+ strategy.shards(num_shards=1, metastore=sample_numeric_dataset._metastore)
127
+ )
128
+ assert len(shards) == 1
129
+ shard = shards[0]
130
+ assert shard.min_key == 1
131
+ assert shard.max_key == 3
132
+
133
+
134
+ def test_range_sharding_strategy_strings(sample_string_dataset):
135
+ strategy = RangeShardingStrategy()
136
+ shards = list(
137
+ strategy.shards(num_shards=2, metastore=sample_string_dataset._metastore)
138
+ )
139
+
140
+ assert len(shards) == 2, "Expected 2 shards for string-based dataset"
141
+ shard1, shard2 = shards
142
+ assert isinstance(shard1, RangeShard)
143
+ assert isinstance(shard2, RangeShard)
144
+
145
+ assert shard1.min_key == "Alice"
146
+ assert shard1.max_key < "Charlie"
147
+
148
+ assert shard2.min_key == shard1.max_key
149
+ assert shard2.max_key == "Charlie"
150
+
151
+
152
+ def test_range_sharding_strategy_strings_single_shard(sample_string_dataset):
153
+ strategy = RangeShardingStrategy()
154
+ shards = list(
155
+ strategy.shards(num_shards=1, metastore=sample_string_dataset._metastore)
156
+ )
157
+
158
+ assert len(shards) == 1
159
+
160
+ shard = shards[0]
161
+ assert shard.min_key == "Alice"
162
+ assert shard.max_key == "Charlie"
@@ -3,9 +3,11 @@ from deltacat.utils.metafile_locator import _find_partition_path
3
3
  import pytest
4
4
 
5
5
  import pyarrow as pa
6
- from deltacat.storage.rivulet import Schema, Field, Datatype
7
- from deltacat.storage.rivulet.dataset import Dataset
8
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
6
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
7
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
8
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
9
+ QueryExpression,
10
+ )
9
11
 
10
12
 
11
13
  @pytest.fixture
@@ -57,7 +59,7 @@ def test_dataset_creation_metadata_structure(tmp_path):
57
59
  dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
58
60
 
59
61
  assert dataset._metadata_folder.startswith(".riv-meta")
60
- assert dataset._namespace == "DEFAULT"
62
+ assert dataset._namespace == "default"
61
63
  assert dataset.dataset_name == "test_dataset"
62
64
  assert dataset._metadata_path == str(tmp_path / ".riv-meta-test_dataset")
63
65
 
@@ -2,11 +2,11 @@ import os
2
2
 
3
3
  import pytest
4
4
 
5
- from deltacat import Dataset
6
- from deltacat.storage.rivulet.fs.file_store import FileStore
7
- from deltacat.storage.rivulet.schema.datatype import Datatype
8
- from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
9
- from deltacat.storage.rivulet import Schema, Field
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
7
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
8
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
9
+ from deltacat.experimental.storage.rivulet import Schema, Field
10
10
  import pyarrow as pa
11
11
  import pyarrow.parquet
12
12
 
@@ -2,16 +2,16 @@ from typing import List, FrozenSet, Dict
2
2
 
3
3
  import pytest
4
4
 
5
- from deltacat.storage.rivulet.metastore.delta import DeltaContext
6
- from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
7
- from deltacat.storage.rivulet.metastore.sst_interval_tree import (
5
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
6
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
7
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
8
8
  BlockIntervalTree,
9
9
  BlockGroup,
10
10
  OrderedBlockGroups,
11
11
  Block,
12
12
  )
13
- from deltacat.storage.rivulet.schema.datatype import Datatype
14
- from deltacat.storage.rivulet import Schema
13
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
14
+ from deltacat.experimental.storage.rivulet import Schema
15
15
 
16
16
 
17
17
  @pytest.fixture
@@ -3,12 +3,14 @@ import os
3
3
 
4
4
  from pyarrow import RecordBatch, Table
5
5
 
6
- from deltacat.storage.rivulet.dataset import Dataset
7
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
8
- from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
9
-
10
- from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
11
- from deltacat.storage.rivulet import Schema
6
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
7
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
8
+ QueryExpression,
9
+ )
10
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
11
+
12
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
13
+ from deltacat.experimental.storage.rivulet import Schema
12
14
  from typing import Dict, List, Generator, Set
13
15
 
14
16
  FIXTURE_ROW_COUNT = 10000
@@ -7,23 +7,25 @@ import msgpack
7
7
  import pytest
8
8
  from pyarrow import RecordBatch
9
9
 
10
- from deltacat.storage.rivulet.dataset import Dataset
11
- from deltacat.storage.rivulet.fs.file_store import FileStore
12
- from deltacat.storage.rivulet.metastore.delta import (
10
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
11
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
12
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
13
13
  ManifestIO,
14
14
  TreeLevel,
15
15
  DeltacatManifestIO,
16
16
  )
17
17
 
18
- from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
19
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
20
- from deltacat.storage.rivulet import Schema
21
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
18
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
19
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
20
+ QueryExpression,
21
+ )
22
+ from deltacat.experimental.storage.rivulet import Schema
23
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
22
24
  MemtableDatasetWriter,
23
25
  )
24
26
 
25
- from deltacat.tests.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
26
- from deltacat.tests.storage.rivulet.test_utils import (
27
+ from deltacat.tests.experimental.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
28
+ from deltacat.tests.experimental.storage.rivulet.test_utils import (
27
29
  write_mvp_table,
28
30
  compare_mvp_table_to_scan_results,
29
31
  mvp_table_to_record_batches,
@@ -2,8 +2,8 @@ import pytest
2
2
  import shutil
3
3
  import tempfile
4
4
 
5
- from deltacat.storage.rivulet.fs.file_store import FileStore
6
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
5
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
7
7
  MemtableDatasetWriter,
8
8
  )
9
9
  from ..test_utils import (
@@ -1,12 +1,12 @@
1
1
  import pytest
2
2
 
3
- from deltacat import Dataset
4
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
5
- from deltacat.storage.rivulet.fs.file_store import FileStore
6
- from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
7
- from deltacat.storage.rivulet import Schema
8
- from deltacat.storage.rivulet.schema.datatype import Datatype
9
- from deltacat.storage.rivulet.writer.memtable_dataset_writer import (
3
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
4
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
5
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
7
+ from deltacat.experimental.storage.rivulet import Schema
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
10
10
  MemtableDatasetWriter,
11
11
  )
12
12
 
@@ -7,6 +7,7 @@ import sqlite3
7
7
  from sqlite3 import Cursor, Connection
8
8
  import uuid
9
9
  import ray
10
+
10
11
  import io
11
12
 
12
13
  from deltacat.tests.test_utils.storage import create_empty_delta
@@ -0,0 +1,327 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+
4
+ from deltacat.storage.model.expression import (
5
+ Reference,
6
+ Literal,
7
+ Equal,
8
+ NotEqual,
9
+ GreaterThan,
10
+ LessThan,
11
+ GreaterThanEqual,
12
+ LessThanEqual,
13
+ And,
14
+ Or,
15
+ Not,
16
+ In,
17
+ Between,
18
+ Like,
19
+ IsNull,
20
+ )
21
+ from deltacat.storage.model.expression.visitor import DisplayVisitor, ExpressionVisitor
22
+
23
+
24
+ @pytest.fixture
25
+ def field_ref():
26
+ return Reference("field1")
27
+
28
+
29
+ @pytest.fixture
30
+ def field_ref2():
31
+ return Reference("field2")
32
+
33
+
34
+ @pytest.fixture
35
+ def literal_int():
36
+ return Literal(pa.scalar(42))
37
+
38
+
39
+ @pytest.fixture
40
+ def literal_str():
41
+ return Literal(pa.scalar("test"))
42
+
43
+
44
+ @pytest.fixture
45
+ def display_visitor():
46
+ return DisplayVisitor()
47
+
48
+
49
+ class TestExpressionLibrary:
50
+ """Test suite for the Deltacat expression library."""
51
+
52
+ def test_reference_creation(self):
53
+ ref = Reference("field1")
54
+ assert ref.field == "field1"
55
+ assert ref.index is None
56
+
57
+ def test_reference_with_index(self):
58
+ ref = Reference("field1", 0)
59
+ assert ref.field == "field1"
60
+ assert ref.index == 0
61
+
62
+ def test_literal_creation(self):
63
+ lit = Literal(pa.scalar(42))
64
+ assert lit.value.as_py() == 42
65
+
66
+ # Test the factory methods (.of)
67
+ def test_factory_methods(self):
68
+ # Reference.of
69
+ ref = Reference.of("field1")
70
+ assert ref.field == "field1"
71
+
72
+ # Literal.of
73
+ lit = Literal.of(42)
74
+ assert lit.value.as_py() == 42
75
+
76
+ # Equal.of with mixed types
77
+ eq = Equal.of("field1", 42)
78
+ assert isinstance(eq.left, Literal)
79
+ assert isinstance(eq.right, Literal)
80
+ assert eq.left.value.as_py() == "field1"
81
+ assert eq.right.value.as_py() == 42
82
+
83
+ # Not.of
84
+ not_expr = Not.of(Equal.of("field1", 42))
85
+ assert isinstance(not_expr.operand, Equal)
86
+
87
+ # In.of
88
+ in_expr = In.of("field1", [1, 2, 3])
89
+ assert isinstance(in_expr.value, Literal)
90
+ assert len(in_expr.values) == 3
91
+ assert all(isinstance(v, Literal) for v in in_expr.values)
92
+
93
+ # Between.of
94
+ between_expr = Between.of("field1", 10, 20)
95
+ assert isinstance(between_expr.value, Literal)
96
+ assert between_expr.lower.value.as_py() == 10
97
+ assert between_expr.upper.value.as_py() == 20
98
+
99
+ # Like.of
100
+ like_expr = Like.of("field1", "%test%")
101
+ assert isinstance(like_expr.value, Literal)
102
+ assert like_expr.pattern.value.as_py() == "%test%"
103
+
104
+ # Test reference comparison helper methods
105
+ def test_reference_comparison_helpers(self, field_ref):
106
+ # Test eq, ne, gt, lt, ge, le methods
107
+ eq_expr = field_ref.eq(42)
108
+ assert isinstance(eq_expr, Equal)
109
+ assert eq_expr.left == field_ref
110
+ assert eq_expr.right.value.as_py() == 42
111
+
112
+ ne_expr = field_ref.ne(42)
113
+ assert isinstance(ne_expr, NotEqual)
114
+
115
+ gt_expr = field_ref.gt(42)
116
+ assert isinstance(gt_expr, GreaterThan)
117
+
118
+ lt_expr = field_ref.lt(42)
119
+ assert isinstance(lt_expr, LessThan)
120
+
121
+ ge_expr = field_ref.ge(42)
122
+ assert isinstance(ge_expr, GreaterThanEqual)
123
+
124
+ le_expr = field_ref.le(42)
125
+ assert isinstance(le_expr, LessThanEqual)
126
+
127
+ # Test reference special operation helpers
128
+ def test_reference_special_helpers(self, field_ref):
129
+ # Test is_null, in_, between, like methods
130
+ is_null_expr = field_ref.is_null()
131
+ assert isinstance(is_null_expr, IsNull)
132
+ assert is_null_expr.operand == field_ref
133
+
134
+ in_expr = field_ref.in_([1, 2, 3])
135
+ assert isinstance(in_expr, In)
136
+ assert in_expr.value == field_ref
137
+ assert len(in_expr.values) == 3
138
+ assert in_expr.values[0].value.as_py() == 1
139
+
140
+ between_expr = field_ref.between(10, 20)
141
+ assert isinstance(between_expr, Between)
142
+ assert between_expr.value == field_ref
143
+ assert between_expr.lower.value.as_py() == 10
144
+ assert between_expr.upper.value.as_py() == 20
145
+
146
+ like_expr = field_ref.like("%test%")
147
+ assert isinstance(like_expr, Like)
148
+ assert like_expr.value == field_ref
149
+ assert like_expr.pattern.value.as_py() == "%test%"
150
+
151
+ # Test boolean expression helper methods
152
+ def test_boolean_expression_helpers(self, field_ref):
153
+ # Test and_, or_, not_ methods
154
+ expr1 = field_ref.eq(42)
155
+ expr2 = field_ref.gt(10)
156
+
157
+ and_expr = expr1.and_(expr2)
158
+ assert isinstance(and_expr, And)
159
+ assert and_expr.left == expr1
160
+ assert and_expr.right == expr2
161
+
162
+ or_expr = expr1.or_(expr2)
163
+ assert isinstance(or_expr, Or)
164
+ assert or_expr.left == expr1
165
+ assert or_expr.right == expr2
166
+
167
+ not_expr = expr1.not_()
168
+ assert isinstance(not_expr, Not)
169
+ assert not_expr.operand == expr1
170
+
171
+ # Test building complex expressions
172
+ def test_complex_expression_building(self, field_ref, field_ref2):
173
+ # Test building more complex expressions using method chaining
174
+ expr = field_ref.eq(42).and_(field_ref2.gt(10)).or_(field_ref.is_null()).not_()
175
+
176
+ assert isinstance(expr, Not)
177
+ assert isinstance(expr.operand, Or)
178
+ assert isinstance(expr.operand.left, And)
179
+ assert isinstance(expr.operand.right, IsNull)
180
+
181
+ # Test DisplayVisitor for different expression types
182
+ def test_reference_display(self, field_ref, display_visitor):
183
+ assert display_visitor.visit(field_ref) == "field1"
184
+
185
+ def test_literal_display(self, literal_int, literal_str, display_visitor):
186
+ assert display_visitor.visit(literal_int) == "42"
187
+ assert display_visitor.visit(literal_str) == "test"
188
+
189
+ def test_comparison_display(self, field_ref, literal_int, display_visitor):
190
+ assert display_visitor.visit(Equal(field_ref, literal_int)) == "field1 = 42"
191
+ assert display_visitor.visit(NotEqual(field_ref, literal_int)) == "field1 <> 42"
192
+ assert (
193
+ display_visitor.visit(GreaterThan(field_ref, literal_int)) == "field1 > 42"
194
+ )
195
+ assert display_visitor.visit(LessThan(field_ref, literal_int)) == "field1 < 42"
196
+ assert (
197
+ display_visitor.visit(GreaterThanEqual(field_ref, literal_int))
198
+ == "field1 >= 42"
199
+ )
200
+ assert (
201
+ display_visitor.visit(LessThanEqual(field_ref, literal_int))
202
+ == "field1 <= 42"
203
+ )
204
+
205
+ def test_logical_operator_display(self, field_ref, literal_int, display_visitor):
206
+ eq_expr = Equal(field_ref, literal_int)
207
+ gt_expr = GreaterThan(field_ref, literal_int)
208
+
209
+ assert (
210
+ display_visitor.visit(And(eq_expr, gt_expr))
211
+ == "(field1 = 42 AND field1 > 42)"
212
+ )
213
+ assert (
214
+ display_visitor.visit(Or(eq_expr, gt_expr))
215
+ == "(field1 = 42 OR field1 > 42)"
216
+ )
217
+ assert display_visitor.visit(Not(eq_expr)) == "NOT (field1 = 42)"
218
+
219
+ def test_special_operator_display(self, field_ref, display_visitor):
220
+ assert display_visitor.visit(IsNull(field_ref)) == "(field1) IS NULL"
221
+
222
+ values = [Literal(pa.scalar(1)), Literal(pa.scalar(2)), Literal(pa.scalar(3))]
223
+ assert display_visitor.visit(In(field_ref, values)) == "field1 IN (1, 2, 3)"
224
+
225
+ lower = Literal(pa.scalar(10))
226
+ upper = Literal(pa.scalar(20))
227
+ assert (
228
+ display_visitor.visit(Between(field_ref, lower, upper))
229
+ == "field1 BETWEEN 10 AND 20"
230
+ )
231
+
232
+ pattern = Literal(pa.scalar("%test%"))
233
+ assert display_visitor.visit(Like(field_ref, pattern)) == "field1 LIKE %test%"
234
+
235
+ def test_complex_expression_display(self, field_ref, field_ref2, display_visitor):
236
+ expr = field_ref.eq(42).and_(field_ref2.gt(10)).or_(field_ref.is_null()).not_()
237
+
238
+ # Check that the DisplayVisitor correctly formats the complex expression
239
+ assert (
240
+ display_visitor.visit(expr)
241
+ == "NOT (((field1 = 42 AND field2 > 10) OR (field1) IS NULL))"
242
+ )
243
+
244
+ # Test BinaryExpression with_ methods
245
+ def test_binary_expression_with_methods(self, field_ref, field_ref2, literal_int):
246
+ eq_expr = Equal(field_ref, literal_int)
247
+
248
+ # Test with_left
249
+ new_expr = eq_expr.with_left(field_ref2)
250
+ assert isinstance(new_expr, Equal)
251
+ assert new_expr.left == field_ref2
252
+ assert new_expr.right == literal_int
253
+
254
+ # Test with_right
255
+ new_lit = Literal(pa.scalar(100))
256
+ new_expr = eq_expr.with_right(new_lit)
257
+ assert new_expr.left == field_ref
258
+ assert new_expr.right == new_lit
259
+
260
+ # Test __str__ method which uses DisplayVisitor
261
+ def test_expression_str_method(self, field_ref, literal_int):
262
+ eq_expr = Equal(field_ref, literal_int)
263
+ assert str(eq_expr) == "field1 = 42"
264
+
265
+ # Test proper parenthesization in complex expressions
266
+ def test_nested_parentheses(self, field_ref, field_ref2, display_visitor):
267
+ # Create a complex expression: (field1 = 1 AND field2 = 2) OR field2 = 3
268
+ expr1 = Equal(field_ref, Literal(pa.scalar(1)))
269
+ expr2 = Equal(field_ref2, Literal(pa.scalar(2)))
270
+ expr3 = Equal(field_ref2, Literal(pa.scalar(3)))
271
+
272
+ and_expr = And(expr1, expr2)
273
+ or_expr = Or(and_expr, expr3)
274
+
275
+ assert (
276
+ display_visitor.visit(or_expr)
277
+ == "((field1 = 1 AND field2 = 2) OR field2 = 3)"
278
+ )
279
+
280
+ # Test Literal comparison methods
281
+ def test_literal_comparison_methods(self, literal_int):
282
+ eq_expr = literal_int.eq("test")
283
+ assert isinstance(eq_expr, Equal)
284
+ assert eq_expr.left == literal_int
285
+ assert eq_expr.right.value.as_py() == "test"
286
+
287
+ ne_expr = literal_int.ne("test")
288
+ assert isinstance(ne_expr, NotEqual)
289
+ assert ne_expr.left == literal_int
290
+ assert ne_expr.right.value.as_py() == "test"
291
+
292
+ # Test a custom ExpressionVisitor implementation
293
+ def test_custom_visitor(self, field_ref, literal_int):
294
+ class CountingVisitor(ExpressionVisitor[None, int]):
295
+ """Simple visitor that counts expression nodes"""
296
+
297
+ def visit_reference(self, expr, context=None):
298
+ return 1
299
+
300
+ def visit_literal(self, expr, context=None):
301
+ return 1
302
+
303
+ def visit_binary_expression(self, expr, left, right, context=None):
304
+ return left + right + 1
305
+
306
+ def visit_unary_expression(self, expr, operand, context=None):
307
+ return operand + 1
308
+
309
+ def visit_in(self, expr, context=None):
310
+ return 1 + len(expr.values) + 1 # value + all values + In operator
311
+
312
+ def visit_between(self, expr, context=None):
313
+ return 3 # value + lower + upper
314
+
315
+ def visit_like(self, expr, context=None):
316
+ return 2 # value + pattern
317
+
318
+ visitor = CountingVisitor()
319
+
320
+ # Count nodes in simple expressions
321
+ assert visitor.visit(field_ref) == 1
322
+ assert visitor.visit(literal_int) == 1
323
+ assert visitor.visit(Equal(field_ref, literal_int)) == 3 # left + right + Equal
324
+
325
+ # Count nodes in a more complex expression
326
+ expr = field_ref.eq(42).and_(field_ref.gt(10))
327
+ assert visitor.visit(expr) == 7 # (1+1+1) + (1+1+1) + 1
@@ -7,7 +7,9 @@ def test_sharding_strategy_from_string_range():
7
7
  """
8
8
  Tests that from_string('range') returns an instance of RangeShardingStrategy.
9
9
  """
10
- from deltacat.storage.rivulet.shard.range_shard import RangeShardingStrategy
10
+ from deltacat.experimental.storage.rivulet.shard.range_shard import (
11
+ RangeShardingStrategy,
12
+ )
11
13
 
12
14
  strategy = ShardingStrategy.from_string("range")
13
15
  assert isinstance(strategy, RangeShardingStrategy)