deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective registries, and is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/tests/storage/rivulet/schema/test_schema.py
@@ -0,0 +1,241 @@
+import pytest
+import pyarrow as pa
+from deltacat.storage.rivulet import Schema, Field, Datatype
+
+
+def test_field_initialization():
+    field = Field(name="test_field", datatype=Datatype.string(), is_merge_key=True)
+    assert field.name == "test_field"
+    assert field.datatype == Datatype.string()
+    assert field.is_merge_key
+
+
+def test_schema_initialization():
+    fields = [("id", Datatype.int64()), ("name", Datatype.string())]
+    schema = Schema(fields, merge_keys=["id"])
+    assert len(schema) == 2
+    assert "id" in schema.keys()
+    assert schema["id"].datatype == Datatype.int64()
+    assert "name" in schema.keys()
+    assert schema["name"].datatype == Datatype.string()
+
+
+def test_merge_key_conflict_on_init():
+    fields = [
+        Field("id", Datatype.int64(), is_merge_key=False),  # Merge key off here
+        ("name", Datatype.string()),
+    ]
+    with pytest.raises(TypeError):
+        Schema(fields, merge_keys=["id"])  # Merge key on here
+
+
+def test_simultaneous_duplicate_field():
+    with pytest.raises(ValueError):
+        Schema(
+            [
+                ("id", Datatype.int32()),
+                ("name", Datatype.string()),
+                ("age", Datatype.int32()),
+                ("age", Datatype.string()),
+            ],
+            merge_keys=["id"],
+        )
+
+
+def test_add_field():
+    schema = Schema()
+    field = Field("new_field", Datatype.float(), True)
+    schema.add_field(field)
+    assert len(schema) == 1
+    assert "new_field" in schema.keys()
+    assert schema["new_field"].datatype == Datatype.float()
+
+    field2 = Field("another_field", Datatype.string(), True)
+    schema.add_field(field2)
+    assert len(schema) == 2
+    assert "another_field" in schema.keys()
+    assert schema["another_field"].datatype == Datatype.string()
+
+    with pytest.raises(ValueError):
+        schema.add_field(field2)
+
+
+def test_setitem_field():
+    schema = Schema()
+    field = Field("test_field", Datatype.int64(), is_merge_key=True)
+    schema["test_field"] = field
+    assert schema["test_field"] == field
+
+
+def test_setitem_datatype():
+    schema = Schema()
+    schema["id"] = (Datatype.int64(), True)
+    schema["test_field"] = Datatype.int64()
+    assert schema["test_field"].name == "test_field"
+    assert schema["test_field"].datatype == Datatype.int64()
+    assert not schema["test_field"].is_merge_key
+
+
+def test_setitem_tuple_with_merge_key():
+    schema = Schema()
+    schema["test_field"] = (Datatype.int64(), True)
+    assert schema["test_field"].name == "test_field"
+    assert schema["test_field"].datatype == Datatype.int64()
+    assert schema["test_field"].is_merge_key
+
+
+def test_setitem_invalid_type():
+    schema = Schema()
+    with pytest.raises(TypeError):
+        schema["test_field"] = "invalid"
+
+
+def test_non_empty_merge_key():
+    with pytest.raises(TypeError):
+        _ = Schema([], merge_keys=["id"])
+
+
+def test_merge_schemas():
+    schema1 = Schema([("id", Datatype.int64())], merge_keys=["id"])
+    schema2 = Schema(
+        [("other_id", Datatype.string()), ("name", Datatype.string())],
+        merge_keys="other_id",
+    )
+    schema1.merge(schema2)
+    assert len(schema1) == 3
+    assert "id" in schema1.keys()
+    assert "name" in schema1.keys()
+    assert "other_id" in schema1.keys()
+
+
+def test_merge_schemas_same_merge_key():
+    schema1 = Schema(
+        [("id", Datatype.int64()), ("name", Datatype.string())], merge_keys=["id"]
+    )
+    schema2 = Schema(
+        [("id", Datatype.int64()), ("other_name", Datatype.string())],
+        merge_keys="id",
+    )
+    schema1.merge(schema2)
+    assert len(schema1) == 3
+    assert "id" in schema1.keys()
+    assert "name" in schema1.keys()
+    assert "other_name" in schema1.keys()
+
+
+def test_merge_schema_conflict():
+    schema1 = Schema([("id", Datatype.int64())], merge_keys=["id"])
+    schema1_dup = Schema([("id", Datatype.int64())], merge_keys=["id"])
+    schema2 = Schema([("id", Datatype.string())], merge_keys=["id"])
+
+    with pytest.raises(ValueError):
+        schema1.merge(schema2)
+
+    schema1.merge(
+        schema1_dup
+    )  # Merging the same field is allowed (unlike using add_field)
+    assert schema1["id"].datatype == Datatype.int64()
+    assert len(schema1) == 1
+
+
+def test_to_pyarrow_schema():
+    fields = [("id", Datatype.int64()), ("name", Datatype.string())]
+    schema = Schema(fields, merge_keys=["id"])
+    pa_schema = schema.to_pyarrow()
+    assert isinstance(pa_schema, pa.Schema)
+    assert len(pa_schema) == 2
+    assert pa_schema.field("id").type == pa.int64()
+    assert pa_schema.field("name").type == pa.string()
+
+
+def test_from_pyarrow_schema():
+    pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
+    schema = Schema.from_pyarrow(pa_schema, merge_keys=["id"])
+    assert len(schema) == 2
+    assert schema["id"].is_merge_key
+
+
+def test_from_pyarrow_schema_invalid_merge_keys():
+    pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
+    with pytest.raises(ValueError):
+        Schema.from_pyarrow(pa_schema, merge_keys=["bad_key"])
+
+
+def test_get_field():
+    schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
+    field = schema["id"]
+    assert field.name == "id"
+    assert field.datatype == Datatype.int64()
+
+
+def test_set_field():
+    schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
+    schema["name"] = Field("name", Datatype.string())
+    assert len(schema) == 2
+    assert "name" in schema.keys()
+    assert schema["name"].datatype == Datatype.string()
+
+
+def test_delete_field():
+    schema = Schema(
+        [("name", Datatype.string()), ("zip", Datatype.int32())], merge_keys=["name"]
+    )
+    del schema["zip"]
+    assert "zip" not in schema.keys()
+    assert "name" in schema.keys()
+
+
+def test_delete_merge_key_field():
+    schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
+    with pytest.raises(ValueError):
+        del schema["id"]
+
+
+def test_schema_iter():
+    fields = [
+        Field("id", Datatype.int32(), is_merge_key=True),
+        Field("name", Datatype.string()),
+    ]
+    schema = Schema(fields)
+    iter_result = list(iter(schema))
+    assert len(iter_result) == 2
+    assert all(isinstance(item, str) for item in iter_result)
+
+
+def test_merge_all():
+    schema1 = Schema(
+        [
+            Field("id", Datatype.int64(), is_merge_key=True),
+            Field("name", Datatype.string()),
+        ]
+    )
+    schema2 = Schema(
+        [
+            Field("age", Datatype.int32()),
+            Field("email", Datatype.string(), is_merge_key=True),
+        ]
+    )
+    merged_schema = Schema.merge_all([schema1, schema2])
+    assert len(merged_schema) == 4
+
+
+def test_schema_values():
+    fields = [
+        Field("id", Datatype.int64(), is_merge_key=True),
+        Field("name", Datatype.string()),
+    ]
+    schema = Schema(fields)
+    values = list(schema.values())
+    assert len(values) == 2
+    assert all(isinstance(v, Field) for v in values)
+
+
+def test_schema_items():
+    fields = [
+        Field("id", Datatype.int64(), is_merge_key=True),
+        Field("name", Datatype.string()),
+    ]
+    schema = Schema(fields)
+    items = list(schema.items())
+    assert len(items) == 2
+    assert all(isinstance(k, str) and isinstance(v, Field) for k, v in items)
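
The hunk above adds deltacat/tests/storage/rivulet/schema/test_schema.py, which currently doubles as the best reference for the new rivulet Schema API. Below is a minimal sketch distilled from those tests; it uses only calls the tests exercise, field names are illustrative, and anything the tests do not assert should be treated as an assumption rather than documented behavior:

    from deltacat.storage.rivulet import Schema, Field, Datatype

    # Build a schema from (name, datatype) tuples, flagging "id" as the merge key.
    schema = Schema(
        [("id", Datatype.int64()), ("name", Datatype.string())],
        merge_keys=["id"],
    )

    # Fields can also be added one at a time, dict-style or via add_field().
    schema["age"] = Datatype.int32()             # bare datatype: not a merge key
    schema["email"] = (Datatype.string(), True)  # (datatype, is_merge_key) tuple
    schema.add_field(Field("zip", Datatype.string()))

    # merge() pulls in another schema's fields; per the tests, a field that
    # already exists with a different datatype raises ValueError.
    other = Schema([("other_id", Datatype.string())], merge_keys="other_id")
    schema.merge(other)

    # Round-trip through PyArrow.
    pa_schema = schema.to_pyarrow()
    schema_copy = Schema.from_pyarrow(pa_schema, merge_keys=["id"])
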
deltacat/tests/storage/rivulet/test_dataset.py
@@ -0,0 +1,406 @@
+import posixpath
+from deltacat.utils.metafile_locator import _find_partition_path
+import pytest
+
+import pyarrow as pa
+from deltacat.storage.rivulet import Schema, Field, Datatype
+from deltacat.storage.rivulet.dataset import Dataset
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+
+
+@pytest.fixture
+def sample_schema():
+    return Schema(
+        fields=[
+            Field("id", Datatype.int32(), is_merge_key=True),
+            Field("name", Datatype.string()),
+            Field("age", Datatype.int32()),
+        ]
+    )
+
+
+@pytest.fixture
+def sample_pydict():
+    return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}
+
+
+@pytest.fixture
+def sample_parquet_data(tmp_path, sample_pydict):
+    parquet_path = tmp_path / "test.parquet"
+    table = pa.Table.from_pydict(sample_pydict)
+    pa.parquet.write_table(table, parquet_path)
+    return parquet_path
+
+
+# Updated Tests
+
+
+def test_dataset_creation_with_schema(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    assert len(dataset.fields) == 3
+    assert "id" in dataset.fields
+    assert dataset.fields["id"].is_merge_key
+
+
+def test_dataset_initialization_with_metadata(tmp_path):
+    dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
+    assert dataset.dataset_name == "test_dataset"
+    assert dataset._metadata_folder.startswith(".riv-meta")
+
+
+def test_invalid_dataset_initialization():
+    with pytest.raises(ValueError, match="Name must be a non-empty string"):
+        Dataset(dataset_name="")
+
+
+def test_dataset_creation_metadata_structure(tmp_path):
+    dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
+
+    assert dataset._metadata_folder.startswith(".riv-meta")
+    assert dataset._namespace == "DEFAULT"
+    assert dataset.dataset_name == "test_dataset"
+    assert dataset._metadata_path == str(tmp_path / ".riv-meta-test_dataset")
+
+    locator = dataset._locator
+    root_uri = dataset._metadata_path
+
+    partition_path = _find_partition_path(root_uri, locator)
+
+    # Ensures that directory structure for namespace -> table -> table_version -> stream_id -> partition_id exists
+    assert posixpath.exists(partition_path)
+
+
+def test_fields_accessor_add_field(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    dataset.fields.add("new_field", Datatype.float())
+    assert "new_field" in dataset.fields
+    assert dataset.fields["new_field"].datatype == Datatype.float()
+
+    dataset.fields["new_field2"] = Field("new_field2", Datatype.int32())
+    assert "new_field2" in dataset.fields
+    assert "new_field2" in dataset.schemas["all"]
+    with pytest.raises(TypeError):
+        dataset.fields["new_field3"] = 2
+
+
+def test_field_removal(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    del dataset.fields["age"]
+    assert "age" not in dataset.fields
+    with pytest.raises(ValueError):
+        del dataset.fields["age"]
+    with pytest.raises(KeyError):
+        _ = dataset.fields["age"]
+
+
+def test_fields_accessor_repr(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    repr_output = repr(dataset.fields)
+    for field_name in ["id", "name", "age"]:
+        assert field_name in repr_output, f"Field '{field_name}' missing in repr output"
+
+
+def test_schemas_accessor_add_group(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    dataset.schemas["analytics"] = ["id", "name"]
+    assert "analytics" in dataset.schemas
+    assert len(dataset.schemas["analytics"]) == 2
+
+
+def test_schema_removal(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    with pytest.raises(ValueError):
+        del dataset.schemas["all"]
+    with pytest.raises(ValueError):
+        del dataset.schemas["does_not_exist"]
+    dataset.schemas["new"] = ["id", "name"]
+    del dataset.schemas["new"]
+    with pytest.raises(KeyError):
+        _ = dataset.schemas["new"]
+
+
+def test_dataset_from_parquet(tmp_path, sample_parquet_data):
+    dataset = Dataset.from_parquet(
+        name="test_dataset",
+        file_uri=str(sample_parquet_data),
+        metadata_uri=str(tmp_path),
+        merge_keys="id",
+    )
+    assert len(dataset.fields) == 3
+    assert "id" in dataset.fields
+    assert dataset.fields["id"].is_merge_key
+
+
+def test_parquet_schema_modes(tmp_path, sample_pydict):
+    # Create two parquet files with overlapping and unique schemas
+    data_1 = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
+    data_2 = {"id": [4, 5, 6], "age": [25, 30, 35]}
+
+    path_1 = tmp_path / "data1.parquet"
+    path_2 = tmp_path / "data2.parquet"
+    pa.parquet.write_table(pa.Table.from_pydict(data_1), path_1)
+    pa.parquet.write_table(pa.Table.from_pydict(data_2), path_2)
+
+    dataset_union = Dataset.from_parquet(
+        name="test_dataset_union",
+        file_uri=str(tmp_path),
+        merge_keys="id",
+        schema_mode="union",
+    )
+    assert len(dataset_union.fields) == 3  # id, name, age
+
+    dataset_intersect = Dataset.from_parquet(
+        name="test_dataset_intersect",
+        file_uri=str(tmp_path),
+        merge_keys="id",
+        schema_mode="intersect",
+    )
+    assert len(dataset_intersect.fields) == 1  # Only id
+
+
+def test_merge_all_schemas():
+    schema1 = Schema(
+        fields=[
+            Field("id", Datatype.int32(), is_merge_key=True),
+            Field("name", Datatype.string()),
+        ]
+    )
+    schema2 = Schema(
+        fields=[
+            Field("id", Datatype.int32(), is_merge_key=True),
+            Field("age", Datatype.int32()),
+        ]
+    )
+    merged_schema = Schema.merge_all([schema1, schema2])
+    assert len(merged_schema) == 3
+    assert "id" in merged_schema
+    assert "name" in merged_schema
+    assert "age" in merged_schema
+
+
+def test_writer_creation_with_custom_format(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    writer = dataset.writer(file_format="feather")
+    assert writer is not None
+
+
+def test_scan_with_query(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+    query = QueryExpression()  # Placeholder query
+    scan = dataset.scan(query)
+    assert scan is not None
+
+
+def test_add_schema_to_new_schemas(tmp_path):
+    """Test adding a schema to a new field group."""
+    base_uri = str(tmp_path / "test_dataset")
+    dataset = Dataset(dataset_name=base_uri)
+
+    schema = Schema(
+        [
+            ("id", Datatype.int32()),
+            ("name", Datatype.string()),
+            ("age", Datatype.int32()),
+        ],
+        merge_keys=["id"],
+    )
+
+    dataset.add_schema(schema, schema_name="new_group")
+
+    # Verify the field group is added
+    assert "new_group" in dataset.schemas
+    assert len(dataset.schemas["new_group"]) == 3
+    assert dataset.schemas["new_group"]["id"].datatype == Datatype.int32()
+    assert dataset.schemas["new_group"]["name"].datatype == Datatype.string()
+    assert dataset.schemas["new_group"]["age"].datatype == Datatype.int32()
+
+
+def test_add_schema_to_existing_schemas(tmp_path):
+    """Test merging a schema into an existing field group."""
+    base_uri = str(tmp_path / "test_dataset")
+    dataset = Dataset(dataset_name=base_uri)
+
+    schema_1 = Schema(
+        [
+            ("id", Datatype.int32()),
+            ("name", Datatype.string()),
+        ],
+        merge_keys=["id"],
+    )
+
+    dataset.add_schema(schema_1, schema_name="existing_group")
+
+    schema_2 = Schema(
+        [
+            ("age", Datatype.int32()),
+            ("email", Datatype.string()),
+        ],
+        merge_keys=["id"],
+    )
+
+    dataset.add_schema(schema_2, schema_name="existing_group")
+
+    # Verify the merged schema
+    assert "existing_group" in dataset.schemas
+    assert len(dataset.schemas["existing_group"]) == 4
+    assert dataset.schemas["existing_group"]["id"].datatype == Datatype.int32()
+    assert dataset.schemas["existing_group"]["name"].datatype == Datatype.string()
+    assert dataset.schemas["existing_group"]["age"].datatype == Datatype.int32()
+    assert dataset.schemas["existing_group"]["email"].datatype == Datatype.string()
+
+
+def test_add_schema_conflicting_fields(tmp_path):
+    """Test adding a schema with conflicting fields."""
+    base_uri = str(tmp_path / "test_dataset")
+    dataset = Dataset(dataset_name=base_uri)
+
+    schema_1 = Schema(
+        [
+            ("id", Datatype.int32()),
+            ("name", Datatype.string()),
+        ],
+        merge_keys=["id"],
+    )
+
+    dataset.add_schema(schema_1, schema_name="conflicting_group")
+
+    schema_2 = Schema(
+        [
+            ("id", Datatype.string()),  # Conflict: datatype mismatch
+            ("age", Datatype.int32()),
+        ],
+        merge_keys=["id"],
+    )
+
+    with pytest.raises(ValueError, match="already exists"):
+        dataset.add_schema(schema_2, schema_name="conflicting_group")
+
+    schema_3 = Schema(
+        [
+            ("id", Datatype.int32()),  # Same datatype as existing "id": no conflict
+            ("age", Datatype.int32()),
+        ],
+        merge_keys=["id"],
+    )
+
+    dataset.add_schema(schema_3, schema_name="conflicting_group")
+    assert "conflicting_group" in dataset.schemas
+    assert len(dataset.schemas["conflicting_group"]) == 3
+    assert dataset.schemas["conflicting_group"]["id"].datatype == Datatype.int32()
+    assert dataset.schemas["conflicting_group"]["name"].datatype == Datatype.string()
+    assert dataset.schemas["conflicting_group"]["age"].datatype == Datatype.int32()
+
+
+def test_add_fields_with_merge_key_field(tmp_path):
+    base_uri = str(tmp_path / "test_dataset")
+    dataset = Dataset(dataset_name=base_uri)
+    dataset.add_fields([Field("my_merge_key", Datatype.string(), True)])
+    assert dataset.schemas["default"].get_merge_key() == "my_merge_key"
+
+
+def test_add_schema_to_nonexistent_schemas(tmp_path):
+    """Test adding a schema to a nonexistent field group."""
+    base_uri = str(tmp_path / "test_dataset")
+    dataset = Dataset(dataset_name=base_uri)
+
+    schema = Schema(
+        [
+            ("id", Datatype.int32()),
+            ("name", Datatype.string()),
+        ],
+        merge_keys=["id"],
+    )
+
+    # Add to a non-existent field group
+    dataset.add_schema(schema, schema_name="nonexistent_group")
+
+    # Verify the field group is created
+    assert "nonexistent_group" in dataset.schemas
+    assert len(dataset.schemas["nonexistent_group"]) == 2
+
+
+def test_add_missing_field_to_schema_raises_error(tmp_path, sample_schema):
+    """
+    Test that attempting to add a missing field to the 'all' schema raises a ValueError.
+    """
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+
+    # Attempt to add a non-existent field to the 'all' schema
+    with pytest.raises(
+        ValueError, match="Field 'missing_field' does not exist in the dataset."
+    ):
+        dataset.schemas["all"] = [
+            "missing_field"
+        ]  # Attempt to set a list with a missing field
+
+
+def test_schemas_accessor_methods(tmp_path, sample_schema):
+    """
+    Test the __iter__, __len__, and __repr__ methods of SchemasAccessor.
+    """
+    dataset = Dataset(
+        dataset_name="test_dataset", schema=sample_schema
+    )  # Default schema is defined automatically
+    dataset.schemas["schema_1"] = ["id", "name"]
+    dataset.schemas["schema_2"] = ["age"]
+
+    # Test __iter__
+    schema_names = list(iter(dataset.schemas))
+    assert set(schema_names) == {
+        "schema_1",
+        "schema_2",
+        "all",
+        "default",
+    }, "Schema names do not match expected values"
+
+    # Test __len__
+    assert len(dataset.schemas) == 4, "Length of schemas accessor is incorrect"
+
+    # Test __repr__
+    repr_output = repr(dataset.schemas)
+    for schema_name in ["schema_1", "schema_2", "all"]:
+        assert (
+            schema_name in repr_output
+        ), f"Schema '{schema_name}' missing in repr output"
+
+
+def test_get_merge_keys(tmp_path, sample_schema):
+    """
+    Test the get_merge_keys method to ensure it returns all merge keys in the dataset.
+    """
+    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
+
+    # Add fields with additional merge key to the dataset
+    other_schema = Schema(
+        [("id2", Datatype.int32()), ("zip", Datatype.string())], merge_keys=["id2"]
+    )
+
+    dataset.add_schema(other_schema, "id2+zip")
+
+    # Call get_merge_keys and validate the result
+    merge_keys = dataset.get_merge_keys()
+    assert merge_keys == [
+        "id",
+        "id2",
+    ], f"Expected merge keys ['id', 'id2'], got {merge_keys}"
+
+
+def test_add_fields_no_fields_raises_error(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset")
+    with pytest.raises(ValueError):
+        dataset.add_fields(fields=[])
+
+
+def test_add_fields_mismatched_merge_keys_raises_error(tmp_path, sample_schema):
+    dataset = Dataset(dataset_name="test_dataset")
+    with pytest.raises(
+        ValueError,
+        match="The following merge keys were not found in the provided fields: does_not_exist",
+    ):
+        dataset.add_fields(fields=sample_schema.values(), merge_keys=["does_not_exist"])
+
+    with pytest.raises(TypeError, match="Merge key status conflict"):
+        dataset.add_fields(
+            fields=[Field("id", Datatype.int32()), Field("name", Datatype.string())],
+            merge_keys=["id"],
+        )