deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,67 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from deltacat import Dataset
|
6
|
+
from deltacat.storage.rivulet.fs.file_store import FileStore
|
7
|
+
from deltacat.storage.rivulet.schema.datatype import Datatype
|
8
|
+
from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
|
9
|
+
from deltacat.storage.rivulet import Schema, Field
|
10
|
+
import pyarrow as pa
|
11
|
+
import pyarrow.parquet
|
12
|
+
|
13
|
+
|
14
|
+
@pytest.fixture
def sample_schema():
    """Provide a three-field schema (``id``, ``name``, ``age``).

    The ``id`` field is the only one created with ``is_merge_key=True``.
    """
    id_field = Field("id", Datatype.int32(), is_merge_key=True)
    name_field = Field("name", Datatype.string())
    age_field = Field("age", Datatype.int32())
    return Schema(fields=[id_field, name_field, age_field])
|
23
|
+
|
24
|
+
|
25
|
+
@pytest.fixture
def sample_pydict():
    """Provide three rows of column-oriented sample data (id, name, age)."""
    columns = {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "age": [25, 30, 35],
    }
    return columns
|
28
|
+
|
29
|
+
|
30
|
+
@pytest.fixture
def path(tmp_path):
    """Expose pytest's ``tmp_path`` fixture value under the name ``path``."""
    # Pure alias: the value is handed through unchanged.
    return tmp_path
|
33
|
+
|
34
|
+
|
35
|
+
@pytest.fixture
def sample_parquet_data(path, sample_pydict):
    """Write ``sample_pydict`` to ``test.parquet`` under ``path``.

    Returns:
        The path of the parquet file that was written.
    """
    target = path / "test.parquet"
    pyarrow.parquet.write_table(pa.Table.from_pydict(sample_pydict), target)
    return target
|
41
|
+
|
42
|
+
|
43
|
+
def test_write_manifest_round_trip(sample_parquet_data, sample_schema):
    """Write a manifest with ``DeltacatManifestIO`` and check it reads back intact.

    The schema, level and SST file list read from the written manifest must
    equal the values that were passed to ``write``.
    """
    dataset = Dataset.from_parquet(
        file_uri=sample_parquet_data,
        name="dataset",
        merge_keys="id",
    )

    # Resolve the dataset's metadata location into a (path, filesystem) pair.
    root, fs = FileStore.filesystem(dataset._metadata_path)
    store = FileStore(root, filesystem=fs)
    manifest_rw = DeltacatManifestIO(root, dataset._locator)

    expected_sst_files = ["sst1.sst", "sst2.sst"]
    expected_fields = {("id", Datatype.int32()), ("name", Datatype.string())}
    expected_schema = Schema(expected_fields, "id")
    expected_level = 2

    manifest_uri = os.path.join(root, "manifest.json")
    store.create_output_file(manifest_uri)

    written_uri = manifest_rw.write(
        expected_sst_files, expected_schema, expected_level
    )
    round_tripped = manifest_rw.read(written_uri)

    assert round_tripped.context.schema == expected_schema
    assert round_tripped.context.level == expected_level
    assert round_tripped.sst_files == expected_sst_files
|
@@ -0,0 +1,232 @@
|
|
1
|
+
from typing import List, FrozenSet, Dict
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from deltacat.storage.rivulet.metastore.delta import DeltaContext
|
6
|
+
from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow
|
7
|
+
from deltacat.storage.rivulet.metastore.sst_interval_tree import (
|
8
|
+
BlockIntervalTree,
|
9
|
+
BlockGroup,
|
10
|
+
OrderedBlockGroups,
|
11
|
+
Block,
|
12
|
+
)
|
13
|
+
from deltacat.storage.rivulet.schema.datatype import Datatype
|
14
|
+
from deltacat.storage.rivulet import Schema
|
15
|
+
|
16
|
+
|
17
|
+
@pytest.fixture
def schema1() -> Schema:
    """Provide a schema with ``id``, ``name`` and ``age`` fields.

    ``"id"`` is passed as the second ``Schema`` argument (presumably the
    merge key -- confirm against ``Schema``'s constructor).
    """
    field_specs = {
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
        ("age", Datatype.int32()),
    }
    return Schema(field_specs, "id")
|
27
|
+
|
28
|
+
|
29
|
+
@pytest.fixture
def schema2() -> Schema:
    """Provide a schema with ``id``, ``address`` and ``zip`` fields.

    ``"id"`` is passed as the second ``Schema`` argument (presumably the
    merge key -- confirm against ``Schema``'s constructor).
    """
    field_specs = {
        ("id", Datatype.int32()),
        ("address", Datatype.string()),
        ("zip", Datatype.string()),
    }
    return Schema(field_specs, "id")
|
39
|
+
|
40
|
+
|
41
|
+
@pytest.fixture
def sst_row_list() -> List[SSTableRow]:
    """Provide five ``SSTableRow`` objects with overlapping key ranges.

    Each row is built from a (first key, second key, name) triple; the last
    two constructor arguments are always ``0`` and ``1``.
    """
    key_ranges = (
        (0, 100, "block1"),
        (3, 90, "block2"),
        (10, 95, "block3"),
        (0, 10, "block4"),
        (0, 100, "block5"),
    )
    return [SSTableRow(lo, hi, name, 0, 1) for lo, hi, name in key_ranges]
|
50
|
+
|
51
|
+
|
52
|
+
@pytest.fixture
def sst1(sst_row_list) -> SSTable:
    """Provide an ``SSTable`` over the first three fixture rows, with 0 and 100 as its bounds."""
    leading_rows = sst_row_list[0:3]
    return SSTable(leading_rows, 0, 100)
|
55
|
+
|
56
|
+
|
57
|
+
@pytest.fixture
def sst2(sst_row_list) -> SSTable:
    """Provide an ``SSTable`` over fixture rows 3 and 4, with 0 and 100 as its bounds."""
    trailing_rows = sst_row_list[3:5]
    return SSTable(trailing_rows, 0, 100)
|
60
|
+
|
61
|
+
|
62
|
+
@pytest.fixture
def manifest_context1(schema1) -> DeltaContext:
    """Provide a ``DeltaContext`` over ``schema1`` named ``"manifest-001"`` with final argument 0."""
    context = DeltaContext(schema1, "manifest-001", 0)
    return context
|
65
|
+
|
66
|
+
|
67
|
+
@pytest.fixture
def manifest_context2(schema2) -> DeltaContext:
    """Provide a ``DeltaContext`` over ``schema2`` named ``"manifest-002"`` with final argument 1."""
    context = DeltaContext(schema2, "manifest-002", 1)
    return context
|
70
|
+
|
71
|
+
|
72
|
+
def with_field_group(
    context: DeltaContext, rows: List[SSTableRow], indexes: List[int]
) -> Dict[Schema, FrozenSet[Block]]:
    """Build a one-entry mapping from the context's schema to a set of blocks.

    Args:
        context: Delta context whose ``schema`` becomes the dictionary key.
        rows: Candidate SST rows.
        indexes: Positions in ``rows`` to wrap as ``Block`` objects.

    Returns:
        ``{context.schema: frozenset(Block(rows[i], context) for i in indexes)}``.
    """
    field_group = context.schema
    blocks = frozenset(Block(rows[position], context) for position in indexes)
    return {field_group: blocks}
|
78
|
+
|
79
|
+
|
80
|
+
@pytest.fixture
def expected_block_groups(
    manifest_context1, manifest_context2, sst_row_list
) -> List[BlockGroup]:
    """Provide the block groups expected for the key boundaries 0, 3, 10, 90, 95, 100.

    Each entry lists the two bounds of a group, followed by the
    ``sst_row_list`` indexes expected under ``manifest_context1`` and under
    ``manifest_context2``.
    """
    group_specs = (
        (0, 3, [0], [3, 4]),
        (3, 10, [0, 1], [3, 4]),
        (10, 90, [0, 1, 2], [3, 4]),
        (90, 95, [0, 1, 2], [4]),
        (95, 100, [0, 2], [4]),
    )
    return [
        BlockGroup(
            lo,
            hi,
            with_field_group(manifest_context1, sst_row_list, context1_rows)
            | with_field_group(manifest_context2, sst_row_list, context2_rows),
        )
        for lo, hi, context1_rows, context2_rows in group_specs
    ]
|
116
|
+
|
117
|
+
|
118
|
+
def test_build_sst(
    sst1,
    sst2,
    manifest_context1,
    manifest_context2,
    sst_row_list,
    expected_block_groups,
):
    """With no bounds given, the tree must return every expected block group."""
    tree = BlockIntervalTree()
    for table, context in ((sst1, manifest_context1), (sst2, manifest_context2)):
        tree.add_sst_table(table, context)

    actual = tree.get_sorted_block_groups()
    wanted = _build_ordered_block_groups(expected_block_groups)
    assert wanted == actual
|
133
|
+
|
134
|
+
|
135
|
+
def test_build_sst_with_bounds(
    sst1,
    sst2,
    manifest_context1,
    manifest_context2,
    sst_row_list,
    expected_block_groups,
):
    """Bounded queries must return the matching slice of the expected groups.

    Each case is (min key, max key, slice of ``expected_block_groups``); the
    cases run in the listed order against one shared tree.
    """
    tree = BlockIntervalTree()
    tree.add_sst_table(sst1, manifest_context1)
    tree.add_sst_table(sst2, manifest_context2)

    cases = (
        (20, 100, slice(2, None)),
        (96, 100, slice(4, None)),
        (0, 10, slice(0, 3)),
        # Max key of 95 is inclusive of last range so it is included
        (None, 95, slice(None)),
        (None, 94, slice(0, 4)),
        (0, 10, slice(0, 3)),
        (0, 0, slice(0, 1)),
    )
    for key_min, key_max, wanted_slice in cases:
        filtered = tree.get_sorted_block_groups(key_min, key_max)
        wanted = _build_ordered_block_groups(expected_block_groups[wanted_slice])
        assert wanted == filtered
|
175
|
+
|
176
|
+
|
177
|
+
def test_build_sst_with_non_zero_min_key_matching_global_min_key(manifest_context1):
    """A query starting at the table's own non-zero min key must return its single group."""
    # Using a non-0 value since 0 evaluates to False
    min_key, max_key = 1, 95

    only_row = SSTableRow(min_key, max_key, "row-with-non-zero-min-key", 0, 1)
    tree = BlockIntervalTree()
    tree.add_sst_table(SSTable([only_row], min_key, max_key), manifest_context1)

    filtered = tree.get_sorted_block_groups(min_key, min_key + 1)

    blocks_by_schema = {
        manifest_context1.schema: frozenset([Block(only_row, manifest_context1)])
    }
    single_group = BlockGroup(min_key, max_key, blocks_by_schema)
    wanted = _build_ordered_block_groups([single_group])
    assert wanted == filtered
|
201
|
+
|
202
|
+
|
203
|
+
def test_build_sst_invalid_bounds(
    sst1, sst2, schema1, schema2, sst_row_list, expected_block_groups
):
    """Querying with a min key greater than the max key must raise ``ValueError``."""
    tree = BlockIntervalTree()
    inverted_min, inverted_max = 10, 0

    with pytest.raises(ValueError):
        tree.get_sorted_block_groups(inverted_min, inverted_max)
|
210
|
+
|
211
|
+
|
212
|
+
def _build_ordered_block_groups(block_groups: List[BlockGroup]) -> OrderedBlockGroups:
    """Build ``OrderedBlockGroups`` from an already sorted list of block groups.

    The boundary table holds every group's ``key_min`` followed by the last
    group's ``key_max``. The last group is rebuilt with ``True`` as its
    fourth ``BlockGroup`` argument; all other groups are reused as-is.

    Raises:
        IndexError: If ``block_groups`` is empty.
    """
    groups = list(block_groups)
    boundaries = [group.key_min for group in groups]

    if groups:
        tail = groups[-1]
        groups[-1] = BlockGroup(
            tail.key_min, tail.key_max, tail.field_group_to_blocks, True
        )
        boundaries.append(groups[-1].key_max)

    return OrderedBlockGroups(
        groups[0].key_min,
        groups[-1].key_max,
        groups,
        boundaries,
    )
|
@@ -0,0 +1,122 @@
|
|
1
|
+
import inspect
|
2
|
+
import os
|
3
|
+
|
4
|
+
from pyarrow import RecordBatch, Table
|
5
|
+
|
6
|
+
from deltacat.storage.rivulet.dataset import Dataset
|
7
|
+
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
|
8
|
+
from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter
|
9
|
+
|
10
|
+
from deltacat.storage.rivulet.mvp.Table import MvpTable, MvpRow
|
11
|
+
from deltacat.storage.rivulet import Schema
|
12
|
+
from typing import Dict, List, Generator, Set
|
13
|
+
|
14
|
+
# Row count shared by test fixtures; not referenced in this module itself
# (presumably imported by the test modules -- confirm against callers).
FIXTURE_ROW_COUNT = 10_000
|
15
|
+
|
16
|
+
|
17
|
+
def write_mvp_table(writer: DatasetWriter, table: MvpTable):
    """Pass all rows of ``table`` to ``writer`` in a single ``write`` call."""
    rows = table.to_rows_list()
    writer.write(rows)
|
19
|
+
|
20
|
+
|
21
|
+
def mvp_table_to_record_batches(table: MvpTable, schema: Schema) -> RecordBatch:
    """Convert ``table`` into one pyarrow ``RecordBatch`` shaped by ``schema``.

    Args:
        table: Source table; its rows are read through ``to_rows_list()``.
        schema: Supplies the column names (``keys()``) and the pyarrow
            schema (``to_pyarrow()``).

    Returns:
        A single ``RecordBatch`` with one column per schema key. A row's
        value for a key is looked up with ``row.get(key)``.
    """
    rows = table.to_rows_list()
    columns = {}
    for column_name in schema.keys():
        columns[column_name] = [row.get(column_name) for row in rows]
    return RecordBatch.from_pydict(columns, schema=schema.to_pyarrow())
|
26
|
+
|
27
|
+
|
28
|
+
def compare_mvp_table_to_scan_results(
    table: MvpTable, scan_results: List[dict], pk: str
):
    """Assert that ``scan_results`` matches ``table`` record for record.

    Args:
        table: Expected data.
        scan_results: Scanned records, each a dict containing key ``pk``.
        pk: Name of the key used to pair scanned records with table rows.

    Raises:
        AssertionError: If the counts differ or a record differs from the
            ``data`` of the table row with the same ``pk`` value.
    """
    expected_count = len(table.to_rows_list())
    assert len(scan_results) == expected_count

    expected_by_pk: Dict[str, MvpRow] = table.to_rows_by_key(pk)
    # Equal sizes also rule out duplicate pk values collapsing in the mapping.
    assert len(expected_by_pk) == len(scan_results)

    for scanned in scan_results:
        assert expected_by_pk[scanned[pk]].data == scanned
|
38
|
+
|
39
|
+
|
40
|
+
def validate_with_full_scan(dataset: Dataset, expected: MvpTable, schema: Schema):
    """Scan all of ``dataset`` and assert the records match ``expected``.

    Records are paired using the first merge key returned by
    ``dataset.get_merge_keys()``. The ``schema`` argument is accepted but
    not used by this function.
    """
    # best way to validate is to use dataset reader and read records
    scanned = list(dataset.scan(QueryExpression()).to_pydict())
    merge_keys = list(dataset.get_merge_keys())
    compare_mvp_table_to_scan_results(expected, scanned, merge_keys[0])
|
46
|
+
|
47
|
+
|
48
|
+
def generate_data_files(dataset: Dataset) -> Generator[str, None, None]:
    """Lazily yield the ``uri`` of every SSTable row in ``dataset``.

    Walks each manifest from ``dataset._metastore.generate_manifests()``,
    then each of its SSTables, then each row of that SSTable.
    """
    for manifest in dataset._metastore.generate_manifests():
        for sstable in manifest.generate_sstables():
            yield from (row.uri for row in sstable.rows)
|
53
|
+
|
54
|
+
|
55
|
+
def assert_data_file_extension(dataset: Dataset, file_extension: str):
    """Assert that ``dataset`` has data files and all end with ``file_extension``.

    Raises:
        AssertionError: If a data file has a different suffix, or if the
            dataset yields no data files.
    """
    data_file_count = 0
    # enumerate(start=1) leaves the running count in data_file_count.
    for data_file_count, uri in enumerate(generate_data_files(dataset), start=1):
        assert uri.endswith(file_extension)

    assert data_file_count > 0, "No data files found in dataset"
    print(f"Asserted that {data_file_count} data files end with {file_extension}")
|
62
|
+
|
63
|
+
|
64
|
+
def assert_data_file_extension_set(dataset: Dataset, file_extension_set: Set[str]):
    """
    Asserts that each file extension in set appears at least once in dataset

    Each data file counts toward at most one extension: the first one in
    ``file_extension_set`` (in iteration order) that its URI ends with.
    """
    found_extensions = set()
    data_file_count = 0

    for data_file_count, uri in enumerate(generate_data_files(dataset), start=1):
        matched = next(
            (ext for ext in file_extension_set if uri.endswith(ext)), None
        )
        if matched is not None:
            found_extensions.add(matched)

    assert data_file_count > 0, "No data files found in dataset"
    assert (
        found_extensions == file_extension_set
    ), f"Missing extensions: {file_extension_set - found_extensions}"
    print(
        f"Asserted that among {data_file_count} data files, all extensions {file_extension_set} were found"
    )
|
85
|
+
|
86
|
+
|
87
|
+
def create_dataset_for_method(temp_dir: str) -> Dataset:
    """
    Given a temp directory, creates a directory within it based on the name of the function calling this.
    Then returns a dataset based from that directory

    Args:
        temp_dir: Existing directory under which the per-caller directory
            is created.

    Returns:
        A ``Dataset`` named ``dataset-<caller function name>`` whose
        ``metadata_uri`` is the newly created directory.

    Raises:
        FileExistsError: If the per-caller directory already exists
            (``os.makedirs`` is called without ``exist_ok``).
    """
    # Index 0 is this function's own frame; index 1 is the immediate caller.
    caller_frame = inspect.getouterframes(inspect.currentframe())[1]
    caller_name = caller_frame.function

    dataset_dir = os.path.join(temp_dir, caller_name)
    os.makedirs(dataset_dir)

    # f-strings interpolate with plain braces; the previous "${...}" form
    # left a literal "$" in every dataset name.
    return Dataset(dataset_name=f"dataset-{caller_name}", metadata_uri=dataset_dir)
|
98
|
+
|
99
|
+
|
100
|
+
def verify_pyarrow_scan(
    scan_result: Generator[RecordBatch, None, None],
    expected_schema: Schema,
    expected_data: dict,
):
    """Assert that a pyarrow scan produced the expected columns and values.

    Args:
        scan_result: Record batches produced by a scan; fully consumed here.
        expected_schema: Schema whose ``values()`` expose ``name`` for each
            expected column.
        expected_data: Mapping of column name to the expected list of values.

    Raises:
        AssertionError: If no batches were returned, the column names differ
            from the expected ones, or a column's values differ.
    """
    batches = list(scan_result)
    assert batches, "Scan should return at least one record batch."

    combined_table = Table.from_batches(batches)

    expected_fields = {schema_field.name for schema_field in expected_schema.values()}
    scanned_fields = set(combined_table.schema.names)
    assert (
        scanned_fields == expected_fields
    ), f"Scanned fields {scanned_fields} do not match expected fields {expected_fields}."

    for field in expected_fields:
        assert (
            field in combined_table.column_names
        ), f"Field '{field}' is missing in the scan result."
        scanned_values = combined_table[field].to_pylist()
        assert (
            scanned_values == expected_data[field]
        ), f"Field '{field}' data does not match expected values."
|
File without changes
|