deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,308 @@
|
|
1
|
+
import pytest
|
2
|
+
|
3
|
+
from deltacat.storage import (
|
4
|
+
Transaction,
|
5
|
+
TransactionOperation,
|
6
|
+
TransactionType,
|
7
|
+
TransactionOperationType,
|
8
|
+
)
|
9
|
+
from deltacat.storage.model.metafile import (
|
10
|
+
Metafile,
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
class TestAbsToRelative:
|
15
|
+
@classmethod
|
16
|
+
def setup_method(cls):
|
17
|
+
cls.catalog_root = "/catalog/root/path"
|
18
|
+
|
19
|
+
# Test cases for the abs_to_relative function
|
20
|
+
def test_abs_to_relative_simple(self):
|
21
|
+
"""
|
22
|
+
Tests the function which relativizes absolute paths (string) into relative paths (string)
|
23
|
+
"""
|
24
|
+
catalog_root = TestAbsToRelative.catalog_root
|
25
|
+
absolute_path = "/catalog/root/path/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
|
26
|
+
relative_path = Transaction._abs_txn_meta_path_to_relative(
|
27
|
+
catalog_root, absolute_path
|
28
|
+
)
|
29
|
+
assert (
|
30
|
+
relative_path
|
31
|
+
== "namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
|
32
|
+
)
|
33
|
+
|
34
|
+
def test_abs_to_relative_same_paths(self):
|
35
|
+
catalog_root = TestAbsToRelative.catalog_root
|
36
|
+
absolute_path = TestAbsToRelative.catalog_root
|
37
|
+
with pytest.raises(
|
38
|
+
ValueError,
|
39
|
+
match="Target and root are identical, but expected target to be a child of root.",
|
40
|
+
):
|
41
|
+
Transaction._abs_txn_meta_path_to_relative(catalog_root, absolute_path)
|
42
|
+
|
43
|
+
def test_abs_to_relative_root_with_trailing_slash(self):
|
44
|
+
catalog_root = "/catalog/root/path/"
|
45
|
+
absolute_path = "/catalog/root/path/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
|
46
|
+
relative_path = Transaction._abs_txn_meta_path_to_relative(
|
47
|
+
catalog_root, absolute_path
|
48
|
+
)
|
49
|
+
assert (
|
50
|
+
relative_path
|
51
|
+
== "namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
|
52
|
+
)
|
53
|
+
|
54
|
+
def test_abs_to_relative_bad_root(self):
|
55
|
+
catalog_root = TestAbsToRelative.catalog_root
|
56
|
+
absolute_path = "/cat/rt/pth/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
|
57
|
+
with pytest.raises(ValueError, match="Expected target to be a child of root."):
|
58
|
+
Transaction._abs_txn_meta_path_to_relative(catalog_root, absolute_path)
|
59
|
+
|
60
|
+
def test_abs_to_relative_empty_path(self):
|
61
|
+
with pytest.raises(ValueError, match="Expected target to be a child of root."):
|
62
|
+
Transaction._abs_txn_meta_path_to_relative("", "/lorem/ipsum")
|
63
|
+
with pytest.raises(ValueError, match="Expected target to be a child of root."):
|
64
|
+
Transaction._abs_txn_meta_path_to_relative("/lorem/ipsum/", "")
|
65
|
+
|
66
|
+
# Test cases for the relativize_operation_paths function
|
67
|
+
def test_relativize_metafile_write_paths(self):
|
68
|
+
catalog_root = "/catalog/root"
|
69
|
+
absolute_paths = [
|
70
|
+
"/catalog/root/path/to/metafile1.mpk",
|
71
|
+
"/catalog/root/path/to/metafile2.mpk",
|
72
|
+
"/catalog/root/another/path/lore_ipsum.mpk",
|
73
|
+
"/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
|
74
|
+
"/catalog/root/another/path/lorem_ipsum.mpk",
|
75
|
+
"/catalog/root/here.mpk",
|
76
|
+
]
|
77
|
+
expected_relative_paths = [
|
78
|
+
"path/to/metafile1.mpk",
|
79
|
+
"path/to/metafile2.mpk",
|
80
|
+
"another/path/lore_ipsum.mpk",
|
81
|
+
"another/path/meta/to/lorem_ipsum.mpk",
|
82
|
+
"another/path/lorem_ipsum.mpk",
|
83
|
+
"here.mpk",
|
84
|
+
]
|
85
|
+
# Create a dummy transaction operation with absolute paths
|
86
|
+
dest_metafile = Metafile({"id": "dummy_metafile_id"})
|
87
|
+
transaction_operation = TransactionOperation.of(
|
88
|
+
operation_type=TransactionOperationType.CREATE,
|
89
|
+
dest_metafile=dest_metafile,
|
90
|
+
)
|
91
|
+
# use replace method as setter
|
92
|
+
transaction_operation.metafile_write_paths = absolute_paths
|
93
|
+
# Create a transaction and relativize paths
|
94
|
+
transaction = Transaction.of(
|
95
|
+
txn_type=TransactionType.APPEND, txn_operations=[transaction_operation]
|
96
|
+
)
|
97
|
+
transaction.relativize_operation_paths(transaction_operation, catalog_root)
|
98
|
+
# Verify the paths have been correctly relativized
|
99
|
+
assert transaction_operation.metafile_write_paths == expected_relative_paths
|
100
|
+
|
101
|
+
def test_relativize_locator_write_paths(self):
|
102
|
+
catalog_root = "/catalog/root"
|
103
|
+
absolute_paths = [
|
104
|
+
"/catalog/root/path/to/loc1.mpk",
|
105
|
+
"/catalog/root/path/to/loc2.mpk",
|
106
|
+
"/catalog/root/another/path/lore_ipsum.mpk",
|
107
|
+
"/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
|
108
|
+
"/catalog/root/another/path/lorem_ipsum.mpk",
|
109
|
+
"/catalog/root/here.mpk",
|
110
|
+
]
|
111
|
+
expected_relative_paths = [
|
112
|
+
"path/to/loc1.mpk",
|
113
|
+
"path/to/loc2.mpk",
|
114
|
+
"another/path/lore_ipsum.mpk",
|
115
|
+
"another/path/meta/to/lorem_ipsum.mpk",
|
116
|
+
"another/path/lorem_ipsum.mpk",
|
117
|
+
"here.mpk",
|
118
|
+
]
|
119
|
+
# Create a dummy transaction operation with absolute paths
|
120
|
+
dest_metafile = Metafile({"id": "dummy_metafile_id"})
|
121
|
+
transaction_operation = TransactionOperation.of(
|
122
|
+
operation_type=TransactionOperationType.CREATE,
|
123
|
+
dest_metafile=dest_metafile,
|
124
|
+
)
|
125
|
+
# use replace as setter
|
126
|
+
transaction_operation.locator_write_paths = absolute_paths
|
127
|
+
# Create a transaction and relativize paths
|
128
|
+
transaction = Transaction.of(
|
129
|
+
txn_type=TransactionType.APPEND, txn_operations=[transaction_operation]
|
130
|
+
)
|
131
|
+
transaction.relativize_operation_paths(transaction_operation, catalog_root)
|
132
|
+
# Verify the paths have been correctly relativized
|
133
|
+
assert transaction_operation.locator_write_paths == expected_relative_paths
|
134
|
+
|
135
|
+
def test_relativize_metafile_and_locator_paths(self):
|
136
|
+
catalog_root = "/meta_catalog/root_dir/a/b/c"
|
137
|
+
meta_absolute_paths = [
|
138
|
+
"/meta_catalog/root_dir/a/b/c/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
|
139
|
+
"/meta_catalog/root_dir/a/b/c/namespace/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
|
140
|
+
"/meta_catalog/root_dir/a/b/c/namespace/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
|
141
|
+
]
|
142
|
+
loc_absolute_paths = [
|
143
|
+
"/meta_catalog/root_dir/a/b/c/d/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
|
144
|
+
"/meta_catalog/root_dir/a/b/c/e/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
|
145
|
+
"/meta_catalog/root_dir/a/b/c/f/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
|
146
|
+
]
|
147
|
+
meta_relative_paths = [
|
148
|
+
"namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
|
149
|
+
"namespace/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
|
150
|
+
"namespace/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
|
151
|
+
]
|
152
|
+
loc_relative_paths = [
|
153
|
+
"d/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
|
154
|
+
"e/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
|
155
|
+
"f/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
|
156
|
+
]
|
157
|
+
# Create a dummy transaction operation with absolute paths
|
158
|
+
dest_metafile = Metafile({"id": "dummy_metafile_id"})
|
159
|
+
transaction_operation = TransactionOperation.of(
|
160
|
+
operation_type=TransactionOperationType.CREATE,
|
161
|
+
dest_metafile=dest_metafile,
|
162
|
+
)
|
163
|
+
# use replace as setter
|
164
|
+
transaction_operation.metafile_write_paths = meta_absolute_paths
|
165
|
+
transaction_operation.locator_write_paths = loc_absolute_paths
|
166
|
+
# Create a transaction and relativize paths
|
167
|
+
transaction = Transaction.of(
|
168
|
+
txn_type=TransactionType.APPEND, txn_operations=[transaction_operation]
|
169
|
+
)
|
170
|
+
transaction.relativize_operation_paths(transaction_operation, catalog_root)
|
171
|
+
# Verify the paths have been correctly relativized
|
172
|
+
assert (
|
173
|
+
transaction_operation.metafile_write_paths == meta_relative_paths
|
174
|
+
), f"Expected: {meta_relative_paths}, but got: {transaction_operation.metafile_write_paths}"
|
175
|
+
assert (
|
176
|
+
transaction_operation.locator_write_paths == loc_relative_paths
|
177
|
+
), f"Expected: {loc_relative_paths}, but got: {transaction_operation.locator_write_paths}"
|
178
|
+
|
179
|
+
def test_multiple_operations_relativize_paths(self):
|
180
|
+
catalog_root = "/catalog/root"
|
181
|
+
meta_absolute_paths = [
|
182
|
+
"/catalog/root/path/to/metafile1.mpk",
|
183
|
+
"/catalog/root/path/to/metafile2.mpk",
|
184
|
+
"/catalog/root/another/path/lore_ipsum.mpk",
|
185
|
+
"/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
|
186
|
+
"/catalog/root/another/path/lorem_ipsum.mpk",
|
187
|
+
"/catalog/root/here.mpk",
|
188
|
+
]
|
189
|
+
loc_absolute_paths = [
|
190
|
+
"/catalog/root/path/to/loc1.mpk",
|
191
|
+
"/catalog/root/path/to/loc2.mpk",
|
192
|
+
"/catalog/root/another/path/lore_ipsum.mpk",
|
193
|
+
"/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
|
194
|
+
"/catalog/root/another/path/lorem_ipsum.mpk",
|
195
|
+
"/catalog/root/here.mpk",
|
196
|
+
]
|
197
|
+
meta_expected_relative_paths = [
|
198
|
+
"path/to/metafile1.mpk",
|
199
|
+
"path/to/metafile2.mpk",
|
200
|
+
"another/path/lore_ipsum.mpk",
|
201
|
+
"another/path/meta/to/lorem_ipsum.mpk",
|
202
|
+
"another/path/lorem_ipsum.mpk",
|
203
|
+
"here.mpk",
|
204
|
+
]
|
205
|
+
loc_expected_relative_paths = [
|
206
|
+
"path/to/loc1.mpk",
|
207
|
+
"path/to/loc2.mpk",
|
208
|
+
"another/path/lore_ipsum.mpk",
|
209
|
+
"another/path/meta/to/lorem_ipsum.mpk",
|
210
|
+
"another/path/lorem_ipsum.mpk",
|
211
|
+
"here.mpk",
|
212
|
+
]
|
213
|
+
# Create a dummy transaction operation with absolute paths
|
214
|
+
dest_metafile = Metafile({"id": "dummy_metafile_id"})
|
215
|
+
transaction_operations = []
|
216
|
+
for i in range(11):
|
217
|
+
transaction_operation = TransactionOperation.of(
|
218
|
+
operation_type=TransactionOperationType.CREATE,
|
219
|
+
dest_metafile=dest_metafile,
|
220
|
+
)
|
221
|
+
transaction_operation.metafile_write_paths = meta_absolute_paths
|
222
|
+
transaction_operation.locator_write_paths = loc_absolute_paths
|
223
|
+
transaction_operations.append(transaction_operation)
|
224
|
+
# Create a transaction and relativize paths
|
225
|
+
transaction = Transaction.of(
|
226
|
+
txn_type=TransactionType.APPEND, txn_operations=transaction_operations
|
227
|
+
)
|
228
|
+
for operation in transaction_operations:
|
229
|
+
transaction.relativize_operation_paths(operation, catalog_root)
|
230
|
+
# Verify the paths have been correctly relativized
|
231
|
+
for operation in transaction_operations:
|
232
|
+
assert operation.metafile_write_paths == meta_expected_relative_paths
|
233
|
+
assert operation.locator_write_paths == loc_expected_relative_paths
|
234
|
+
|
235
|
+
def test_empty_metafile_and_locator_write_paths(self):
|
236
|
+
catalog_root = "/catalog/root"
|
237
|
+
transaction_operation = TransactionOperation.of(
|
238
|
+
operation_type=TransactionOperationType.CREATE,
|
239
|
+
dest_metafile=Metafile({"id": "dummy_metafile_id"}),
|
240
|
+
)
|
241
|
+
# Empty paths
|
242
|
+
transaction_operation.metafile_write_paths = []
|
243
|
+
transaction_operation.locator_write_paths = []
|
244
|
+
transaction = Transaction.of(
|
245
|
+
txn_type=TransactionType.APPEND, txn_operations=[transaction_operation]
|
246
|
+
)
|
247
|
+
transaction.relativize_operation_paths(transaction_operation, catalog_root)
|
248
|
+
assert transaction_operation.metafile_write_paths == []
|
249
|
+
assert transaction_operation.locator_write_paths == []
|
250
|
+
|
251
|
+
def test_large_number_of_paths(self):
|
252
|
+
catalog_root = "/catalog/root"
|
253
|
+
absolute_paths = [f"/catalog/root/path/to/file{i}.mpk" for i in range(5000)]
|
254
|
+
expected_paths = [f"path/to/file{i}.mpk" for i in range(5000)]
|
255
|
+
transaction_operation = TransactionOperation.of(
|
256
|
+
operation_type=TransactionOperationType.CREATE,
|
257
|
+
dest_metafile=Metafile({"id": "dummy_metafile_id"}),
|
258
|
+
)
|
259
|
+
transaction_operation.metafile_write_paths = absolute_paths
|
260
|
+
transaction = Transaction.of(
|
261
|
+
txn_type=TransactionType.APPEND, txn_operations=[transaction_operation]
|
262
|
+
)
|
263
|
+
transaction.relativize_operation_paths(transaction_operation, catalog_root)
|
264
|
+
assert transaction_operation.metafile_write_paths == expected_paths
|
265
|
+
|
266
|
+
def test_large_number_of_paths_multi_ops(self):
|
267
|
+
catalog_root = "/catalog/root"
|
268
|
+
absolute_paths = [f"/catalog/root/path/to/file{i}.mpk" for i in range(1000)]
|
269
|
+
expected_paths = [f"path/to/file{i}.mpk" for i in range(1000)]
|
270
|
+
|
271
|
+
# Different operation types to test
|
272
|
+
operation_types = [
|
273
|
+
TransactionOperationType.CREATE,
|
274
|
+
# TransactionOperationType.UPDATE,
|
275
|
+
TransactionOperationType.DELETE,
|
276
|
+
TransactionOperationType.READ_EXISTS,
|
277
|
+
TransactionOperationType.READ_LATEST,
|
278
|
+
TransactionOperationType.READ_CHILDREN,
|
279
|
+
TransactionOperationType.READ_SIBLINGS,
|
280
|
+
]
|
281
|
+
|
282
|
+
# Different transaction types to test
|
283
|
+
txn_types = [
|
284
|
+
TransactionType.APPEND,
|
285
|
+
TransactionType.ALTER,
|
286
|
+
TransactionType.DELETE,
|
287
|
+
TransactionType.OVERWRITE,
|
288
|
+
TransactionType.READ,
|
289
|
+
TransactionType.RESTATE,
|
290
|
+
]
|
291
|
+
|
292
|
+
for txn_type in txn_types:
|
293
|
+
transaction_ops = []
|
294
|
+
for op_type in operation_types:
|
295
|
+
transaction_operation = TransactionOperation.of(
|
296
|
+
operation_type=op_type,
|
297
|
+
dest_metafile=Metafile({"id": "dummy_metafile_id"}),
|
298
|
+
)
|
299
|
+
transaction_operation.metafile_write_paths = absolute_paths
|
300
|
+
transaction_ops.append(transaction_operation)
|
301
|
+
transaction = Transaction.of(
|
302
|
+
txn_type=txn_type, txn_operations=[transaction_operation]
|
303
|
+
)
|
304
|
+
transaction.relativize_operation_paths(transaction_operation, catalog_root)
|
305
|
+
# Assert paths are relativized correctly
|
306
|
+
assert (
|
307
|
+
transaction_operation.metafile_write_paths == expected_paths
|
308
|
+
), f"Failed for transaction type {txn_type} and operation type {op_type}"
|
File without changes
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import io
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
from faker import Faker
|
5
|
+
|
6
|
+
from deltacat.storage.rivulet.schema.datatype import Datatype
|
7
|
+
from deltacat.storage.rivulet.mvp.Table import MvpTable
|
8
|
+
from deltacat.storage.rivulet.schema.schema import Schema
|
9
|
+
import random
|
10
|
+
import string
|
11
|
+
from PIL import Image
|
12
|
+
|
13
|
+
FIXTURE_ROW_COUNT = 10000
|
14
|
+
|
15
|
+
|
16
|
+
@pytest.fixture
|
17
|
+
def ds1_dataset() -> MvpTable:
|
18
|
+
"""
|
19
|
+
dataset with one million rows
|
20
|
+
primary key is integer between 1 and 1,000,000
|
21
|
+
|
22
|
+
TODO change to user Faker instead of int ranges
|
23
|
+
"""
|
24
|
+
|
25
|
+
# Function to generate random names
|
26
|
+
def generate_random_name():
|
27
|
+
return "".join(
|
28
|
+
random.choices(
|
29
|
+
string.ascii_uppercase + string.ascii_lowercase, k=random.randint(3, 10)
|
30
|
+
)
|
31
|
+
)
|
32
|
+
|
33
|
+
# Create a list of numbers from 1 to TEST_ROW_COUNT
|
34
|
+
ids = list(range(1, FIXTURE_ROW_COUNT + 1))
|
35
|
+
random.shuffle(ids)
|
36
|
+
|
37
|
+
# Generate one million rows
|
38
|
+
return MvpTable(
|
39
|
+
{
|
40
|
+
"id": ids,
|
41
|
+
"name": [generate_random_name() for _ in range(FIXTURE_ROW_COUNT)],
|
42
|
+
"age": [random.randint(18, 100) for _ in range(FIXTURE_ROW_COUNT)],
|
43
|
+
}
|
44
|
+
)
|
45
|
+
|
46
|
+
|
47
|
+
@pytest.fixture
|
48
|
+
def ds1_schema():
|
49
|
+
return Schema(
|
50
|
+
{
|
51
|
+
("id", Datatype.int32()),
|
52
|
+
("name", Datatype.string()),
|
53
|
+
("age", Datatype.int32()),
|
54
|
+
},
|
55
|
+
"id",
|
56
|
+
)
|
57
|
+
|
58
|
+
|
59
|
+
@pytest.fixture
|
60
|
+
def ds2_dataset():
|
61
|
+
"""
|
62
|
+
dataset2 with one million rows that can be joined to ds1
|
63
|
+
primary key is integer between 1 and 1,000,000
|
64
|
+
"""
|
65
|
+
# Create a list of numbers from 1 to 1,000,000
|
66
|
+
ids = list(range(1, FIXTURE_ROW_COUNT + 1))
|
67
|
+
random.shuffle(ids)
|
68
|
+
|
69
|
+
fake = Faker()
|
70
|
+
|
71
|
+
# Generate one million rows
|
72
|
+
return MvpTable(
|
73
|
+
{
|
74
|
+
"id": ids,
|
75
|
+
"address": [fake.address() for _ in range(FIXTURE_ROW_COUNT)],
|
76
|
+
"zip": [fake.zipcode() for _ in range(FIXTURE_ROW_COUNT)],
|
77
|
+
}
|
78
|
+
)
|
79
|
+
|
80
|
+
|
81
|
+
@pytest.fixture
|
82
|
+
def ds2_schema():
|
83
|
+
return Schema(
|
84
|
+
{
|
85
|
+
("id", Datatype.int32()),
|
86
|
+
("address", Datatype.string()),
|
87
|
+
("zip", Datatype.string()),
|
88
|
+
},
|
89
|
+
"id",
|
90
|
+
)
|
91
|
+
|
92
|
+
|
93
|
+
@pytest.fixture
|
94
|
+
def combined_schema(ds1_schema, ds2_schema):
|
95
|
+
return Schema(
|
96
|
+
{
|
97
|
+
("id", Datatype.int32()),
|
98
|
+
("address", Datatype.string()),
|
99
|
+
("zip", Datatype.string()),
|
100
|
+
("name", Datatype.string()),
|
101
|
+
("age", Datatype.int32()),
|
102
|
+
},
|
103
|
+
"id",
|
104
|
+
)
|
105
|
+
|
106
|
+
|
107
|
+
@pytest.fixture
|
108
|
+
def dataset_images_with_label() -> (MvpTable, Schema):
|
109
|
+
"""
|
110
|
+
dataset with one thousand images and labels, generated dynamically
|
111
|
+
primary key is integer between 1 and 1,000
|
112
|
+
"""
|
113
|
+
ROW_COUNT = 1000
|
114
|
+
fake = Faker()
|
115
|
+
schema = Schema(
|
116
|
+
{
|
117
|
+
("id", Datatype.int32()),
|
118
|
+
("image", Datatype.image("jpg")),
|
119
|
+
("label", Datatype.string()),
|
120
|
+
},
|
121
|
+
"id",
|
122
|
+
)
|
123
|
+
|
124
|
+
# Create a list of numbers from 1 to ROW_COUNT
|
125
|
+
ids = list(range(1, ROW_COUNT + 1))
|
126
|
+
random.shuffle(ids)
|
127
|
+
|
128
|
+
fake_image = Image.new(
|
129
|
+
"RGB",
|
130
|
+
(512, 512),
|
131
|
+
color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
|
132
|
+
)
|
133
|
+
# get bytes from image encoded as png
|
134
|
+
buffer = io.BytesIO()
|
135
|
+
fake_image.save(buffer, format="PNG")
|
136
|
+
# seek to start of buffer since we just wrote to it
|
137
|
+
buffer.seek(0)
|
138
|
+
image_bytes = buffer.read()
|
139
|
+
# Generate one million rows
|
140
|
+
return (
|
141
|
+
MvpTable(
|
142
|
+
{
|
143
|
+
"id": ids,
|
144
|
+
"image": [image_bytes for _ in range(ROW_COUNT)],
|
145
|
+
"label": [fake.name() for _ in range(ROW_COUNT)],
|
146
|
+
}
|
147
|
+
),
|
148
|
+
schema,
|
149
|
+
)
|
File without changes
|
@@ -0,0 +1,93 @@
|
|
1
|
+
import pytest
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
import pyarrow.parquet as pq
|
5
|
+
from deltacat import Datatype, Dataset
|
6
|
+
from deltacat.storage.rivulet import Schema, Field
|
7
|
+
from deltacat.utils.metafile_locator import _find_partition_path
|
8
|
+
|
9
|
+
|
10
|
+
@pytest.fixture
|
11
|
+
def sample_schema():
|
12
|
+
return Schema(
|
13
|
+
fields=[
|
14
|
+
Field("id", Datatype.int32(), is_merge_key=True),
|
15
|
+
Field("name", Datatype.string()),
|
16
|
+
Field("age", Datatype.int32()),
|
17
|
+
]
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
@pytest.fixture
|
22
|
+
def sample_pydict():
|
23
|
+
return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}
|
24
|
+
|
25
|
+
|
26
|
+
@pytest.fixture
|
27
|
+
def temp_storage_path(tmp_path):
|
28
|
+
return tmp_path
|
29
|
+
|
30
|
+
|
31
|
+
@pytest.fixture
|
32
|
+
def sample_parquet_data(temp_storage_path, sample_pydict):
|
33
|
+
parquet_path = temp_storage_path / "test.parquet"
|
34
|
+
table = pa.Table.from_pydict(sample_pydict)
|
35
|
+
pq.write_table(table, parquet_path)
|
36
|
+
return parquet_path
|
37
|
+
|
38
|
+
|
39
|
+
@pytest.fixture
|
40
|
+
def dataset(sample_parquet_data):
|
41
|
+
return Dataset.from_parquet(
|
42
|
+
file_uri=sample_parquet_data, name="dataset", merge_keys="id"
|
43
|
+
)
|
44
|
+
|
45
|
+
|
46
|
+
@pytest.fixture
|
47
|
+
def file_provider(dataset):
|
48
|
+
return dataset._file_provider
|
49
|
+
|
50
|
+
|
51
|
+
def test_provide_data_file(file_provider):
|
52
|
+
output_file = file_provider.provide_data_file("parquet")
|
53
|
+
assert "data" in output_file.location
|
54
|
+
assert output_file.location.endswith(".parquet")
|
55
|
+
|
56
|
+
output_file2 = file_provider.provide_data_file("parquet")
|
57
|
+
assert "data" in output_file2.location
|
58
|
+
assert output_file2.location.endswith(".parquet")
|
59
|
+
|
60
|
+
assert (
|
61
|
+
output_file.location != output_file2.location
|
62
|
+
), "Two output files should have different locations."
|
63
|
+
|
64
|
+
|
65
|
+
def test_provide_manifest_file(file_provider):
|
66
|
+
output_file = file_provider.provide_manifest_file()
|
67
|
+
assert "metadata/manifests" in output_file.location
|
68
|
+
assert output_file.location.endswith(".json")
|
69
|
+
|
70
|
+
|
71
|
+
def test_provide_l0_sst_file(file_provider):
|
72
|
+
output_file = file_provider.provide_l0_sst_file()
|
73
|
+
assert "metadata/ssts/0" in output_file.location
|
74
|
+
assert output_file.location.endswith(".json")
|
75
|
+
|
76
|
+
|
77
|
+
def test_provide_input_file(file_provider, sample_parquet_data):
|
78
|
+
input_file = file_provider.provide_input_file(str(sample_parquet_data))
|
79
|
+
assert input_file.location == str(sample_parquet_data)
|
80
|
+
|
81
|
+
|
82
|
+
def test_generate_sst_uris(file_provider):
|
83
|
+
generated_files = list(file_provider.generate_sst_uris())
|
84
|
+
for file in generated_files:
|
85
|
+
assert "metadata/ssts/0" in file.location
|
86
|
+
assert file.location.endswith(".json")
|
87
|
+
|
88
|
+
|
89
|
+
def test_get_scan_directories(file_provider):
|
90
|
+
partition_path = _find_partition_path(file_provider.uri, file_provider._locator)
|
91
|
+
assert file_provider.get_sst_scan_directories() == [
|
92
|
+
f"{partition_path}/metadata/ssts/0/"
|
93
|
+
]
|
File without changes
|