deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/logical_plan.py
@@ -0,0 +1,105 @@
+from dataclasses import dataclass
+from typing import List, Callable, Any, Protocol
+
+from deltacat.storage.rivulet.dataset_executor import DatasetExecutor
+from deltacat.storage.rivulet.mvp.Table import MvpTable
+from deltacat.storage.rivulet import Schema
+
+
+class DatasetOperation(Protocol):
+    def visit(self, executor: DatasetExecutor):
+        ...
+
+
+@dataclass
+class SelectOperation(DatasetOperation):
+    """
+    Select a subset of fields within the schema
+
+    TODO need better interface for defining selection
+    (e.g. "all fields except X")
+
+    TODO in the future this should support basic filters (e.g. on primary key)
+    """
+
+    fields: List[str]
+
+    def visit(self, executor: DatasetExecutor):
+        executor.select(self.fields)
+
+
+@dataclass
+class MapOperation(DatasetOperation):
+    """
+    Map a function over each record in the dataset
+
+    TODO need more sophistication in the interface of the callable function
+    For now we will be super simple and just call the transform on each record
+    """
+
+    transform: Callable[[Any], Any]
+
+    def visit(self, executor: DatasetExecutor):
+        executor.map(self.transform)
+
+
+class CollectOperation(DatasetOperation):
+    """
+    Materialize dataset
+    """
+
+    def visit(self, executor: DatasetExecutor):
+        executor.collect()
+
+
+class LogicalPlan:
+    """
+    A fluent builder for constructing a sequence of dataset operations.
+
+    This class allows chaining of different dataset operations such as select and map.
+    The actual implementation of these operations is delegated to the Dataset class
+    using the visitor pattern.
+
+    Example:
+        plan = LogicalPlan(schema).select(["name", "age"]).map(lambda record: record["name"])
+
+    The plan can then be executed on a Dataset object, which will apply the
+    operations in the order they were added.
+    """
+
+    def __init__(self, schema: Schema):
+        self.operations: List[DatasetOperation] = []
+        self.schema = schema
+        # Tracks effective schema to perform each operation on
+        self.effective_schema: Schema = schema.__deepcopy__()
+
+    def select(self, filter: List[str]) -> "LogicalPlan":
+        # Validate that select statement is allowed and mutate effective schema for future validations
+        invalid_fields = [
+            field for field in filter if field not in self.effective_schema.fields
+        ]
+        if invalid_fields:
+            raise ValueError(f"Invalid fields: {', '.join(invalid_fields)}")
+
+        # remove fields from effective schema if they are not in chosen fields
+        remove_fields = [
+            field for field in self.effective_schema.keys() if field not in filter
+        ]
+        for field in remove_fields:
+            self.effective_schema.__delitem__(field)
+
+        self.operations.append(SelectOperation(filter))
+        return self
+
+    def map(self, transform: Callable[[dict], dict]) -> "LogicalPlan":
+        self.operations.append(MapOperation(transform))
+        return self
+
+    def collect(self) -> "LogicalPlan":
+        self.operations.append(CollectOperation())
+        return self
+
+    def execute(self, executor: DatasetExecutor) -> "MvpTable":
+        for operation in self.operations:
+            operation.visit(executor)
+        return executor.output
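For context, a minimal usage sketch of the new LogicalPlan builder. This is illustrative only: `schema` is assumed to be a rivulet Schema with fields "id", "name", and "age", and `executor` is assumed to be a DatasetExecutor already bound to an existing rivulet dataset.

from deltacat.storage.rivulet.logical_plan import LogicalPlan

# Build a plan that keeps two fields, transforms each record, and materializes.
plan = (
    LogicalPlan(schema)
    .select(["id", "name"])  # validated against the schema's fields
    .map(lambda record: {**record, "name": record["name"].upper()})
    .collect()
)

# Operations are replayed against the executor via the visitor pattern;
# execute() returns executor.output (an MvpTable).
result = plan.execute(executor)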
deltacat/storage/rivulet/metastore/__init__.py (file without changes)
deltacat/storage/rivulet/metastore/delta.py
@@ -0,0 +1,190 @@
+from __future__ import annotations
+
+from typing import Protocol, NamedTuple, List
+import time
+
+from deltacat.storage import (
+    ManifestMeta,
+    EntryType,
+    DeltaLocator,
+    Delta,
+    DeltaType,
+    Transaction,
+    TransactionType,
+    TransactionOperation,
+    TransactionOperationType,
+)
+from deltacat.storage.model.manifest import Manifest, ManifestEntryList, ManifestEntry
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.storage.model.transaction import TransactionOperationList
+
+from deltacat.storage.model.types import StreamFormat
+from deltacat.storage.rivulet import Schema
+
+StreamPosition = int
+"""The stream position for creating a consistent ordering of manifests."""
+TreeLevel = int
+"""The level of the manifest in the LSM-tree."""
+
+
+class DeltaContext(NamedTuple):
+    """Minimal amount of manifest context that may need to be circulated independently or alongside individual files"""
+
+    # Schema needed to understand which field group was added when writing manifest
+    # TODO in the future we should use something like a field group id and keep schema in dataset-level metadata
+    schema: Schema
+    stream_position: StreamPosition
+    level: TreeLevel
+
+
+class RivuletDelta(dict):
+    """
+    Temporary class during merging of deltacat/rivulet metadata formats
+
+    This class currently serves two purposes:
+    1. Avoid big bang refactor in which consumers of RivuletDelta have to update their code to consume deltacat Delta/Manifest
+    2. Provide more time to figure out how to represent SST files / schema / etc within deltacat constructs
+    """
+
+    context: DeltaContext
+
+    @staticmethod
+    def of(delta: Delta) -> RivuletDelta:
+        riv_delta = RivuletDelta()
+        riv_delta["dcDelta"] = delta
+        schema = Schema.from_dict(delta.get("schema"))
+        riv_delta["DeltaContext"] = DeltaContext(
+            schema, delta.stream_position, delta.get("level")
+        )
+
+        return riv_delta
+
+    @property
+    def dcDelta(self) -> Delta:
+        return self.get("dcDelta")
+
+    @property
+    def sst_files(self) -> List[str]:
+        if "sst_files" not in self.keys():
+            self["sst_files"] = [m.uri for m in self.dcDelta.manifest.entries]
+        return self["sst_files"]
+
+    @sst_files.setter
+    def sst_files(self, files: List[str]):
+        self["sst_files"] = files
+
+    @property
+    def context(self) -> DeltaContext:
+        return self["DeltaContext"]
+
+    @context.setter
+    def context(self, mc: DeltaContext):
+        self["DeltaContext"] = mc
+
+
+class ManifestIO(Protocol):
+    """
+    Minimal interface for reading and writing manifest files
+    """
+
+    def write(
+        self,
+        sst_files: List[str],
+        schema: Schema,
+        level: TreeLevel,
+    ) -> str:
+        ...
+
+    def read(self, file: str) -> RivuletDelta:
+        ...
+
+
+class DeltacatManifestIO(ManifestIO):
+    """
+    Writes manifest data, but by writing to a Deltacat metastore using Deltacat delta/manifest classes.
+    """
+
+    def __init__(self, root: str, locator: PartitionLocator):
+        self.root = root
+        self.locator = locator
+
+    def write(
+        self,
+        sst_files: List[str],
+        schema: Schema,
+        level: TreeLevel,
+    ) -> str:
+        entry_list = ManifestEntryList()
+        """
+        Currently, we use the "data files" manifest entry field for SST files
+        This is a bit of a hack - we should consider how to better model SST files
+        (e.g.: add Manifest entry of type "SST") and decide whether we also need to record data files separately
+        even though they're referenced by SST
+        Ticket: https://github.com/ray-project/deltacat/issues/469
+        """
+        for sst_uri in sst_files:
+            entry_list.append(
+                ManifestEntry.of(
+                    url=sst_uri,
+                    # TODO have rivulet writer populate these values
+                    # see: https://github.com/ray-project/deltacat/issues/476
+                    meta=ManifestMeta.of(
+                        record_count=None,  # or known
+                        content_length=None,
+                        content_type=None,
+                        content_encoding=None,
+                        entry_type=EntryType.DATA,
+                    ),
+                )
+            )
+        dc_manifest = Manifest.of(entries=entry_list)
+
+        # Create delta and transaction which writes manifest to root
+        # TODO replace this with higher level storage interface for deltacat
+        delta_locator = DeltaLocator.at(
+            namespace=self.locator.namespace,
+            table_name=self.locator.table_name,
+            table_version=self.locator.table_version,
+            partition_id=self.locator.partition_id,
+            partition_values=self.locator.partition_values,
+            stream_id=self.locator.stream_id,
+            stream_format=StreamFormat.DELTACAT,
+            # Using nanosecond precision timestamp as stream position
+            # TODO consider having storage interface auto assign stream position
+            stream_position=time.time_ns(),
+        )
+
+        delta = Delta.of(
+            locator=delta_locator,
+            delta_type=DeltaType.APPEND,
+            meta=None,
+            properties={},
+            manifest=dc_manifest,
+        )
+        # TODO later support multiple schemas (https://github.com/ray-project/deltacat/issues/468)
+        delta["schema"] = schema.to_dict()
+        # TODO consider if level should be added as first class key to delta or
+        # kept as specific to storage interface
+        delta["level"] = level
+
+        tx_results = Transaction.of(
+            txn_type=TransactionType.APPEND,
+            txn_operations=TransactionOperationList.of(
+                [
+                    TransactionOperation.of(
+                        operation_type=TransactionOperationType.CREATE,
+                        dest_metafile=delta,
+                    )
+                ]
+            ),
+        ).commit(self.root)
+        paths = tx_results[0]
+        assert (
+            len(paths) == 1
+        ), "expected delta commit transaction to write exactly 1 metafile"
+        return paths[0]
+
+    def read(self, file: str):
+        delta = Delta.read(file)
+        return RivuletDelta.of(delta)
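For context, a minimal sketch of how DeltacatManifestIO might be driven. This is illustrative only: `root` (a metastore root path), `partition_locator` (a PartitionLocator for the target partition), `schema` (a rivulet Schema), and the SST URIs below are all placeholder assumptions.

from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO

manifest_io = DeltacatManifestIO(root, partition_locator)

# Commit a delta whose manifest entries point at the SST files for this write.
delta_path = manifest_io.write(
    sst_files=["file:///tmp/riv/sst/0001.json", "file:///tmp/riv/sst/0002.json"],
    schema=schema,  # stored on the delta under the "schema" key
    level=0,        # L0 of the LSM tree
)

# Read it back as a RivuletDelta wrapper around the deltacat Delta.
rivulet_delta = manifest_io.read(delta_path)
print(rivulet_delta.sst_files)  # URIs recovered from the manifest entries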
deltacat/storage/rivulet/metastore/json_sst.py
@@ -0,0 +1,105 @@
+import logging
+import json
+
+from itertools import zip_longest
+from typing import List
+
+from deltacat.storage.rivulet.fs.input_file import InputFile
+from deltacat.storage.rivulet.fs.output_file import OutputFile
+from deltacat.storage.rivulet.metastore.sst import (
+    SSTWriter,
+    SSTableRow,
+    SSTReader,
+    SSTable,
+)
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class JsonSstWriter(SSTWriter):
+    """
+    Class for reading and writing Json SST files
+
+    TODO use a more efficient format or compression. Also can factor out URI prefix across rows
+    We can also optimize by omitting offset_end if sequential rows use the same uri
+    """
+
+    def write(self, file: OutputFile, rows: List[SSTableRow]) -> None:
+        """
+        Writes SST file
+        """
+        if len(rows) == 0:
+            return
+
+        # File-level metadata for key min/max and offset end
+        min_key = rows[0].key_min
+        max_key = rows[-1].key_max
+        offset_end = rows[-1].offset_end
+
+        # Convert to dict for json serialization
+        file_rows = [
+            {
+                "key_min": row.key_min,
+                "key_max": row.key_max,
+                "offset": row.offset_start,
+                "uri": row.uri,
+            }
+            for row in rows
+        ]
+
+        file_as_dict = {
+            "key_min": min_key,
+            "key_max": max_key,
+            "offset_end": offset_end,
+            "metadata": file_rows,
+        }
+
+        try:
+            with file.create() as f:
+                f.write(json.dumps(file_as_dict).encode())
+            logger.debug(f"SSTable data successfully written to {file.location}")
+        except Exception as e:
+            # TODO better error handling for IO
+            logger.debug(f"Unexpected error occurred while writing SSTable data: {e}")
+            raise e
+
+
+class JsonSstReader(SSTReader):
+    """
+    interface for reading SST files
+    """
+
+    def read(self, file: InputFile) -> SSTable:
+        with file.open() as f:
+            data = json.loads(f.read())
+            sst_rows: List[SSTableRow] = []
+            file_offset_end = data["offset_end"]
+
+            # each row only has "key", "offset", "uri"
+            # need to get "key_end", "offset_end" from either next row
+            # OR from top level metadata
+            for row1, row2 in zip_longest(data["metadata"], data["metadata"][1:]):
+                # if not row2, we are on the very last row and need to use file metadata for key and offset end
+                if not row2:
+                    sst_rows.append(
+                        SSTableRow(
+                            row1["key_min"],
+                            row1["key_max"],
+                            row1["uri"],
+                            row1["offset"],
+                            file_offset_end,
+                        )
+                    )
+                else:
+                    sst_rows.append(
+                        SSTableRow(
+                            row1["key_min"],
+                            row1["key_max"],
+                            row1["uri"],
+                            row1["offset"],
+                            file_offset_end,
+                        )
+                    )
+
+            return SSTable(sst_rows, data["key_min"], data["key_max"])
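For reference, the JSON document JsonSstWriter emits has the following shape, derived directly from the write() code above (keys, URIs, and offsets are illustrative). Per-row entries record only their start offset, so JsonSstReader falls back to the file-level offset_end.

# Shape of the JSON emitted by JsonSstWriter (values illustrative).
example_sst_json = {
    "key_min": "a",       # first key across all rows
    "key_max": "p",       # last key across all rows
    "offset_end": 8192,   # offset_end of the final row
    "metadata": [
        {"key_min": "a", "key_max": "f", "offset": 0, "uri": "file:///data/part-0.feather"},
        {"key_min": "g", "key_max": "p", "offset": 4096, "uri": "file:///data/part-0.feather"},
    ],
}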
deltacat/storage/rivulet/metastore/sst.py
@@ -0,0 +1,82 @@
+from dataclasses import dataclass
+from typing import Protocol, Any, List
+
+from deltacat.storage.rivulet.fs.input_file import InputFile
+from deltacat.storage.rivulet.fs.output_file import OutputFile
+
+
+@dataclass(frozen=True)
+class SSTableRow:
+    """
+    Row of Sorted String Table.
+
+    The metadata for a SSTable row referencing a offset-range of a data file containing data sorted by primary key.
+
+    Note that the actual file format for SSTables can omit some of these fields (e.g. key_end and offset_end) taking
+    advantage of sorted nature of file.
+    """
+
+    key_min: Any
+    """The first primary key found in referenced data range, inclusive."""
+    key_max: Any
+    """the last primary key found in referenced data range, inclusive"""
+    uri: str
+    """The URI of the data file containing this row's data.
+    Will be format dependent, e.g. file://<absolute_path> or s3://<bucket>/<key>
+    Note that if uri_prefix is specified in SSTable, this will just be a postfix
+    """
+    offset_start: int
+    """
+    offset start for data range within uri.
+    Note this offset is format dependent - e.g. for Parquet it will be zero-indexed row group
+    For other formats it will be byte offset into file
+    """
+    offset_end: int
+    """
+    offset end for data range within uri.
+    """
+
+    """The starting offset into the data file for data referenced by this row.
+    Note that offset is format dependent.
+    E.g. for parquet files it is row group, for other formats it is byte offset
+    """
+
+
+@dataclass(frozen=True)
+class SSTable:
+    """
+    In memory representation of Sorted String Table
+
+    List of references to data file ranges with statistics to enable pruning by primary key.
+    """
+
+    rows: List[SSTableRow]
+    """Sorted List of rows by key"""
+    min_key: Any
+    """Minimum observed primary key across all rows."""
+    max_key: Any
+    """Maximum observed primary key across all rows."""
+
+
+class SSTWriter(Protocol):
+    """
+    interface for writing SST files
+
+    Rows may be added iteratively. Input rows (within add_rows batch or across batches) MUST be sorted
+    by key_min
+    """
+
+    def write(self, file: OutputFile, rows: List[SSTableRow]) -> None:
+        """
+        Writes SST file
+        """
+        ...
+
+
+class SSTReader(Protocol):
+    """
+    interface for reading SST files
+    """
+
+    def read(self, file: InputFile) -> SSTable:
+        ...
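For context, a minimal sketch of the in-memory structures defined above (keys, URIs, and offsets are illustrative placeholders).

from deltacat.storage.rivulet.metastore.sst import SSTable, SSTableRow

rows = [
    SSTableRow(key_min="a", key_max="f",
               uri="file:///data/part-0.feather", offset_start=0, offset_end=4096),
    SSTableRow(key_min="g", key_max="p",
               uri="file:///data/part-1.feather", offset_start=0, offset_end=8192),
]
# Rows are sorted by key_min; file-level min/max keys enable pruning by primary key.
table = SSTable(rows=rows, min_key="a", max_key="p")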