deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/schema/datatype.py

@@ -0,0 +1,128 @@
+# Similar to daft's datatype, this is a big ole enum of all possible types
+# In the long term, this will have to be interoperable with pandas/daft/spark/parquet/iceberg/etc type systems
+# Our Spec will need to publish data type mappings, such as Iceberg's data type mappings: https://iceberg.apache.org/spec/#file-system-operations
+# It also has the unique responsibility of representing multi-modal (e.g. image) types
+from dataclasses import dataclass
+from typing import Optional
+
+import pyarrow as pa
+
+
+# OPEN QUESTIONS:
+# Do we want to support the notion of logical vs physical type like parquet?
+
+# TODO turn into an interface or otherwise allow pluggable datatypes
+@dataclass(frozen=True)
+class Datatype:
+    type_name: str
+
+    @property
+    def subtype(self) -> Optional[str]:
+        """
+        Higher level formats like binary or image will have "subtype", such as image(jpg) or binary(np_array)
+        TODO - Note that we are replacing this schema system with DeltaCat schema model, which supports extended/decorated pyarrow types
+        For now going to do a super minimal/hacky implementation of types like binary and image, where
+        :return: Subtype if it exists, or None
+        """
+        if not self.type_name.endswith(")"):
+            return None
+        if self.type_name.startswith("binary(") or self.type_name.startswith("image("):
+            return self.type_name[self.type_name.find("(") + 1 : -1]
+        return None
+
+    @classmethod
+    def binary(cls, binary_format):
+        """
+        :param binary_format:
+        :return:
+        """
+        return cls(type_name=f"binary({binary_format})")
+
+    @classmethod
+    def image(cls, image_format):
+        return cls(type_name=f"image({image_format})")
+
+    @classmethod
+    def string(cls):
+        return cls(type_name="string")
+
+    @classmethod
+    def float(cls):
+        return cls(type_name="float")
+
+    @classmethod
+    def int16(cls):
+        return cls(type_name="int16")
+
+    @classmethod
+    def int32(cls):
+        return cls(type_name="int32")
+
+    @classmethod
+    def int64(cls):
+        return cls(type_name="int64")
+
+    @classmethod
+    def bool(cls):
+        return cls(type_name="bool")
+
+    @classmethod
+    def from_pyarrow(cls, pa_type: pa.DataType) -> "Datatype":
+        """
+        Convert a pa type to a Rivulet Datatype.
+
+        Args:
+            pa_type: pa DataType to convert
+
+        Returns:
+            Datatype: Corresponding Rivulet Datatype
+
+        Raises:
+            ValueError: If the pa type is not supported
+        """
+        if pa.types.is_string(pa_type):
+            return cls.string()
+        elif pa.types.is_float64(pa_type):
+            return cls.float()
+        elif pa.types.is_int16(pa_type):
+            return cls.int16()
+        elif pa.types.is_int32(pa_type):
+            return cls.int32()
+        elif pa.types.is_int64(pa_type):
+            return cls.int64()
+        elif pa.types.is_boolean(pa_type):
+            return cls.bool()
+        elif pa.types.is_binary(pa_type):
+            # TODO: Use pyarrow metadata on schema field to map correctly into image and other binary types
+            return cls.binary("binary")  # Default binary format
+        else:
+            raise ValueError(f"Unsupported pa type: {pa_type}")
+
+    def to_pyarrow(self) -> pa.field:
+        """
+        In the future we want to be more thoughtful about how we do type conversions
+
+        For now, just build a simple mapping of every time to pyarrow
+        For what it's worth, Daft schema types have a giant if/else like this
+
+        :return: pyarrow type
+        """
+        if self.type_name == "string":
+            return pa.string()
+        elif self.type_name == "float":
+            return pa.float64()
+        elif self.type_name == "int16":
+            return pa.int16()
+        elif self.type_name == "int32":
+            return pa.int32()
+        elif self.type_name == "int64":
+            return pa.int64()
+        elif self.type_name == "bool":
+            return pa.bool_()
+        elif self.type_name.startswith("image(") or self.type_name.startswith(
+            "binary("
+        ):
+            # TODO we will need to think about how custom types work with tabular libraries
+            return pa.binary()
+        else:
+            raise ValueError(f"Unsupported type conversion to pa: {self.type_name}")
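A minimal usage sketch of the Datatype API added above, for readers skimming the diff. It is not part of the package contents; it assumes deltacat 2.0 is installed, and the example values ("jpg", int32/int64) are arbitrary:

import pyarrow as pa
from deltacat.storage.rivulet.schema.datatype import Datatype

# Parameterized types carry their format as a parsed subtype.
jpg = Datatype.image("jpg")
assert jpg.type_name == "image(jpg)" and jpg.subtype == "jpg"

# Primitive types round-trip through PyArrow; image/binary types collapse to pa.binary().
assert Datatype.from_pyarrow(pa.int32()) == Datatype.int32()
assert Datatype.int64().to_pyarrow() == pa.int64()
assert jpg.to_pyarrow() == pa.binary()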
deltacat/storage/rivulet/schema/schema.py

@@ -0,0 +1,251 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, asdict
+from typing import MutableMapping, Dict, Iterable, Tuple, Optional
+
+import pyarrow as pa
+
+from deltacat.storage.rivulet.schema.datatype import Datatype
+
+
+@dataclass(frozen=True)
+class Field:
+    name: str
+    datatype: Datatype
+    is_merge_key: bool = False
+
+
+class Schema(MutableMapping[str, Field]):
+    """
+    A mutable mapping representing a schema for structured data, requiring at least one merge key field.
+
+    TODO FUTURE ITERATIONS
+    1. We may use Deltacat for schema
+    2. We almost certainly want our schema system based on arrow types,
+        since many libraries we are integrating with (e.g. daft) are
+        interoperable with arrow schemas
+
+    Attributes:
+        name: The name of the schema (for storing in dict/map)
+        _fields (dict): Maps field names to Field objects.
+
+    Methods:
+        from_pyarrow(pyarrow_schema: pa.Schema, key: str) -> Schema:
+            Creates a Schema instance from a PyArrow schema.
+
+        __len__() -> int: Returns number of fields.
+        __getitem__(key: str) -> Field: Gets field by name.
+        __setitem__(key: str, value: Field | Datatype): Adds/updates field.
+        __delitem__(key: str): Deletes field if not a merge key.
+        __iter__(): Iterates over fields.
+
+        add_field(field: Field): Adds a Field using its name as the key.
+        to_pyarrow() -> pa.Schema:
+            Converts schema to PyArrow format.
+
+        keys(): Returns field names.
+        values(): Returns Field objects.
+        items(): Returns (name, Field) pairs.
+    """
+
+    def __init__(
+        self,
+        fields: Iterable[Tuple[str, Datatype] | Field] = None,
+        merge_keys: Optional[Iterable[str]] = None,
+    ):
+        self._fields: Dict[str, Field] = {}
+        merge_keys = merge_keys or {}
+        if len(fields or []) == 0:
+            if len(merge_keys) > 0:
+                raise TypeError(
+                    "It is invalid to specify merge keys when no fields are specified. Add fields or remove the merge keys."
+                )
+            return
+        # Convert all input tuples to Field objects and add to fields
+        for field in fields:
+            if isinstance(field, tuple):
+                name, datatype = field
+                processed_field = Field(
+                    name=name, datatype=datatype, is_merge_key=(name in merge_keys)
+                )
+            elif isinstance(field, Field):
+                processed_field = field
+                name = field.name
+                # Check if merge key status conflicts
+                if len(merge_keys) > 0:
+                    expected_merge_key_status = name in merge_keys
+                    if processed_field.is_merge_key != expected_merge_key_status:
+                        raise TypeError(
+                            f"Merge key status conflict for field '{name}': "
+                            f"Provided as merge key: {expected_merge_key_status}, "
+                            f"Field's current status: {processed_field.is_merge_key}. "
+                            f"Merge keys should only be defined if raw (name, Datatype) tuples are used."
+                        )
+            else:
+                raise TypeError(f"Unexpected field type: {type(field)}")
+            self.add_field(processed_field)
+
+    @classmethod
+    def from_dict(cls, data) -> Schema:
+        fields = [
+            Field(
+                name=field_data["name"],
+                datatype=Datatype(**field_data["datatype"])
+                if isinstance(field_data["datatype"], dict)
+                else field_data["datatype"],
+                is_merge_key=field_data["is_merge_key"],
+            )
+            for field_data in data["fields"]
+        ]
+        return cls(fields)
+
+    @classmethod
+    def from_pyarrow(
+        cls, pyarrow_schema: pa.Schema, merge_keys: str | Iterable[str] = None
+    ) -> Schema:
+        """
+        Create a Schema instance from a PyArrow schema.
+
+        Args:
+            pyarrow_schema: PyArrow Schema to convert
+            merge_keys: The optional set of merge keys to add to the schema as it's being translated.
+                These keys must be present in the schema.
+
+        Returns:
+            Schema: New Schema instance
+
+        Raises:
+            ValueError: If key is not found in schema
+        """
+        merge_keys = [merge_keys] if isinstance(merge_keys, str) else merge_keys
+        fields = {}
+
+        for field in pyarrow_schema:
+            dtype = Datatype.from_pyarrow(field.type)
+            fields[field.name] = Field(
+                field.name, dtype, is_merge_key=(field.name in merge_keys)
+            )
+
+        # Validate that the defined merge_keys are present in the fields being added
+        missing_keys = merge_keys - fields.keys()
+        if missing_keys:
+            raise ValueError(
+                f"The following merge keys not found in the provided schema: {', '.join(missing_keys)}"
+            )
+
+        return cls(fields.values())
+
+    @classmethod
+    def merge_all(cls, schemas: Iterable[Schema]) -> Schema:
+        """Merges a list of schemas into a new schema"""
+        merged = cls({})
+        for schema in schemas:
+            merged.merge(schema)
+        return merged
+
+    def __getitem__(self, key: str) -> Field:
+        return self._fields[key]
+
+    def __setitem__(
+        self, key: str, value: Field | Datatype | Tuple[Datatype, bool]
+    ) -> None:
+        # Create field from [str, Datatype, bool] where bool is merge_key
+        if isinstance(value, Field):
+            processed_field = value
+        elif isinstance(value, Datatype):
+            processed_field = Field(
+                key, value
+            )  # is_merge_key is always false in this case
+        elif isinstance(value, tuple):
+            (datatype, merge_key) = value
+            processed_field = Field(key, datatype, merge_key)
+        else:
+            raise TypeError(
+                "The field must be an instance of the Field class, Datatype, or Tuple[Datatype, bool], where bool is whether the field is a merge key."
+            )
+        processed_field: Field = processed_field
+        # if len(self._fields) == 0 and not processed_field.is_merge_key:
+        #     raise TypeError("The first field set on a Schema must be a merge key.")
+
+        self._fields[processed_field.name] = processed_field
+
+    def __delitem__(self, key: str) -> None:
+        field = self._fields[key]
+        if field.is_merge_key:
+            raise ValueError("Cannot delete a merge key field")
+        del self._fields[key]
+
+    def __len__(self) -> int:
+        return len(self._fields)
+
+    def __iter__(self) -> Iterable[str]:
+        return iter(self._fields.keys())
+
+    def __hash__(self) -> int:
+        return hash((frozenset(self._fields.items())))
+
+    def __eq__(self, other) -> bool:
+        if isinstance(other, Schema):
+            return self._fields == other._fields
+        return False
+
+    # Has a spurious type check problem in @dataclass + asdict(): https://youtrack.jetbrains.com/issue/PY-76059/Incorrect-Type-warning-with-asdict-and-Dataclass
+    def to_dict(self) -> dict[str, list[dict[str, Field]]]:
+        return {"fields": [asdict(field) for field in self._fields.values()]}
+
+    def add_field(self, field: Field) -> None:
+        """Adds a Field object using its name as the key, raises ValueError if it already exists"""
+        if field.name in self._fields:
+            raise ValueError(
+                f"Attempting to add a field with the same name as an existing field: {field.name}"
+            )
+        self[field.name] = field
+
+    def get_merge_keys(self) -> Iterable[str]:
+        """Return a list of all merge keys."""
+        return [field.name for field in self._fields.values() if field.is_merge_key]
+
+    def get_merge_key(self) -> str:
+        """Returns a single merge key if there is one, or raises if not. Used for simple schemas w/ a single key"""
+        # Get the merge key
+        merge_keys = list(self.get_merge_keys())
+        if len(merge_keys) != 1:
+            raise ValueError(
+                f"Schema must have exactly one merge key, but found {merge_keys}"
+            )
+        return merge_keys[0]
+
+    def merge(self, other: Schema) -> None:
+        """Merges another schema's fields into the current schema."""
+        if not other:
+            return
+        for name, field in other._fields.items():
+            if name in self._fields:
+                if self._fields[name] != field:
+                    raise ValueError(
+                        f"Field '{name}' already exists in the current schema with different definition"
+                    )
+            else:
+                self.add_field(field)
+
+    def to_pyarrow(self) -> pa.Schema:
+        """
+        Convert the Schema to a PyArrow schema.
+
+        Returns:
+            pyarrow.schema: A PyArrow schema representation of this Schema.
+        """
+        # TODO: Should we track merge_keys as it goes to/from pyarrow?
+        fields = []
+        for name, field in self._fields.items():
+            fields.append(pa.field(name, field.datatype.to_pyarrow()))
+        return pa.schema(fields)
+
+    def keys(self) -> Iterable[str]:
+        return self._fields.keys()
+
+    def values(self) -> Iterable[Field]:
+        return self._fields.values()
+
+    def items(self) -> Iterable[tuple[str, Field]]:
+        return self._fields.items()
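A short sketch of how the new Schema and Field types compose with the Datatype API. This is illustrative only, not part of the diff; the field names "id" and "name" are arbitrary, and it assumes deltacat 2.0 is installed:

import pyarrow as pa
from deltacat.storage.rivulet.schema.datatype import Datatype
from deltacat.storage.rivulet.schema.schema import Schema

# Build a schema from (name, Datatype) tuples; "id" is declared as the merge key.
schema = Schema(
    [("id", Datatype.int64()), ("name", Datatype.string())],
    merge_keys=["id"],
)
assert schema.get_merge_key() == "id"

# Convert to PyArrow; merge-key status is not carried over, so it is
# re-declared when translating a PyArrow schema back.
pa_schema = schema.to_pyarrow()
assert pa_schema.field("id").type == pa.int64()
round_tripped = Schema.from_pyarrow(pa_schema, merge_keys="id")
assert round_tripped.get_merge_key() == "id"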
deltacat/storage/rivulet/serializer.py

@@ -0,0 +1,40 @@
+from typing import Protocol, Iterable, List, Union, Any, Dict
+
+from deltacat.storage.rivulet.metastore.sst import SSTableRow
+import pyarrow as pa
+
+MEMTABLE_DATA = Union[Iterable[Dict[str, Any]], pa.Table]
+
+
+class DataSerializer(Protocol):
+    """
+    Interface for writing data only.
+
+    As data is written, it must emit sufficient metadata to build SSTable
+    Each format will have a specific data writer (e.g. ParquetDataWriter)
+
+    TODO future improvements:
+    1. How does data writer control how it chooses to write to existing files vs new files?
+        For now, we will not expose this configuration and always write each batch to
+        a new file
+    2. Related to 1, how should we expose URI(s) to write to? Probably DataWriter can
+        use FileProvider and needs to know relevant ids like task ID.
+    """
+
+    def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
+        """
+        Flushes rows to file, and return appropriate metadata to build SSTable
+
+        TODO future improvements
+        1. Finalize type for input records (instead of MvpRow)
+
+        Options could be:
+        (a) Something like Iceberg "StructLike" which allows flexible integrations without memcopy for row-oriented formats, e.g. can make Spark InternalRow structlike
+        (b) use arrow. We will probably use arrow for writing parquet, although
+        probably it isn't ideal for row oriented formats
+        2. Keep in mind, most implementation of DataWriter will be written in rust.
+
+        :param sorted_records: Records sorted by key
+        :return: metadata used to build SS Table
+        """
+        ...
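Callers are expected to depend only on this protocol. A hedged sketch of a format-agnostic helper (the helper and its name are illustrative, not part of the package):

from typing import List

from deltacat.storage.rivulet.metastore.sst import SSTableRow
from deltacat.storage.rivulet.serializer import MEMTABLE_DATA, DataSerializer


def flush_sorted_batch(serializer: DataSerializer, batch: MEMTABLE_DATA) -> List[SSTableRow]:
    # Illustrative helper, not part of deltacat: any concrete serializer (Parquet,
    # Feather, ...) writes the sorted batch to a new file and returns the SSTable
    # row metadata needed to index it.
    return serializer.flush_batch(batch)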
deltacat/storage/rivulet/serializer_factory.py

@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from deltacat.storage.rivulet.parquet.serializer import ParquetDataSerializer
+from deltacat.storage.rivulet import Schema
+from deltacat.storage.rivulet.serializer import DataSerializer
+from deltacat.storage.rivulet.fs.file_provider import FileProvider
+
+from deltacat.storage.rivulet.feather.serializer import FeatherDataSerializer
+
+
+class DataSerializerFactory:
+    """
+    Simple factory class for getting the appropriate serializer given a schema
+    TODO make this more modular/pluggable like DatasetReaderRegistrar
+    This will be more challenging to make pluggable, because we should not rely on a simple 1:1 mapping of type to serializer
+    The actual logic for determining how to serialize a given schema may be complex
+    e.g.: if schema contains datatype X, you must use serializer Y. Otherwise, default to serializer Z
+    """
+
+    @classmethod
+    def get_serializer(
+        self,
+        schema: Schema,
+        file_provider: FileProvider,
+        user_provided_format: str | None = None,
+    ) -> DataSerializer:
+        if user_provided_format == "parquet":
+            return ParquetDataSerializer(file_provider, schema)
+        elif user_provided_format == "feather":
+            return FeatherDataSerializer(file_provider, schema)
+        elif user_provided_format is not None:
+            raise ValueError("Unsupported format. Must be 'parquet' or 'feather'.")
+
+        # Default engine logic. For now, if there is image or binary use feather
+        has_binary_or_image = any(
+            field.datatype.type_name.startswith(("binary", "image"))
+            for field in schema.values()
+        )
+        if has_binary_or_image:
+            return FeatherDataSerializer(file_provider, schema)
+        else:
+            return ParquetDataSerializer(file_provider, schema)
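The default-engine rule above can be previewed by inspecting a schema directly. A small sketch (illustrative only; constructing a FileProvider and calling get_serializer is omitted since that setup is not shown in this diff, and the field names are arbitrary):

from deltacat.storage.rivulet import Schema
from deltacat.storage.rivulet.schema.datatype import Datatype

schema = Schema(
    [("id", Datatype.int64()), ("thumbnail", Datatype.image("jpg"))],
    merge_keys=["id"],
)

# Mirrors the factory's default selection: any binary/image field means Feather,
# otherwise Parquet.
wants_feather = any(
    field.datatype.type_name.startswith(("binary", "image"))
    for field in schema.values()
)
assert wants_feather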
deltacat/storage/rivulet/writer/__init__.py

File without changes
deltacat/storage/rivulet/writer/dataset_writer.py

@@ -0,0 +1,29 @@
+from typing import Protocol, Iterable, Union, Any, Dict
+import pyarrow as pa
+
+DATA = Union[Iterable[Dict[str, Any]], Iterable[pa.RecordBatch], pa.RecordBatch]
+
+
+class DatasetWriter(Protocol):
+    """
+    Top level interface for writing records to rivulet dataset. This is used by dataset.py
+
+    This writes both data AND metadata (SSTs, manifests).
+
+    The general paradigm is that records are written iteratively through write or write_batch. At configurable intervals (based on record count or size), data and metadata gets flushed.
+
+    When the user either closes the dataset writer or calls commit(), this triggers all buffered data and metadata to be flushed.
+    """
+
+    def write(self, record: DATA) -> None:
+        ...
+
+    def flush(self) -> str:
+        """
+        Explicitly flush any data and metadata and commit to dataset
+
+        This is a blocking operation
+
+        :return: URI of manifest written for commit
+        """
+        ...
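A hedged sketch of how a caller might drive any DatasetWriter implementation (the helper and its name are illustrative and not part of the package; concrete writers such as MemtableDatasetWriter are constructed elsewhere, e.g. by dataset.py):

from typing import Any, Dict, Iterable

from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter


def write_and_commit(writer: DatasetWriter, records: Iterable[Dict[str, Any]]) -> str:
    # Illustrative helper: buffer the records through the protocol, then flush()
    # blocks until data, SSTs, and the manifest are written and returns the
    # manifest URI for the commit.
    writer.write(records)
    return writer.flush()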