deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/storage/model/schema.py
@@ -0,0 +1,892 @@
# Allow classes to use self-referencing Type hints in Python 3.7.
from __future__ import annotations

import logging
import copy

import msgpack
from typing import Optional, Any, Dict, Union, List, Callable, Tuple

import pyarrow as pa
from pyarrow import ArrowInvalid

from deltacat.constants import BYTES_PER_KIBIBYTE
from deltacat.storage.model.types import (
    SchemaConsistencyType,
    SortOrder,
    NullOrder,
)
from deltacat import logs

# PyArrow Field Metadata Key used to set the Field ID when writing to Parquet.
# See: https://arrow.apache.org/docs/cpp/parquet.html#parquet-field-id
PARQUET_FIELD_ID_KEY_NAME = b"PARQUET:field_id"

# PyArrow Field Metadata Key used to store field documentation.
FIELD_DOC_KEY_NAME = b"DELTACAT:doc"

# PyArrow Field Metadata Key used to identify the field as a merge key.
FIELD_MERGE_KEY_NAME = b"DELTACAT:merge_key"

# PyArrow Field Metadata Key used to identify the field as a merge order key.
FIELD_MERGE_ORDER_KEY_NAME = b"DELTACAT:merge_order"

# PyArrow Field Metadata Key used to identify the field as an event time.
FIELD_EVENT_TIME_KEY_NAME = b"DELTACAT:event_time"

# PyArrow Field Metadata Key used to store field past default values.
FIELD_PAST_DEFAULT_KEY_NAME = b"DELTACAT:past_default"

# PyArrow Field Metadata Key used to store field future default values.
FIELD_FUTURE_DEFAULT_KEY_NAME = b"DELTACAT:future_default"

# PyArrow Field Metadata Key used to store field schema consistency type.
FIELD_CONSISTENCY_TYPE_KEY_NAME = b"DELTACAT:consistency_type"

# PyArrow Schema Metadata Key used to store schema ID value.
SCHEMA_ID_KEY_NAME = b"DELTACAT:schema_id"

# PyArrow Schema Metadata Key used to store named subschemas.
SUBSCHEMAS_KEY_NAME = b"DELTACAT:subschemas"

# Set max field ID to INT32.MAX_VALUE - 200 for backwards-compatibility with
# Apache Iceberg, which sets aside this range for reserved fields.
MAX_FIELD_ID_EXCLUSIVE = 2147483447

# Default name assigned to the base, unnamed single schema when a new named
# subschema is first added.
BASE_SCHEMA_NAME = "_base"

SchemaId = int
SchemaName = str
FieldId = int
FieldName = str
NestedFieldName = List[str]
FieldLocator = Union[FieldName, NestedFieldName, FieldId]

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
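
# NOTE (illustrative, not part of the packaged file): the keys above are
# stored in each Arrow field's key/value metadata by the Field/Schema
# builders below. For example, a field created with field_id=1,
# is_merge_key=True, and doc="user id" would carry metadata roughly like:
#   {b"PARQUET:field_id": b"1",
#    b"DELTACAT:merge_key": b"True",
#    b"DELTACAT:doc": b"user id"}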


class MergeOrder(tuple):
    @staticmethod
    def of(
        sort_order: SortOrder = SortOrder.ASCENDING,
        null_order: NullOrder = NullOrder.AT_END,
    ) -> MergeOrder:
        return MergeOrder(
            (
                sort_order,
                null_order,
            )
        )

    @property
    def sort_order(self) -> Optional[SortOrder]:
        return SortOrder(self[0])

    @property
    def null_order(self) -> Optional[NullOrder]:
        return NullOrder(self[1])


class Field(dict):
    @staticmethod
    def of(
        field: pa.Field,
        field_id: Optional[FieldId] = None,
        is_merge_key: Optional[bool] = None,
        merge_order: Optional[MergeOrder] = None,
        is_event_time: Optional[bool] = None,
        doc: Optional[str] = None,
        past_default: Optional[Any] = None,
        future_default: Optional[Any] = None,
        consistency_type: Optional[SchemaConsistencyType] = None,
        path: Optional[NestedFieldName] = None,
        native_object: Optional[Any] = None,
    ) -> Field:
        """
        Creates a DeltaCAT field from a PyArrow base field. The DeltaCAT
        field contains a copy of the base field, but ensures that the
        PyArrow Field's metadata is also populated with optional metadata
        like documentation or metadata used within the context of a parent
        schema like field ids, merge keys, and default values.

        Args:
            field (pa.Field): Arrow base field.

            field_id (Optional[FieldId]): Unique ID of the field within its
            parent schema, or None if this field has no parent schema. If not
            given, then the field ID will be derived from the Arrow base
            field's "PARQUET:field_id" metadata key.

            is_merge_key (Optional[bool]): True if this Field is used as a merge
            key within its parent schema, False or None if it is not a merge
            key or has no parent schema. If not given, this will be derived from
            the Arrow base field's "DELTACAT:merge_key" metadata key. Merge keys
            are the default keys used to find matching records for equality
            deletes, upserts, and other equality-key-based merge operations.
            Must be a non-floating-point primitive type.

            merge_order (Optional[MergeOrder]): Merge order for this field
            within its parent schema. None if it is not used for merge order or
            has no parent schema. If not given, this will be derived from
            the Arrow base field's "DELTACAT:merge_order" metadata key. Merge
            order is used to determine the record kept amongst all records
            with matching merge keys for equality deletes, upserts, and other
            equality-key-based merge operations. Must be a primitive type.

            is_event_time (Optional[bool]): True if this Field is used to derive
            event time within its parent schema, False or None if it is not used
            or has no parent schema. If not given, this will be derived from
            the Arrow base field's "DELTACAT:event_time" metadata key. Event
            times are used to determine a stream's data completeness watermark.
            Must be an integer, float, or date type.

            doc (Optional[str]): Documentation for this field or None if this
            field has no documentation. If not given, then docs will be derived
            from the Arrow base field's "DELTACAT:doc" metadata key.

            past_default (Optional[Any]): Past default values for records
            written to the parent schema before this field was appended,
            or None if this field has no parent schema. If not given, this will
            be derived from the Arrow base field's "DELTACAT:past_default"
            metadata key. Must be coercible to the field's base arrow type.

            future_default (Optional[Any]): Future default values for records
            that omit this field in the parent schema they're written to, or
            None if this field has no parent schema. If not given, this will
            be derived from the Arrow base field's "DELTACAT:future_default"
            metadata key. Must be coercible to the field's base arrow type.

            consistency_type (Optional[SchemaConsistencyType]): Schema
            consistency type for records written to this field within the
            context of a parent schema, or None if the field has no parent
            schema. If not given, this will be derived from the Arrow base
            field's "DELTACAT:consistency_type" metadata key.

            path (Optional[NestedFieldName]): Fully qualified path of this
            field within its parent schema. Any manually specified path will
            be overwritten when this field is added to a schema.

            native_object (Optional[Any]): The native object, if any, that this
            field was originally derived from.

        Returns:
            A new DeltaCAT Field.
        """
        final_field = Field._build(
            field=field,
            field_id=Field._field_id(field) if field_id is None else field_id,
            is_merge_key=Field._is_merge_key(field)
            if is_merge_key is None
            else is_merge_key,
            merge_order=Field._merge_order(field)
            if merge_order is None
            else merge_order,
            is_event_time=Field._is_event_time(field)
            if is_event_time is None
            else is_event_time,
            doc=Field._doc(field) if doc is None else doc,
            past_default=Field._past_default(field)
            if past_default is None
            else past_default,
            future_default=Field._future_default(field)
            if future_default is None
            else future_default,
            consistency_type=Field._consistency_type(field)
            if consistency_type is None
            else consistency_type,
        )
        return Field(
            {
                "arrow": final_field,
                "path": copy.deepcopy(path),
                "nativeObject": native_object,
            }
        )

    @property
    def arrow(self) -> pa.Field:
        return self["arrow"]

    @property
    def id(self) -> Optional[FieldId]:
        return Field._field_id(self.arrow)

    @property
    def path(self) -> Optional[NestedFieldName]:
        return self.get("path")

    @property
    def is_merge_key(self) -> Optional[bool]:
        return Field._is_merge_key(self.arrow)

    @property
    def merge_order(self) -> Optional[MergeOrder]:
        return Field._merge_order(self.arrow)

    @property
    def doc(self) -> Optional[str]:
        return Field._doc(self.arrow)

    @property
    def past_default(self) -> Optional[Any]:
        return Field._past_default(self.arrow)

    @property
    def future_default(self) -> Optional[Any]:
        return Field._future_default(self.arrow)

    @property
    def consistency_type(self) -> Optional[SchemaConsistencyType]:
        return Field._consistency_type(self.arrow)

    @property
    def native_object(self) -> Optional[Any]:
        return self.get("nativeObject")

    @staticmethod
    def _field_id(field: pa.Field) -> Optional[FieldId]:
        field_id = None
        if field.metadata:
            bytes_val = field.metadata.get(PARQUET_FIELD_ID_KEY_NAME)
            field_id = int(bytes_val.decode()) if bytes_val else None
        return field_id

    @staticmethod
    def _doc(field: pa.Field) -> Optional[str]:
        doc = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_DOC_KEY_NAME)
            doc = bytes_val.decode() if bytes_val else None
        return doc

    @staticmethod
    def _is_merge_key(field: pa.Field) -> Optional[bool]:
        is_merge_key = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_MERGE_KEY_NAME)
            is_merge_key = bool(bytes_val.decode()) if bytes_val else None
        return is_merge_key

    @staticmethod
    def _merge_order(field: pa.Field) -> Optional[MergeOrder]:
        merge_order = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_MERGE_ORDER_KEY_NAME)
            merge_order = msgpack.loads(bytes_val) if bytes_val else None
        return merge_order

    @staticmethod
    def _is_event_time(field: pa.Field) -> Optional[bool]:
        is_event_time = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_EVENT_TIME_KEY_NAME)
            is_event_time = bool(bytes_val.decode()) if bytes_val else None
        return is_event_time

    @staticmethod
    def _past_default(field: pa.Field) -> Optional[Any]:
        default = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_PAST_DEFAULT_KEY_NAME)
            default = msgpack.loads(bytes_val) if bytes_val else None
        return default

    @staticmethod
    def _future_default(field: pa.Field) -> Optional[Any]:
        default = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_FUTURE_DEFAULT_KEY_NAME)
            default = msgpack.loads(bytes_val) if bytes_val else None
        return default

    @staticmethod
    def _consistency_type(field: pa.Field) -> Optional[SchemaConsistencyType]:
        t = None
        if field.metadata:
            bytes_val = field.metadata.get(FIELD_CONSISTENCY_TYPE_KEY_NAME)
            t = SchemaConsistencyType(bytes_val.decode()) if bytes_val else None
        return t

    @staticmethod
    def _validate_merge_key(field: pa.Field):
        if not (pa.types.is_string(field.type) or pa.types.is_primitive(field.type)):
            raise ValueError(f"Merge key {field} must be a primitive type.")
        if pa.types.is_floating(field.type):
            raise ValueError(f"Merge key {field} cannot be floating point.")

    @staticmethod
    def _validate_merge_order(field: pa.Field):
        if not pa.types.is_primitive(field.type):
            raise ValueError(f"Merge order {field} must be a primitive type.")

    @staticmethod
    def _validate_event_time(field: pa.Field):
        if (
            not pa.types.is_integer(field.type)
            and not pa.types.is_floating(field.type)
            and not pa.types.is_date(field.type)
        ):
            raise ValueError(f"Event time {field} must be numeric or date type.")

    @staticmethod
    def _validate_default(
        default: Optional[Any],
        field: pa.Field,
    ) -> pa.Scalar:
        try:
            return pa.scalar(default, field.type)
        except ArrowInvalid:
            raise ValueError(
                f"Cannot treat default value `{default}` as type "
                f"`{field.type}` for field: {field}"
            )

    @staticmethod
    def _build(
        field: pa.Field,
        field_id: Optional[int],
        is_merge_key: Optional[bool],
        merge_order: Optional[MergeOrder],
        is_event_time: Optional[bool],
        doc: Optional[str],
        past_default: Optional[Any],
        future_default: Optional[Any],
        consistency_type: Optional[SchemaConsistencyType],
    ) -> pa.Field:
        meta = {}
        if is_merge_key:
            Field._validate_merge_key(field)
            meta[FIELD_MERGE_KEY_NAME] = str(is_merge_key)
        if merge_order:
            Field._validate_merge_order(field)
            meta[FIELD_MERGE_ORDER_KEY_NAME] = msgpack.dumps(merge_order)
        if is_event_time:
            Field._validate_event_time(field)
            meta[FIELD_EVENT_TIME_KEY_NAME] = str(is_event_time)
        if past_default is not None:
            Field._validate_default(past_default, field)
            meta[FIELD_PAST_DEFAULT_KEY_NAME] = msgpack.dumps(past_default)
        if future_default is not None:
            Field._validate_default(future_default, field)
            meta[FIELD_FUTURE_DEFAULT_KEY_NAME] = msgpack.dumps(future_default)
        if field_id is not None:
            meta[PARQUET_FIELD_ID_KEY_NAME] = str(field_id)
        if doc is not None:
            meta[FIELD_DOC_KEY_NAME] = doc
        if consistency_type is not None:
            meta[FIELD_CONSISTENCY_TYPE_KEY_NAME] = consistency_type.value
        return pa.field(
            name=field.name,
            type=field.type,
            nullable=field.nullable,
            metadata=meta,
        )

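# Usage sketch (illustrative, not part of the packaged file): building a
# DeltaCAT Field from a PyArrow base field, per the `Field.of` docs above.
#
#   >>> import pyarrow as pa
#   >>> f = Field.of(pa.field("user_id", pa.int64()), field_id=1, is_merge_key=True)
#   >>> f.id, f.is_merge_key
#   (1, True)
#   >>> f.arrow.metadata[PARQUET_FIELD_ID_KEY_NAME]
#   b'1'
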
SingleSchema = Union[List[Field], pa.Schema]
MultiSchema = Union[Dict[SchemaName, List[Field]], Dict[SchemaName, pa.Schema]]


class Schema(dict):
    @staticmethod
    def of(
        schema: Union[SingleSchema, MultiSchema],
        schema_id: Optional[SchemaId] = None,
        native_object: Optional[Any] = None,
    ) -> Schema:
        """
        Creates a DeltaCAT schema from either one or multiple Arrow base schemas
        or lists of DeltaCAT fields. All field names across all input schemas
        must be unique (case-insensitive). If a dict of named subschemas is
        given, then this DeltaCAT schema will be backed by a unified arrow
        schema created as a union of all input schemas in the natural iteration
        order of their dictionary keys. This unified schema saves all named
        subschema field mappings in its metadata to support DeltaCAT subschema
        retrieval by name after schema creation.

        Args:
            schema (Union[SingleSchema, MultiSchema]): For a single unnamed
            schema, either an Arrow base schema or list of DeltaCAT fields.
            If an Arrow base schema is given, then a copy of the base schema
            is made with each Arrow field populated with additional metadata.
            Field IDs, merge keys, docs, and default vals will be read from
            each Arrow field's metadata if they exist. Any field missing a
            field ID will be assigned a unique field ID, with assigned field
            IDs either starting from 0 or the max field ID + 1.
            For multiple named subschemas, a dictionary of schema names to an
            arrow base schema or list of DeltaCAT fields. These schemas will
            be copied into a unified Arrow schema representing a union of all
            of their fields in their natural iteration order. Any missing
            field IDs will be autoassigned starting from 0 or the max field ID
            + 1 across the natural iteration order of all schemas first, and
            all fields second.
            All fields across all schemas must have unique names
            (case-insensitive).

            schema_id (SchemaId): Unique ID of schema within its parent table
            version. Defaults to 0.

            native_object (Optional[Any]): The native object, if any, that this
            schema was converted from.

        Returns:
            A new DeltaCAT Schema.
        """
        # normalize the input as a unified pyarrow schema
        # if the input included multiple subschemas, then also save a mapping
        # from each subschema to its unique field names
        schema, subschema_to_field_names = Schema._to_unified_pyarrow_schema(schema)
        # discover assigned field IDs in the given pyarrow schema
        field_ids_to_fields = {}
        schema_metadata = {}
        visitor_dict = {"maxFieldId": 0}
        # find and save the schema's max field ID in the visitor dictionary
        Schema._visit_fields(
            current=schema,
            visit=Schema._find_max_field_id,
            visitor_dict=visitor_dict,
        )
        max_field_id = visitor_dict["maxFieldId"]
        visitor_dict["fieldIdsToFields"] = field_ids_to_fields
        # populate map of field IDs to DeltaCAT fields w/ IDs, docs, etc.
        Schema._visit_fields(
            current=schema,
            visit=Schema._populate_fields,
            visitor_dict=visitor_dict,
        )
        if schema.metadata:
            schema_metadata.update(schema.metadata)
        # populate merge keys
        merge_keys = [
            field.id for field in field_ids_to_fields.values() if field.is_merge_key
        ]
        # create a new pyarrow schema with field ID, doc, etc. field metadata
        pyarrow_schema = pa.schema(
            fields=[field.arrow for field in field_ids_to_fields.values()],
        )
        # map subschema field names to IDs (for faster lookup and reduced size)
        subschema_to_field_ids = {
            schema_name: [
                Field.of(pyarrow_schema.field(field_name)).id
                for field_name in field_names
            ]
            for schema_name, field_names in subschema_to_field_names.items()
        }
        # create a final pyarrow schema with populated schema metadata
        if schema_id is not None:
            schema_metadata[SCHEMA_ID_KEY_NAME] = str(schema_id)
        if schema_metadata.get(SCHEMA_ID_KEY_NAME) is None:
            schema_metadata[SCHEMA_ID_KEY_NAME] = str(0)
        schema_metadata[SUBSCHEMAS_KEY_NAME] = msgpack.dumps(subschema_to_field_ids)
        final_schema = pyarrow_schema.with_metadata(schema_metadata)
        return Schema(
            {
                "arrow": final_schema,
                "mergeKeys": merge_keys or None,
                "fieldIdsToFields": field_ids_to_fields,
                "maxFieldId": max_field_id,
                "nativeObject": native_object,
            }
        )

    @staticmethod
    def deserialize(serialized: pa.Buffer) -> Schema:
        return Schema.of(schema=pa.ipc.read_schema(serialized))

    def serialize(self) -> pa.Buffer:
        return self.arrow.serialize()

    def equivalent_to(self, other: Schema, check_metadata: bool = False) -> bool:
        if other is None:
            return False
        if not isinstance(other, dict):
            return False
        if not isinstance(other, Schema):
            other = Schema(other)
        return self.arrow.equals(
            other.arrow,
            check_metadata,
        )

    def add_subschema(
        self,
        name: SchemaName,
        schema: SingleSchema,
    ) -> Schema:
        subschemas = copy.copy(self.subschemas)
        if not subschemas:  # self is SingleSchema
            subschemas = {BASE_SCHEMA_NAME: self}
        subschemas = Schema._add_subschema(name, schema, subschemas)
        return Schema.of(
            schema=subschemas,
            schema_id=self.id + 1,
        )

    def delete_subschema(self, name: SchemaName) -> Schema:
        subschemas = copy.copy(self.subschemas)
        subschemas = self._del_subschema(name, subschemas)
        if not subschemas:
            raise ValueError(f"Deleting `{name}` would leave the schema empty.")
        subschemas = {name: val.arrow for name, val in subschemas.items()}
        return Schema.of(
            schema=subschemas,
            schema_id=self.id + 1,
        )

    def replace_subschema(
        self,
        name: SchemaName,
        schema: SingleSchema,
    ) -> Schema:
        subschemas = copy.copy(self.subschemas)
        subschemas = Schema._del_subschema(name, subschemas)
        subschemas = Schema._add_subschema(name, schema, subschemas)
        return Schema.of(
            schema=subschemas,
            schema_id=self.id + 1,
        )

    def field_id(self, name: Union[FieldName, NestedFieldName]) -> FieldId:
        return Schema._field_name_to_field_id(self.arrow, name)

    def field_name(self, field_id: FieldId) -> Union[FieldName, NestedFieldName]:
        field = self.field_ids_to_fields[field_id]
        if len(field.path) == 1:
            return field.arrow.name
        return field.path

    def field(self, field_locator: FieldLocator) -> Field:
        field_id = (
            field_locator
            if isinstance(field_locator, FieldId)
            else self.field_id(field_locator)
        )
        return self.field_ids_to_fields[field_id]

    @property
    def fields(self) -> List[Field]:
        field_ids_to_fields = self.field_ids_to_fields
        return list(field_ids_to_fields.values())

    @property
    def merge_keys(self) -> Optional[List[FieldId]]:
        return self.get("mergeKeys")

    @property
    def field_ids_to_fields(self) -> Dict[FieldId, Field]:
        return self.get("fieldIdsToFields")

    @property
    def arrow(self) -> pa.Schema:
        return self["arrow"]

    @property
    def max_field_id(self) -> FieldId:
        return self["maxFieldId"]

    @property
    def id(self) -> SchemaId:
        return Schema._schema_id(self.arrow)

    def subschema(self, name: SchemaName) -> Optional[Schema]:
        subschemas = self.subschemas
        return subschemas.get(name) if subschemas else None

    @property
    def subschemas(self) -> Dict[SchemaName, Schema]:
        # return cached subschemas first if they exist
        subschemas = self.get("subschemas")
        if not subschemas:
            # retrieve any defined subschemas
            subschemas_to_field_ids = self.subschemas_to_field_ids
            # rebuild and return the subschema cache
            if subschemas_to_field_ids:
                subschemas = {
                    schema_name: Schema.of(
                        schema=pa.schema(
                            [self.field(field_id).arrow for field_id in field_ids]
                        ),
                        schema_id=self.id,
                        native_object=self.native_object,
                    )
                    for schema_name, field_ids in subschemas_to_field_ids.items()
                }
                self["subschemas"] = subschemas
        return subschemas or {}

    def subschema_field_ids(self, name: SchemaName) -> Optional[List[FieldId]]:
        return self.subschemas_to_field_ids.get(name)

    @property
    def subschemas_to_field_ids(self) -> Dict[SchemaName, List[FieldId]]:
        return Schema._subschemas(self.arrow)

    @property
    def native_object(self) -> Optional[Any]:
        return self.get("nativeObject")

    @staticmethod
    def _schema_id(schema: pa.Schema) -> SchemaId:
        schema_id = None
        if schema.metadata:
            bytes_val = schema.metadata.get(SCHEMA_ID_KEY_NAME)
            schema_id = int(bytes_val.decode()) if bytes_val else None
        return schema_id

    @staticmethod
    def _subschemas(
        schema: pa.Schema,
    ) -> Dict[SchemaName, List[FieldId]]:
        subschemas = None
        if schema.metadata:
            bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
            subschemas = msgpack.loads(bytes_val) if bytes_val else None
        return subschemas

    @staticmethod
    def _field_name_to_field_id(
        schema: pa.Schema,
        name: Union[FieldName, NestedFieldName],
    ) -> FieldId:
        if isinstance(name, str):
            return Field.of(schema.field(name)).id
        if isinstance(name, List):
            if not len(name):
                raise ValueError(f"Nested field name `{name}` is empty.")
            field = schema
            for part in name:
                field = field[part]
            return Field.of(field).id
        raise ValueError(f"Unknown field name type: {type(name)}")

    @staticmethod
    def _visit_fields(
        current: Union[pa.Schema, pa.Field],
        visit: Callable,
        path: NestedFieldName = [],
        *args,
        **kwargs,
    ) -> None:
        """
        Recursively visit all fields in a PyArrow schema, including nested
        fields.

        Args:
            current (pa.Schema or pa.Field): The schema or field to visit.
            visit (callable): A function that visits the current field.
            path (NestedFieldName): The current path to the field.
            *args: Additional args to pass to the visit function.
            **kwargs: Additional keyword args to pass to the visit function.

        Returns:
            None
        """
        if isinstance(current, pa.Schema):
            for field in current:
                Schema._visit_fields(
                    field,
                    visit,
                    path,
                    *args,
                    **kwargs,
                )
        elif isinstance(current, pa.Field):
            path.append(current.name)
            visit(current, path, *args, **kwargs)
            if pa.types.is_nested(current.type):
                if isinstance(current.type, pa.StructType):
                    for field in current.type:
                        Schema._visit_fields(
                            field,
                            visit,
                            path,
                            *args,
                            **kwargs,
                        )
                elif isinstance(current.type, pa.ListType):
                    Schema._visit_fields(
                        current.type.value_field,
                        visit,
                        path,
                        *args,
                        **kwargs,
                    )
                elif isinstance(current.type, pa.MapType):
                    Schema._visit_fields(
                        current.type.key_field,
                        visit,
                        path,
                        *args,
                        **kwargs,
                    )
                    Schema._visit_fields(
                        current.type.item_field,
                        visit,
                        path,
                        *args,
                        **kwargs,
                    )
            path.pop()
        else:
            raise ValueError(f"Unexpected Schema Field Type: {type(current)}")

    @staticmethod
    def _find_max_field_id(
        field: pa.Field,
        path: NestedFieldName,
        visitor_dict: Dict[str, Any],
    ) -> None:
        max_field_id = max(
            visitor_dict.get("maxFieldId", 0),
            Field.of(field).id or 0,
        )
        visitor_dict["maxFieldId"] = max_field_id

    @staticmethod
    def _populate_fields(
        field: pa.Field,
        path: NestedFieldName,
        visitor_dict: Dict[str, Any],
    ) -> None:
        field_ids_to_fields = visitor_dict["fieldIdsToFields"]
        max_field_id = (
            visitor_dict["maxFieldId"] + len(field_ids_to_fields)
        ) % MAX_FIELD_ID_EXCLUSIVE
        dc_field = Field.of(field)
        if dc_field is not None and dc_field.id is not None:
            field_id = dc_field.id
        else:
            field_id = max_field_id

        if (dupe := field_ids_to_fields.get(field_id)) is not None:
            raise ValueError(
                f"Duplicate field id {field_id} for field: {field} "
                f"Already assigned to field: {dupe}"
            )
        field = Field.of(
            field=field,
            field_id=field_id,
            path=path,
        )
        field_ids_to_fields[field_id] = field

    @staticmethod
    def _get_lower_case_field_names(
        schema: SingleSchema,
    ) -> List[str]:
        if isinstance(schema, pa.Schema):
            return [name.lower() for name in schema.names]
        elif isinstance(schema, List):  # List[Field]
            names = [f.arrow.name.lower() for f in schema if isinstance(f, Field)]
            if len(names) == len(schema):
                return names  # all items in list are valid Field objects
        raise ValueError(f"Unsupported schema argument: {schema}")

    @staticmethod
    def _validate_schema_name(name: str) -> None:
        if not name:
            raise ValueError("Schema name cannot be empty.")
        if len(name) > BYTES_PER_KIBIBYTE:
            raise ValueError(
                f"Invalid schema name `{name}`. Schema names "
                f"cannot be greater than {BYTES_PER_KIBIBYTE} "
                f"characters."
            )

    @staticmethod
    def _validate_field_names(
        schema: Union[SingleSchema, MultiSchema],
    ) -> None:
        all_names = []
        if isinstance(schema, dict):  # MultiSchema
            for schema_name, val in schema.items():
                Schema._validate_schema_name(schema_name)
                all_names.extend(Schema._get_lower_case_field_names(val))
        else:  # SingleSchema
            all_names.extend(Schema._get_lower_case_field_names(schema))
        if not all_names:
            raise ValueError("Schema must contain at least one field.")
        name_set = set()
        dupes = []
        for name in all_names:
            dupes.append(name) if name in name_set else name_set.add(name)
        if dupes:
            raise ValueError(
                f"Expected all schema fields to have unique names "
                f"(case-insensitive), but found the following duplicates: "
                f"{dupes}"
            )

    @staticmethod
    def _to_pyarrow_schema(schema: SingleSchema) -> pa.Schema:
        if isinstance(schema, pa.Schema):
            return schema
        elif isinstance(schema, List):  # List[Field]
            return pa.schema(fields=[field.arrow for field in schema])
        else:
            raise ValueError(f"Unsupported schema base type: {schema}")

    @staticmethod
    def _to_unified_pyarrow_schema(
        schema: Union[SingleSchema, MultiSchema],
    ) -> Tuple[pa.Schema, Dict[SchemaName, List[FieldName]]]:
        # first, ensure all field names are valid and contain no duplicates
        Schema._validate_field_names(schema)
        # now union all schemas into a single schema
        subschema_to_field_names = {}
        if isinstance(schema, dict):  # MultiSchema
            all_schemas = []
            for schema_name, schema_val in schema.items():
                pyarrow_schema = Schema._to_pyarrow_schema(schema_val)
                all_schemas.append(pyarrow_schema)
                subschema_to_field_names[schema_name] = [
                    field.name for field in pyarrow_schema
                ]
            return pa.unify_schemas(all_schemas), subschema_to_field_names
        return Schema._to_pyarrow_schema(schema), {}  # SingleSchema

    @staticmethod
    def _del_subschema(
        name: SchemaName,
        subschemas: Dict[SchemaName, Schema],
    ) -> Dict[SchemaName, Schema]:
        deleted_subschema = subschemas.pop(name, None)
        if deleted_subschema is None:
            raise ValueError(f"Subschema `{name}` does not exist.")
        return subschemas

    @staticmethod
    def _add_subschema(
        name: SchemaName,
        schema: SingleSchema,
        subschemas: Dict[SchemaName, Schema],
    ) -> Dict[SchemaName, Schema]:
        Schema._validate_schema_name(name)
        if name == BASE_SCHEMA_NAME:
            raise ValueError(
                f"Cannot add subschema with reserved name: {BASE_SCHEMA_NAME}"
            )
        if name in subschemas:
            raise ValueError(f"Subschema `{name}` already exists.")
        for key, val in subschemas.items():
            subschemas[key] = val.arrow
        subschemas[name] = schema
        return subschemas


class SchemaList(List[Schema]):
    @staticmethod
    def of(items: List[Schema]) -> SchemaList:
        typed_items = SchemaList()
        for item in items:
            if item is not None and not isinstance(item, Schema):
                item = Schema(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        val = super().__getitem__(item)
        if val is not None and not isinstance(val, Schema):
            self[item] = val = Schema(val)
        return val