deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/storage/model/schema.py (new file, +892 per the list above)
@@ -0,0 +1,892 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
+
+ import logging
+ import copy
+
+ import msgpack
+ from typing import Optional, Any, Dict, Union, List, Callable, Tuple
+
+ import pyarrow as pa
+ from pyarrow import ArrowInvalid
+
+ from deltacat.constants import BYTES_PER_KIBIBYTE
+ from deltacat.storage.model.types import (
+     SchemaConsistencyType,
+     SortOrder,
+     NullOrder,
+ )
+ from deltacat import logs
+
+ # PyArrow Field Metadata Key used to set the Field ID when writing to Parquet.
+ # See: https://arrow.apache.org/docs/cpp/parquet.html#parquet-field-id
+ PARQUET_FIELD_ID_KEY_NAME = b"PARQUET:field_id"
+
+ # PyArrow Field Metadata Key used to store field documentation.
+ FIELD_DOC_KEY_NAME = b"DELTACAT:doc"
+
+ # PyArrow Field Metadata Key used to identify the field as a merge key.
+ FIELD_MERGE_KEY_NAME = b"DELTACAT:merge_key"
+
+ # PyArrow Field Metadata Key used to identify the field as a merge order key.
+ FIELD_MERGE_ORDER_KEY_NAME = b"DELTACAT:merge_order"
+
+ # PyArrow Field Metadata Key used to identify the field as an event time.
+ FIELD_EVENT_TIME_KEY_NAME = b"DELTACAT:event_time"
+
+ # PyArrow Field Metadata Key used to store field past default values.
+ FIELD_PAST_DEFAULT_KEY_NAME = b"DELTACAT:past_default"
+
+ # PyArrow Field Metadata Key used to store field future default values.
+ FIELD_FUTURE_DEFAULT_KEY_NAME = b"DELTACAT:future_default"
+
+ # PyArrow Field Metadata Key used to store field schema consistency type.
+ FIELD_CONSISTENCY_TYPE_KEY_NAME = b"DELTACAT:consistency_type"
+
+ # PyArrow Schema Metadata Key used to store the schema ID value.
+ SCHEMA_ID_KEY_NAME = b"DELTACAT:schema_id"
+
+ # PyArrow Schema Metadata Key used to store named subschemas.
+ SUBSCHEMAS_KEY_NAME = b"DELTACAT:subschemas"
+
+ # Set max field ID to INT32.MAX_VALUE - 200 for backwards-compatibility with
+ # Apache Iceberg, which sets aside this range for reserved fields.
+ MAX_FIELD_ID_EXCLUSIVE = 2147483447
+
+ # Default name assigned to the base, unnamed single schema when a new named
+ # subschema is first added.
+ BASE_SCHEMA_NAME = "_base"
+
+ SchemaId = int
+ SchemaName = str
+ FieldId = int
+ FieldName = str
+ NestedFieldName = List[str]
+ FieldLocator = Union[FieldName, NestedFieldName, FieldId]
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class MergeOrder(tuple):
+     @staticmethod
+     def of(
+         sort_order: SortOrder = SortOrder.ASCENDING,
+         null_order: NullOrder = NullOrder.AT_END,
+     ) -> MergeOrder:
+         return MergeOrder(
+             (
+                 sort_order,
+                 null_order,
+             )
+         )
+
+     @property
+     def sort_order(self) -> Optional[SortOrder]:
+         return SortOrder(self[0])
+
+     @property
+     def null_order(self) -> Optional[NullOrder]:
+         return NullOrder(self[1])
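+
+     # Illustrative usage sketch (hypothetical, not part of the original
+     # source): MergeOrder is a plain (sort_order, null_order) tuple, so the
+     # accessors simply re-wrap the stored enum values:
+     #   order = MergeOrder.of()
+     #   assert order.sort_order is SortOrder.ASCENDING
+     #   assert order.null_order is NullOrder.AT_END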
+
+
+ class Field(dict):
+     @staticmethod
+     def of(
+         field: pa.Field,
+         field_id: Optional[FieldId] = None,
+         is_merge_key: Optional[bool] = None,
+         merge_order: Optional[MergeOrder] = None,
+         is_event_time: Optional[bool] = None,
+         doc: Optional[str] = None,
+         past_default: Optional[Any] = None,
+         future_default: Optional[Any] = None,
+         consistency_type: Optional[SchemaConsistencyType] = None,
+         path: Optional[NestedFieldName] = None,
+         native_object: Optional[Any] = None,
+     ) -> Field:
+         """
+         Creates a DeltaCAT field from a PyArrow base field. The DeltaCAT
+         field contains a copy of the base field, but ensures that the
+         PyArrow Field's metadata is also populated with optional metadata
+         like documentation, as well as metadata used within the context of
+         a parent schema, like field IDs, merge keys, and default values.
+
+         Args:
+             field (pa.Field): Arrow base field.
+
+             field_id (Optional[FieldId]): Unique ID of the field within its
+             parent schema, or None if this field has no parent schema. If not
+             given, then the field ID will be derived from the Arrow base
+             field's "PARQUET:field_id" metadata key.
+
+             is_merge_key (Optional[bool]): True if this Field is used as a
+             merge key within its parent schema, False or None if it is not a
+             merge key or has no parent schema. If not given, this will be
+             derived from the Arrow base field's "DELTACAT:merge_key" metadata
+             key. Merge keys are the default keys used to find matching
+             records for equality deletes, upserts, and other
+             equality-key-based merge operations. Must be a non-floating-point
+             primitive type.
+
+             merge_order (Optional[MergeOrder]): Merge order for this field
+             within its parent schema, or None if it is not used for merge
+             order or has no parent schema. If not given, this will be derived
+             from the Arrow base field's "DELTACAT:merge_order" metadata key.
+             Merge order is used to determine which record is kept amongst all
+             records with matching merge keys for equality deletes, upserts,
+             and other equality-key-based merge operations. Must be a
+             primitive type.
+
+             is_event_time (Optional[bool]): True if this Field is used to
+             derive event time within its parent schema, False or None if it
+             is not used or has no parent schema. If not given, this will be
+             derived from the Arrow base field's "DELTACAT:event_time"
+             metadata key. Event times are used to determine a stream's data
+             completeness watermark. Must be an integer, float, or date type.
+
+             doc (Optional[str]): Documentation for this field, or None if
+             this field has no documentation. If not given, then docs will be
+             derived from the Arrow base field's "DELTACAT:doc" metadata key.
+
+             past_default (Optional[Any]): Past default values for records
+             written to the parent schema before this field was appended, or
+             None if this field has no parent schema. If not given, this will
+             be derived from the Arrow base field's "DELTACAT:past_default"
+             metadata key. Must be coercible to the field's base Arrow type.
+
+             future_default (Optional[Any]): Future default values for records
+             that omit this field in the parent schema they're written to, or
+             None if this field has no parent schema. If not given, this will
+             be derived from the Arrow base field's "DELTACAT:future_default"
+             metadata key. Must be coercible to the field's base Arrow type.
+
+             consistency_type (Optional[SchemaConsistencyType]): Schema
+             consistency type for records written to this field within the
+             context of a parent schema, or None if the field has no parent
+             schema. If not given, this will be derived from the Arrow base
+             field's "DELTACAT:consistency_type" metadata key.
+
+             path (Optional[NestedFieldName]): Fully qualified path of this
+             field within its parent schema. Any manually specified path will
+             be overwritten when this field is added to a schema.
+
+             native_object (Optional[Any]): The native object, if any, that
+             this field was originally derived from.
+         Returns:
+             A new DeltaCAT Field.
+         """
+         final_field = Field._build(
+             field=field,
+             field_id=Field._field_id(field) if field_id is None else field_id,
+             is_merge_key=Field._is_merge_key(field)
+             if is_merge_key is None
+             else is_merge_key,
+             merge_order=Field._merge_order(field)
+             if merge_order is None
+             else merge_order,
+             is_event_time=Field._is_event_time(field)
+             if is_event_time is None
+             else is_event_time,
+             doc=Field._doc(field) if doc is None else doc,
+             past_default=Field._past_default(field)
+             if past_default is None
+             else past_default,
+             future_default=Field._future_default(field)
+             if future_default is None
+             else future_default,
+             consistency_type=Field._consistency_type(field)
+             if consistency_type is None
+             else consistency_type,
+         )
+         return Field(
+             {
+                 "arrow": final_field,
+                 "path": copy.deepcopy(path),
+                 "nativeObject": native_object,
+             }
+         )
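+
+     # Illustrative usage sketch (hypothetical, not part of the original
+     # source): Field.of() copies the base Arrow field and writes the extra
+     # attributes into its metadata, where the accessors read them back:
+     #   f = Field.of(pa.field("id", pa.int64()), field_id=1, is_merge_key=True)
+     #   assert f.id == 1 and f.is_merge_key
+     #   assert f.arrow.metadata[PARQUET_FIELD_ID_KEY_NAME] == b"1"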
+
+     @property
+     def arrow(self) -> pa.Field:
+         return self["arrow"]
+
+     @property
+     def id(self) -> Optional[FieldId]:
+         return Field._field_id(self.arrow)
+
+     @property
+     def path(self) -> Optional[NestedFieldName]:
+         return self.get("path")
+
+     @property
+     def is_merge_key(self) -> Optional[bool]:
+         return Field._is_merge_key(self.arrow)
+
+     @property
+     def merge_order(self) -> Optional[MergeOrder]:
+         return Field._merge_order(self.arrow)
+
+     @property
+     def doc(self) -> Optional[str]:
+         return Field._doc(self.arrow)
+
+     @property
+     def past_default(self) -> Optional[Any]:
+         return Field._past_default(self.arrow)
+
+     @property
+     def future_default(self) -> Optional[Any]:
+         return Field._future_default(self.arrow)
+
+     @property
+     def consistency_type(self) -> Optional[SchemaConsistencyType]:
+         return Field._consistency_type(self.arrow)
+
+     @property
+     def native_object(self) -> Optional[Any]:
+         return self.get("nativeObject")
+
+     @staticmethod
+     def _field_id(field: pa.Field) -> Optional[FieldId]:
+         field_id = None
+         if field.metadata:
+             bytes_val = field.metadata.get(PARQUET_FIELD_ID_KEY_NAME)
+             field_id = int(bytes_val.decode()) if bytes_val else None
+         return field_id
+
+     @staticmethod
+     def _doc(field: pa.Field) -> Optional[str]:
+         doc = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_DOC_KEY_NAME)
+             doc = bytes_val.decode() if bytes_val else None
+         return doc
+
+     @staticmethod
+     def _is_merge_key(field: pa.Field) -> Optional[bool]:
+         is_merge_key = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_MERGE_KEY_NAME)
+             is_merge_key = bool(bytes_val.decode()) if bytes_val else None
+         return is_merge_key
+
+     @staticmethod
+     def _merge_order(field: pa.Field) -> Optional[MergeOrder]:
+         merge_order = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_MERGE_ORDER_KEY_NAME)
+             merge_order = msgpack.loads(bytes_val) if bytes_val else None
+         return merge_order
+
+     @staticmethod
+     def _is_event_time(field: pa.Field) -> Optional[bool]:
+         is_event_time = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_EVENT_TIME_KEY_NAME)
+             is_event_time = bool(bytes_val.decode()) if bytes_val else None
+         return is_event_time
+
+     @staticmethod
+     def _past_default(field: pa.Field) -> Optional[Any]:
+         default = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_PAST_DEFAULT_KEY_NAME)
+             default = msgpack.loads(bytes_val) if bytes_val else None
+         return default
+
+     @staticmethod
+     def _future_default(field: pa.Field) -> Optional[Any]:
+         default = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_FUTURE_DEFAULT_KEY_NAME)
+             default = msgpack.loads(bytes_val) if bytes_val else None
+         return default
+
+     @staticmethod
+     def _consistency_type(field: pa.Field) -> Optional[SchemaConsistencyType]:
+         t = None
+         if field.metadata:
+             bytes_val = field.metadata.get(FIELD_CONSISTENCY_TYPE_KEY_NAME)
+             t = SchemaConsistencyType(bytes_val.decode()) if bytes_val else None
+         return t
+
+     @staticmethod
+     def _validate_merge_key(field: pa.Field):
+         if not (pa.types.is_string(field.type) or pa.types.is_primitive(field.type)):
+             raise ValueError(f"Merge key {field} must be a primitive type.")
+         if pa.types.is_floating(field.type):
+             raise ValueError(f"Merge key {field} cannot be floating point.")
+
+     @staticmethod
+     def _validate_merge_order(field: pa.Field):
+         if not pa.types.is_primitive(field.type):
+             raise ValueError(f"Merge order {field} must be a primitive type.")
+
+     @staticmethod
+     def _validate_event_time(field: pa.Field):
+         if (
+             not pa.types.is_integer(field.type)
+             and not pa.types.is_floating(field.type)
+             and not pa.types.is_date(field.type)
+         ):
+             raise ValueError(f"Event time {field} must be a numeric or date type.")
+
+     @staticmethod
+     def _validate_default(
+         default: Optional[Any],
+         field: pa.Field,
+     ) -> pa.Scalar:
+         try:
+             return pa.scalar(default, field.type)
+         except ArrowInvalid:
+             raise ValueError(
+                 f"Cannot treat default value `{default}` as type "
+                 f"`{field.type}` for field: {field}"
+             )
+
+     @staticmethod
+     def _build(
+         field: pa.Field,
+         field_id: Optional[int],
+         is_merge_key: Optional[bool],
+         merge_order: Optional[MergeOrder],
+         is_event_time: Optional[bool],
+         doc: Optional[str],
+         past_default: Optional[Any],
+         future_default: Optional[Any],
+         consistency_type: Optional[SchemaConsistencyType],
+     ) -> pa.Field:
+         meta = {}
+         if is_merge_key:
+             Field._validate_merge_key(field)
+             meta[FIELD_MERGE_KEY_NAME] = str(is_merge_key)
+         if merge_order:
+             Field._validate_merge_order(field)
+             meta[FIELD_MERGE_ORDER_KEY_NAME] = msgpack.dumps(merge_order)
+         if is_event_time:
+             Field._validate_event_time(field)
+             meta[FIELD_EVENT_TIME_KEY_NAME] = str(is_event_time)
+         if past_default is not None:
+             Field._validate_default(past_default, field)
+             meta[FIELD_PAST_DEFAULT_KEY_NAME] = msgpack.dumps(past_default)
+         if future_default is not None:
+             Field._validate_default(future_default, field)
+             meta[FIELD_FUTURE_DEFAULT_KEY_NAME] = msgpack.dumps(future_default)
+         if field_id is not None:
+             meta[PARQUET_FIELD_ID_KEY_NAME] = str(field_id)
+         if doc is not None:
+             meta[FIELD_DOC_KEY_NAME] = doc
+         if consistency_type is not None:
+             meta[FIELD_CONSISTENCY_TYPE_KEY_NAME] = consistency_type.value
+         return pa.field(
+             name=field.name,
+             type=field.type,
+             nullable=field.nullable,
+             metadata=meta,
+         )
+
+
+ SingleSchema = Union[List[Field], pa.Schema]
+ MultiSchema = Union[Dict[SchemaName, List[Field]], Dict[SchemaName, pa.Schema]]
+
+
+ class Schema(dict):
+     @staticmethod
+     def of(
+         schema: Union[SingleSchema, MultiSchema],
+         schema_id: Optional[SchemaId] = None,
+         native_object: Optional[Any] = None,
+     ) -> Schema:
+         """
+         Creates a DeltaCAT schema from either one or multiple Arrow base
+         schemas or lists of DeltaCAT fields. All field names across all input
+         schemas must be unique (case-insensitive). If a dict of named
+         subschemas is given, then this DeltaCAT schema will be backed by a
+         unified Arrow schema created as a union of all input schemas in the
+         natural iteration order of their dictionary keys. This unified schema
+         saves all named subschema field mappings in its metadata to support
+         DeltaCAT subschema retrieval by name after schema creation.
+
+         Args:
+             schema (Union[SingleSchema, MultiSchema]): For a single unnamed
+             schema, either an Arrow base schema or a list of DeltaCAT fields.
+             If an Arrow base schema is given, then a copy of the base schema
+             is made with each Arrow field populated with additional metadata.
+             Field IDs, merge keys, docs, and default values will be read from
+             each Arrow field's metadata if they exist. Any field missing a
+             field ID will be assigned a unique field ID, with assigned field
+             IDs starting from either 0 or the max field ID + 1.
+             For multiple named subschemas, a dictionary of schema names to an
+             Arrow base schema or list of DeltaCAT fields. These schemas will
+             be copied into a unified Arrow schema representing a union of all
+             of their fields in their natural iteration order. Any missing
+             field IDs will be auto-assigned starting from either 0 or the max
+             field ID + 1 across the natural iteration order of all schemas
+             first, and all fields second.
+             All fields across all schemas must have unique names
+             (case-insensitive).
+
+             schema_id (Optional[SchemaId]): Unique ID of the schema within
+             its parent table version. Defaults to 0.
+
+             native_object (Optional[Any]): The native object, if any, that
+             this schema was converted from.
+         Returns:
+             A new DeltaCAT Schema.
+         """
+         # normalize the input as a unified pyarrow schema
+         # if the input included multiple subschemas, then also save a mapping
+         # from each subschema to its unique field names
+         schema, subschema_to_field_names = Schema._to_unified_pyarrow_schema(schema)
+         # discover assigned field IDs in the given pyarrow schema
+         field_ids_to_fields = {}
+         schema_metadata = {}
+         visitor_dict = {"maxFieldId": 0}
+         # find and save the schema's max field ID in the visitor dictionary
+         Schema._visit_fields(
+             current=schema,
+             visit=Schema._find_max_field_id,
+             visitor_dict=visitor_dict,
+         )
+         max_field_id = visitor_dict["maxFieldId"]
+         visitor_dict["fieldIdsToFields"] = field_ids_to_fields
+         # populate map of field IDs to DeltaCAT fields w/ IDs, docs, etc.
+         Schema._visit_fields(
+             current=schema,
+             visit=Schema._populate_fields,
+             visitor_dict=visitor_dict,
+         )
+         if schema.metadata:
+             schema_metadata.update(schema.metadata)
+         # populate merge keys
+         merge_keys = [
+             field.id for field in field_ids_to_fields.values() if field.is_merge_key
+         ]
+         # create a new pyarrow schema with field ID, doc, etc. field metadata
+         pyarrow_schema = pa.schema(
+             fields=[field.arrow for field in field_ids_to_fields.values()],
+         )
+         # map subschema field names to IDs (for faster lookup and reduced size)
+         subschema_to_field_ids = {
+             schema_name: [
+                 Field.of(pyarrow_schema.field(field_name)).id
+                 for field_name in field_names
+             ]
+             for schema_name, field_names in subschema_to_field_names.items()
+         }
+         # create a final pyarrow schema with populated schema metadata
+         if schema_id is not None:
+             schema_metadata[SCHEMA_ID_KEY_NAME] = str(schema_id)
+         if schema_metadata.get(SCHEMA_ID_KEY_NAME) is None:
+             schema_metadata[SCHEMA_ID_KEY_NAME] = str(0)
+         schema_metadata[SUBSCHEMAS_KEY_NAME] = msgpack.dumps(subschema_to_field_ids)
+         final_schema = pyarrow_schema.with_metadata(schema_metadata)
+         return Schema(
+             {
+                 "arrow": final_schema,
+                 "mergeKeys": merge_keys or None,
+                 "fieldIdsToFields": field_ids_to_fields,
+                 "maxFieldId": max_field_id,
+                 "nativeObject": native_object,
+             }
+         )
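+
+     # Illustrative usage sketch (hypothetical, not part of the original
+     # source): building a schema from two named subschemas yields one
+     # unified Arrow schema whose metadata records the name -> field ID map:
+     #   schema = Schema.of(
+     #       {
+     #           "keys": pa.schema([pa.field("id", pa.int64())]),
+     #           "vals": pa.schema([pa.field("payload", pa.string())]),
+     #       }
+     #   )
+     #   assert set(schema.subschemas.keys()) == {"keys", "vals"}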
+
+     @staticmethod
+     def deserialize(serialized: pa.Buffer) -> Schema:
+         return Schema.of(schema=pa.ipc.read_schema(serialized))
+
+     def serialize(self) -> pa.Buffer:
+         return self.arrow.serialize()
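+
+     # Illustrative round-trip sketch (hypothetical, not part of the original
+     # source): serialization goes through the Arrow IPC schema format, so
+     # field and schema metadata (field IDs, merge keys, subschemas) should
+     # survive the round trip:
+     #   buf = schema.serialize()
+     #   assert Schema.deserialize(buf).equivalent_to(schema)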
+
+     def equivalent_to(self, other: Schema, check_metadata: bool = False) -> bool:
+         if other is None:
+             return False
+         if not isinstance(other, dict):
+             return False
+         if not isinstance(other, Schema):
+             other = Schema(other)
+         return self.arrow.equals(
+             other.arrow,
+             check_metadata,
+         )
+
+     def add_subschema(
+         self,
+         name: SchemaName,
+         schema: SingleSchema,
+     ) -> Schema:
+         subschemas = copy.copy(self.subschemas)
+         if not subschemas:  # self is a single, unnamed schema
+             subschemas = {BASE_SCHEMA_NAME: self}
+         subschemas = Schema._add_subschema(name, schema, subschemas)
+         return Schema.of(
+             schema=subschemas,
+             schema_id=self.id + 1,
+         )
+
+     def delete_subschema(self, name: SchemaName) -> Schema:
+         subschemas = copy.copy(self.subschemas)
+         subschemas = Schema._del_subschema(name, subschemas)
+         if not subschemas:
+             raise ValueError(f"Deleting `{name}` would leave the schema empty.")
+         subschemas = {name: val.arrow for name, val in subschemas.items()}
+         return Schema.of(
+             schema=subschemas,
+             schema_id=self.id + 1,
+         )
+
+     def replace_subschema(
+         self,
+         name: SchemaName,
+         schema: SingleSchema,
+     ) -> Schema:
+         subschemas = copy.copy(self.subschemas)
+         subschemas = Schema._del_subschema(name, subschemas)
+         subschemas = Schema._add_subschema(name, schema, subschemas)
+         return Schema.of(
+             schema=subschemas,
+             schema_id=self.id + 1,
+         )
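+
+     # Illustrative evolution sketch (hypothetical, not part of the original
+     # source): subschema mutations are copy-on-write; each returns a new
+     # Schema with an incremented schema ID, leaving the original untouched:
+     #   s2 = schema.add_subschema("audit", pa.schema([pa.field("ts", pa.int64())]))
+     #   assert s2.id == schema.id + 1
+     #   s3 = s2.delete_subschema("audit")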
+
+     def field_id(self, name: Union[FieldName, NestedFieldName]) -> FieldId:
+         return Schema._field_name_to_field_id(self.arrow, name)
+
+     def field_name(self, field_id: FieldId) -> Union[FieldName, NestedFieldName]:
+         field = self.field_ids_to_fields[field_id]
+         if len(field.path) == 1:
+             return field.arrow.name
+         return field.path
+
+     def field(self, field_locator: FieldLocator) -> Field:
+         field_id = (
+             field_locator
+             if isinstance(field_locator, FieldId)
+             else self.field_id(field_locator)
+         )
+         return self.field_ids_to_fields[field_id]
+
+     @property
+     def fields(self) -> List[Field]:
+         field_ids_to_fields = self.field_ids_to_fields
+         return list(field_ids_to_fields.values())
+
+     @property
+     def merge_keys(self) -> Optional[List[FieldId]]:
+         return self.get("mergeKeys")
+
+     @property
+     def field_ids_to_fields(self) -> Dict[FieldId, Field]:
+         return self.get("fieldIdsToFields")
+
+     @property
+     def arrow(self) -> pa.Schema:
+         return self["arrow"]
+
+     @property
+     def max_field_id(self) -> FieldId:
+         return self["maxFieldId"]
+
+     @property
+     def id(self) -> SchemaId:
+         return Schema._schema_id(self.arrow)
+
+     def subschema(self, name: SchemaName) -> Optional[Schema]:
+         subschemas = self.subschemas
+         return subschemas.get(name) if subschemas else None
+
+     @property
+     def subschemas(self) -> Dict[SchemaName, Schema]:
+         # return cached subschemas first if they exist
+         subschemas = self.get("subschemas")
+         if not subschemas:
+             # retrieve any defined subschemas
+             subschemas_to_field_ids = self.subschemas_to_field_ids
+             # rebuild and return the subschema cache
+             if subschemas_to_field_ids:
+                 subschemas = {
+                     schema_name: Schema.of(
+                         schema=pa.schema(
+                             [self.field(field_id).arrow for field_id in field_ids]
+                         ),
+                         schema_id=self.id,
+                         native_object=self.native_object,
+                     )
+                     for schema_name, field_ids in subschemas_to_field_ids.items()
+                 }
+                 self["subschemas"] = subschemas
+         return subschemas or {}
+
+     def subschema_field_ids(self, name: SchemaName) -> Optional[List[FieldId]]:
+         return (self.subschemas_to_field_ids or {}).get(name)
+
+     @property
+     def subschemas_to_field_ids(self) -> Optional[Dict[SchemaName, List[FieldId]]]:
+         return Schema._subschemas(self.arrow)
+
+     @property
+     def native_object(self) -> Optional[Any]:
+         return self.get("nativeObject")
+
+     @staticmethod
+     def _schema_id(schema: pa.Schema) -> Optional[SchemaId]:
+         schema_id = None
+         if schema.metadata:
+             bytes_val = schema.metadata.get(SCHEMA_ID_KEY_NAME)
+             schema_id = int(bytes_val.decode()) if bytes_val else None
+         return schema_id
+
+     @staticmethod
+     def _subschemas(
+         schema: pa.Schema,
+     ) -> Optional[Dict[SchemaName, List[FieldId]]]:
+         subschemas = None
+         if schema.metadata:
+             bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
+             subschemas = msgpack.loads(bytes_val) if bytes_val else None
+         return subschemas
+
+     @staticmethod
+     def _field_name_to_field_id(
+         schema: pa.Schema,
+         name: Union[FieldName, NestedFieldName],
+     ) -> FieldId:
+         if isinstance(name, str):
+             return Field.of(schema.field(name)).id
+         if isinstance(name, List):
+             if not len(name):
+                 raise ValueError(f"Nested field name `{name}` is empty.")
+             # resolve the top-level field, then walk down its nested children
+             field = schema.field(name[0])
+             for part in name[1:]:
+                 field = field.type.field(part)
+             return Field.of(field).id
+         raise ValueError(f"Unknown field name type: {type(name)}")
+
+     @staticmethod
+     def _visit_fields(
+         current: Union[pa.Schema, pa.Field],
+         visit: Callable,
+         path: NestedFieldName = [],
+         *args,
+         **kwargs,
+     ) -> None:
+         """
+         Recursively visits all fields in a PyArrow schema, including nested
+         fields.
+
+         Args:
+             current (pa.Schema or pa.Field): The schema or field to visit.
+             visit (Callable): A function that visits the current field.
+             path (NestedFieldName): The current path to the field.
+             *args: Additional args to pass to the visit function.
+             **kwargs: Additional keyword args to pass to the visit function.
+         Returns:
+             None
+         """
+         if isinstance(current, pa.Schema):
+             for field in current:
+                 Schema._visit_fields(
+                     field,
+                     visit,
+                     path,
+                     *args,
+                     **kwargs,
+                 )
+         elif isinstance(current, pa.Field):
+             path.append(current.name)
+             visit(current, path, *args, **kwargs)
+             if pa.types.is_nested(current.type):
+                 # nested-type checks must run against the field's type, not
+                 # the field itself
+                 if isinstance(current.type, pa.StructType):
+                     for field in current.type:
+                         Schema._visit_fields(
+                             field,
+                             visit,
+                             path,
+                             *args,
+                             **kwargs,
+                         )
+                 elif isinstance(current.type, pa.ListType):
+                     Schema._visit_fields(
+                         current.type.value_field,
+                         visit,
+                         path,
+                         *args,
+                         **kwargs,
+                     )
+                 elif isinstance(current.type, pa.MapType):
+                     Schema._visit_fields(
+                         current.type.key_field,
+                         visit,
+                         path,
+                         *args,
+                         **kwargs,
+                     )
+                     Schema._visit_fields(
+                         current.type.item_field,
+                         visit,
+                         path,
+                         *args,
+                         **kwargs,
+                     )
+             path.pop()
+         else:
+             raise ValueError(f"Unexpected Schema Field Type: {type(current)}")
+
+     @staticmethod
+     def _find_max_field_id(
+         field: pa.Field,
+         path: NestedFieldName,
+         visitor_dict: Dict[str, Any],
+     ) -> None:
+         max_field_id = max(
+             visitor_dict.get("maxFieldId", 0),
+             Field.of(field).id or 0,
+         )
+         visitor_dict["maxFieldId"] = max_field_id
+
+     @staticmethod
+     def _populate_fields(
+         field: pa.Field,
+         path: NestedFieldName,
+         visitor_dict: Dict[str, Any],
+     ) -> None:
+         field_ids_to_fields = visitor_dict["fieldIdsToFields"]
+         max_field_id = (
+             visitor_dict["maxFieldId"] + len(field_ids_to_fields)
+         ) % MAX_FIELD_ID_EXCLUSIVE
+         dc_field = Field.of(field)
+         if dc_field is not None and dc_field.id is not None:
+             field_id = dc_field.id
+         else:
+             field_id = max_field_id
+
+         if (dupe := field_ids_to_fields.get(field_id)) is not None:
+             raise ValueError(
+                 f"Duplicate field id {field_id} for field: {field} "
+                 f"Already assigned to field: {dupe}"
+             )
+         field = Field.of(
+             field=field,
+             field_id=field_id,
+             path=path,
+         )
+         field_ids_to_fields[field_id] = field
+
+     @staticmethod
+     def _get_lower_case_field_names(
+         schema: SingleSchema,
+     ) -> List[str]:
+         if isinstance(schema, pa.Schema):
+             return [name.lower() for name in schema.names]
+         elif isinstance(schema, List):  # List[Field]
+             names = [f.arrow.name.lower() for f in schema if isinstance(f, Field)]
+             if len(names) == len(schema):
+                 return names  # all items in the list are valid Field objects
+         raise ValueError(f"Unsupported schema argument: {schema}")
+
+     @staticmethod
+     def _validate_schema_name(name: str) -> None:
+         if not name:
+             raise ValueError("Schema name cannot be empty.")
+         if len(name) > BYTES_PER_KIBIBYTE:
+             raise ValueError(
+                 f"Invalid schema name `{name}`. Schema names "
+                 f"cannot be greater than {BYTES_PER_KIBIBYTE} "
+                 f"characters."
+             )
+
+     @staticmethod
+     def _validate_field_names(
+         schema: Union[SingleSchema, MultiSchema],
+     ) -> None:
+         all_names = []
+         if isinstance(schema, dict):  # MultiSchema
+             for schema_name, val in schema.items():
+                 Schema._validate_schema_name(schema_name)
+                 all_names.extend(Schema._get_lower_case_field_names(val))
+         else:  # SingleSchema
+             all_names.extend(Schema._get_lower_case_field_names(schema))
+         if not all_names:
+             raise ValueError("Schema must contain at least one field.")
+         name_set = set()
+         dupes = []
+         for name in all_names:
+             dupes.append(name) if name in name_set else name_set.add(name)
+         if dupes:
+             raise ValueError(
+                 f"Expected all schema fields to have unique names "
+                 f"(case-insensitive), but found the following duplicates: "
+                 f"{dupes}"
+             )
+
+     @staticmethod
+     def _to_pyarrow_schema(schema: SingleSchema) -> pa.Schema:
+         if isinstance(schema, pa.Schema):
+             return schema
+         elif isinstance(schema, List):  # List[Field]
+             return pa.schema(fields=[field.arrow for field in schema])
+         else:
+             raise ValueError(f"Unsupported schema base type: {schema}")
+
+     @staticmethod
+     def _to_unified_pyarrow_schema(
+         schema: Union[SingleSchema, MultiSchema],
+     ) -> Tuple[pa.Schema, Dict[SchemaName, List[FieldName]]]:
+         # first, ensure all field names are valid and contain no duplicates
+         Schema._validate_field_names(schema)
+         # now union all schemas into a single schema
+         subschema_to_field_names = {}
+         if isinstance(schema, dict):  # MultiSchema
+             all_schemas = []
+             for schema_name, schema_val in schema.items():
+                 pyarrow_schema = Schema._to_pyarrow_schema(schema_val)
+                 all_schemas.append(pyarrow_schema)
+                 subschema_to_field_names[schema_name] = [
+                     field.name for field in pyarrow_schema
+                 ]
+             return pa.unify_schemas(all_schemas), subschema_to_field_names
+         return Schema._to_pyarrow_schema(schema), {}  # SingleSchema
+
+     @staticmethod
+     def _del_subschema(
+         name: SchemaName,
+         subschemas: Dict[SchemaName, Schema],
+     ) -> Dict[SchemaName, Schema]:
+         deleted_subschema = subschemas.pop(name, None)
+         if deleted_subschema is None:
+             raise ValueError(f"Subschema `{name}` does not exist.")
+         return subschemas
+
+     @staticmethod
+     def _add_subschema(
+         name: SchemaName,
+         schema: SingleSchema,
+         subschemas: Dict[SchemaName, Schema],
+     ) -> Dict[SchemaName, Schema]:
+         Schema._validate_schema_name(name)
+         if name == BASE_SCHEMA_NAME:
+             raise ValueError(
+                 f"Cannot add subschema with reserved name: {BASE_SCHEMA_NAME}"
+             )
+         if name in subschemas:
+             raise ValueError(f"Subschema `{name}` already exists.")
+         for key, val in subschemas.items():
+             subschemas[key] = val.arrow
+         subschemas[name] = schema
+         return subschemas
+
+
+ class SchemaList(List[Schema]):
+     @staticmethod
+     def of(items: List[Schema]) -> SchemaList:
+         typed_items = SchemaList()
+         for item in items:
+             if item is not None and not isinstance(item, Schema):
+                 item = Schema(item)
+             typed_items.append(item)
+         return typed_items
+
+     def __getitem__(self, item):
+         val = super().__getitem__(item)
+         if val is not None and not isinstance(val, Schema):
+             self[item] = val = Schema(val)
+         return val
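+
+
+ # Illustrative usage sketch (hypothetical, not part of the original source):
+ # SchemaList coerces plain dicts (e.g. from deserialized metadata) back into
+ # Schema instances, both at construction time and on item access:
+ #   schemas = SchemaList.of([dict(schema)])
+ #   assert isinstance(schemas[0], Schema)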