deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/rivulet/schema/datatype.py
@@ -0,0 +1,128 @@
+ # Similar to daft's datatype, this is a big ole enum of all possible types
+ # In the long term, this will have to be interoperable with pandas/daft/spark/parquet/iceberg/etc type systems
+ # Our Spec will need to publish data type mappings, such as Iceberg's data type mappings: https://iceberg.apache.org/spec/#file-system-operations
+ # It also has the unique responsibility of representing multi-modal (e.g. image) types
+ from dataclasses import dataclass
+ from typing import Optional
+
+ import pyarrow as pa
+
+
+ # OPEN QUESTIONS:
+ # Do we want to support the notion of logical vs physical type like parquet?
+
+ # TODO: turn into an interface or otherwise allow pluggable datatypes
+ @dataclass(frozen=True)
+ class Datatype:
+     type_name: str
+
+     @property
+     def subtype(self) -> Optional[str]:
+         """
+         Higher-level formats like binary or image have a "subtype", such as image(jpg) or binary(np_array).
+         TODO - Note that we are replacing this schema system with the DeltaCat schema model, which supports extended/decorated pyarrow types.
+         For now this is a minimal implementation covering parameterized types like binary and image.
+         :return: Subtype if it exists, or None
+         """
+         if not self.type_name.endswith(")"):
+             return None
+         if self.type_name.startswith("binary(") or self.type_name.startswith("image("):
+             return self.type_name[self.type_name.find("(") + 1 : -1]
+         return None
+
+     @classmethod
+     def binary(cls, binary_format):
+         """
+         :param binary_format: format label embedded in the type name, e.g. binary(np_array)
+         :return: Datatype with the given binary subtype
+         """
+         return cls(type_name=f"binary({binary_format})")
+
+     @classmethod
+     def image(cls, image_format):
+         return cls(type_name=f"image({image_format})")
+
+     @classmethod
+     def string(cls):
+         return cls(type_name="string")
+
+     @classmethod
+     def float(cls):
+         return cls(type_name="float")
+
+     @classmethod
+     def int16(cls):
+         return cls(type_name="int16")
+
+     @classmethod
+     def int32(cls):
+         return cls(type_name="int32")
+
+     @classmethod
+     def int64(cls):
+         return cls(type_name="int64")
+
+     @classmethod
+     def bool(cls):
+         return cls(type_name="bool")
+
+     @classmethod
+     def from_pyarrow(cls, pa_type: pa.DataType) -> "Datatype":
+         """
+         Convert a pyarrow type to a Rivulet Datatype.
+
+         Args:
+             pa_type: pyarrow DataType to convert
+
+         Returns:
+             Datatype: Corresponding Rivulet Datatype
+
+         Raises:
+             ValueError: If the pyarrow type is not supported
+         """
+         if pa.types.is_string(pa_type):
+             return cls.string()
+         elif pa.types.is_float64(pa_type):
+             return cls.float()
+         elif pa.types.is_int16(pa_type):
+             return cls.int16()
+         elif pa.types.is_int32(pa_type):
+             return cls.int32()
+         elif pa.types.is_int64(pa_type):
+             return cls.int64()
+         elif pa.types.is_boolean(pa_type):
+             return cls.bool()
+         elif pa.types.is_binary(pa_type):
+             # TODO: Use pyarrow metadata on the schema field to map correctly into image and other binary types
+             return cls.binary("binary")  # Default binary format
+         else:
+             raise ValueError(f"Unsupported pa type: {pa_type}")
+
+     def to_pyarrow(self) -> pa.DataType:
+         """
+         In the future we want to be more thoughtful about how we do type conversions.
+
+         For now, just build a simple mapping of every type to pyarrow.
+         For what it's worth, Daft schema types have a giant if/else like this.
+
+         :return: pyarrow type
+         """
+         if self.type_name == "string":
+             return pa.string()
+         elif self.type_name == "float":
+             return pa.float64()
+         elif self.type_name == "int16":
+             return pa.int16()
+         elif self.type_name == "int32":
+             return pa.int32()
+         elif self.type_name == "int64":
+             return pa.int64()
+         elif self.type_name == "bool":
+             return pa.bool_()
+         elif self.type_name.startswith("image(") or self.type_name.startswith("binary("):
+             # TODO: we will need to think about how custom types work with tabular libraries
+             return pa.binary()
+         else:
+             raise ValueError(f"Unsupported type conversion to pa: {self.type_name}")
deltacat/storage/rivulet/schema/schema.py
@@ -0,0 +1,251 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, asdict
+ from typing import MutableMapping, Dict, Iterable, Tuple, Optional
+
+ import pyarrow as pa
+
+ from deltacat.storage.rivulet.schema.datatype import Datatype
+
+
+ @dataclass(frozen=True)
+ class Field:
+     name: str
+     datatype: Datatype
+     is_merge_key: bool = False
+
+
+ class Schema(MutableMapping[str, Field]):
+     """
+     A mutable mapping representing a schema for structured data, requiring at least one merge key field.
+
+     TODO FUTURE ITERATIONS
+     1. We may use DeltaCat for schema
+     2. We almost certainly want our schema system based on arrow types,
+        since many libraries we are integrating with (e.g. daft) are
+        interoperable with arrow schemas
+
+     Attributes:
+         name: The name of the schema (for storing in dict/map)
+         _fields (dict): Maps field names to Field objects.
+
+     Methods:
+         from_pyarrow(pyarrow_schema: pa.Schema, key: str) -> Schema:
+             Creates a Schema instance from a PyArrow schema.
+
+         __len__() -> int: Returns number of fields.
+         __getitem__(key: str) -> Field: Gets field by name.
+         __setitem__(key: str, value: Field | Datatype): Adds/updates field.
+         __delitem__(key: str): Deletes field if not a merge key.
+         __iter__(): Iterates over fields.
+
+         add_field(field: Field): Adds a Field using its name as the key.
+         to_pyarrow() -> pa.Schema:
+             Converts schema to PyArrow format.
+
+         keys(): Returns field names.
+         values(): Returns Field objects.
+         items(): Returns (name, Field) pairs.
+     """
+
+     def __init__(
+         self,
+         fields: Iterable[Tuple[str, Datatype] | Field] = None,
+         merge_keys: Optional[Iterable[str]] = None,
+     ):
+         self._fields: Dict[str, Field] = {}
+         merge_keys = set(merge_keys or [])
+         if len(fields or []) == 0:
+             if len(merge_keys) > 0:
+                 raise TypeError(
+                     "It is invalid to specify merge keys when no fields are specified. Add fields or remove the merge keys."
+                 )
+             return
+         # Convert all input tuples to Field objects and add them to the schema
+         for field in fields:
+             if isinstance(field, tuple):
+                 name, datatype = field
+                 processed_field = Field(
+                     name=name, datatype=datatype, is_merge_key=(name in merge_keys)
+                 )
+             elif isinstance(field, Field):
+                 processed_field = field
+                 name = field.name
+                 # Check whether the field's merge key status conflicts with merge_keys
+                 if len(merge_keys) > 0:
+                     expected_merge_key_status = name in merge_keys
+                     if processed_field.is_merge_key != expected_merge_key_status:
+                         raise TypeError(
+                             f"Merge key status conflict for field '{name}': "
+                             f"Provided as merge key: {expected_merge_key_status}, "
+                             f"Field's current status: {processed_field.is_merge_key}. "
+                             f"Merge keys should only be defined if raw (name, Datatype) tuples are used."
+                         )
+             else:
+                 raise TypeError(f"Unexpected field type: {type(field)}")
+             self.add_field(processed_field)
+
+     @classmethod
+     def from_dict(cls, data) -> Schema:
+         fields = [
+             Field(
+                 name=field_data["name"],
+                 datatype=Datatype(**field_data["datatype"])
+                 if isinstance(field_data["datatype"], dict)
+                 else field_data["datatype"],
+                 is_merge_key=field_data["is_merge_key"],
+             )
+             for field_data in data["fields"]
+         ]
+         return cls(fields)
+
+     @classmethod
+     def from_pyarrow(
+         cls, pyarrow_schema: pa.Schema, merge_keys: str | Iterable[str] = None
+     ) -> Schema:
+         """
+         Create a Schema instance from a PyArrow schema.
+
+         Args:
+             pyarrow_schema: PyArrow Schema to convert
+             merge_keys: The optional set of merge keys to add to the schema as it is translated.
+                 These keys must be present in the schema.
+
+         Returns:
+             Schema: New Schema instance
+
+         Raises:
+             ValueError: If a merge key is not found in the schema
+         """
+         merge_keys = {merge_keys} if isinstance(merge_keys, str) else set(merge_keys or [])
+         fields = {}
+
+         for field in pyarrow_schema:
+             dtype = Datatype.from_pyarrow(field.type)
+             fields[field.name] = Field(
+                 field.name, dtype, is_merge_key=(field.name in merge_keys)
+             )
+
+         # Validate that the defined merge_keys are present in the fields being added
+         missing_keys = merge_keys - fields.keys()
+         if missing_keys:
+             raise ValueError(
+                 f"The following merge keys were not found in the provided schema: {', '.join(missing_keys)}"
+             )
+
+         return cls(fields.values())
+
+     @classmethod
+     def merge_all(cls, schemas: Iterable[Schema]) -> Schema:
+         """Merges a list of schemas into a new schema"""
+         merged = cls({})
+         for schema in schemas:
+             merged.merge(schema)
+         return merged
+
+     def __getitem__(self, key: str) -> Field:
+         return self._fields[key]
+
+     def __setitem__(
+         self, key: str, value: Field | Datatype | Tuple[Datatype, bool]
+     ) -> None:
+         # Create a Field from a Field, a Datatype, or a (Datatype, bool) tuple, where the bool marks a merge key
+         if isinstance(value, Field):
+             processed_field = value
+         elif isinstance(value, Datatype):
+             processed_field = Field(key, value)  # is_merge_key is always False in this case
+         elif isinstance(value, tuple):
+             (datatype, merge_key) = value
+             processed_field = Field(key, datatype, merge_key)
+         else:
+             raise TypeError(
+                 "The field must be an instance of the Field class, Datatype, or Tuple[Datatype, bool], where bool is whether the field is a merge key."
+             )
+         # if len(self._fields) == 0 and not processed_field.is_merge_key:
+         #     raise TypeError("The first field set on a Schema must be a merge key.")
+
+         self._fields[processed_field.name] = processed_field
+
+     def __delitem__(self, key: str) -> None:
+         field = self._fields[key]
+         if field.is_merge_key:
+             raise ValueError("Cannot delete a merge key field")
+         del self._fields[key]
+
+     def __len__(self) -> int:
+         return len(self._fields)
+
+     def __iter__(self) -> Iterable[str]:
+         return iter(self._fields.keys())
+
+     def __hash__(self) -> int:
+         return hash(frozenset(self._fields.items()))
+
+     def __eq__(self, other) -> bool:
+         if isinstance(other, Schema):
+             return self._fields == other._fields
+         return False
+
+     # Has a spurious type-check problem with @dataclass + asdict(): https://youtrack.jetbrains.com/issue/PY-76059/Incorrect-Type-warning-with-asdict-and-Dataclass
+     def to_dict(self) -> dict[str, list[dict[str, Field]]]:
+         return {"fields": [asdict(field) for field in self._fields.values()]}
+
+     def add_field(self, field: Field) -> None:
+         """Adds a Field object using its name as the key; raises ValueError if it already exists"""
+         if field.name in self._fields:
+             raise ValueError(
+                 f"Attempting to add a field with the same name as an existing field: {field.name}"
+             )
+         self[field.name] = field
+
+     def get_merge_keys(self) -> Iterable[str]:
+         """Return a list of all merge keys."""
+         return [field.name for field in self._fields.values() if field.is_merge_key]
+
+     def get_merge_key(self) -> str:
+         """Returns the single merge key if there is exactly one, or raises if not. Used for simple schemas with a single key"""
+         merge_keys = list(self.get_merge_keys())
+         if len(merge_keys) != 1:
+             raise ValueError(
+                 f"Schema must have exactly one merge key, but found {merge_keys}"
+             )
+         return merge_keys[0]
+
+     def merge(self, other: Schema) -> None:
+         """Merges another schema's fields into the current schema."""
+         if not other:
+             return
+         for name, field in other._fields.items():
+             if name in self._fields:
+                 if self._fields[name] != field:
+                     raise ValueError(
+                         f"Field '{name}' already exists in the current schema with a different definition"
+                     )
+             else:
+                 self.add_field(field)
+
+     def to_pyarrow(self) -> pa.Schema:
+         """
+         Convert the Schema to a PyArrow schema.
+
+         Returns:
+             pyarrow.Schema: A PyArrow schema representation of this Schema.
+         """
+         # TODO: Should we track merge_keys as the schema goes to/from pyarrow?
+         fields = []
+         for name, field in self._fields.items():
+             fields.append(pa.field(name, field.datatype.to_pyarrow()))
+         return pa.schema(fields)
+
+     def keys(self) -> Iterable[str]:
+         return self._fields.keys()
+
+     def values(self) -> Iterable[Field]:
+         return self._fields.values()
+
+     def items(self) -> Iterable[tuple[str, Field]]:
+         return self._fields.items()
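
A minimal sketch of the Schema API introduced above, assuming the top-level re-export `from deltacat.storage.rivulet import Schema` (the same import serializer_factory.py uses); it only touches the constructor, from_pyarrow, the MutableMapping behavior, and to_pyarrow shown in this hunk:

import pyarrow as pa
from deltacat.storage.rivulet import Schema
from deltacat.storage.rivulet.schema.datatype import Datatype

# Build a schema from (name, Datatype) tuples, marking "id" as the merge key.
schema = Schema(
    [("id", Datatype.int64()), ("name", Datatype.string())],
    merge_keys=["id"],
)
print(schema.get_merge_key())  # "id"

# Translate an existing PyArrow schema, declaring the merge key up front.
arrow_schema = pa.schema([("id", pa.int64()), ("score", pa.float64())])
schema2 = Schema.from_pyarrow(arrow_schema, merge_keys={"id"})

# MutableMapping behavior: add a non-key field, then convert back to PyArrow.
schema2["label"] = Datatype.string()
print(schema2.to_pyarrow())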
deltacat/storage/rivulet/serializer.py
@@ -0,0 +1,40 @@
+ from typing import Protocol, Iterable, List, Union, Any, Dict
+
+ from deltacat.storage.rivulet.metastore.sst import SSTableRow
+ import pyarrow as pa
+
+ MEMTABLE_DATA = Union[Iterable[Dict[str, Any]], pa.Table]
+
+
+ class DataSerializer(Protocol):
+     """
+     Interface for writing data only.
+
+     As data is written, it must emit sufficient metadata to build an SSTable.
+     Each format will have a specific data writer (e.g. ParquetDataWriter).
+
+     TODO future improvements:
+     1. How does the data writer control whether it writes to existing files vs new files?
+        For now, we will not expose this configuration and always write each batch to
+        a new file.
+     2. Related to 1, how should we expose the URI(s) to write to? Probably DataWriter can
+        use FileProvider and needs to know relevant ids like task ID.
+     """
+
+     def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
+         """
+         Flushes rows to a file, and returns the metadata needed to build an SSTable.
+
+         TODO future improvements
+         1. Finalize the type for input records (instead of MvpRow). Options could be:
+            (a) Something like Iceberg "StructLike", which allows flexible integrations without memcopy for row-oriented formats, e.g. Spark's InternalRow can be made StructLike
+            (b) Use arrow. We will probably use arrow for writing parquet, although
+                it probably isn't ideal for row-oriented formats.
+         2. Keep in mind, most implementations of DataWriter will be written in Rust.
+
+         :param sorted_records: Records sorted by key
+         :return: metadata used to build the SSTable
+         """
+         ...
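
To illustrate the Protocol contract (structural typing, no inheritance required), here is a hypothetical no-op serializer that satisfies DataSerializer; it is not part of the package, and SSTableRow construction is elided because that class lives in metastore/sst.py, which is not shown in this hunk:

from typing import List

from deltacat.storage.rivulet.metastore.sst import SSTableRow
from deltacat.storage.rivulet.serializer import DataSerializer, MEMTABLE_DATA


class NoOpSerializer:
    """Hypothetical serializer: accepts a batch and reports no SSTable rows."""

    def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
        # A real serializer (e.g. ParquetDataSerializer) would write the batch to a
        # file here and return one SSTableRow per flushed file.
        return []


# Structural typing: NoOpSerializer is accepted wherever a DataSerializer is expected.
serializer: DataSerializer = NoOpSerializer()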
deltacat/storage/rivulet/serializer_factory.py
@@ -0,0 +1,42 @@
+ from __future__ import annotations
+
+ from deltacat.storage.rivulet.parquet.serializer import ParquetDataSerializer
+ from deltacat.storage.rivulet import Schema
+ from deltacat.storage.rivulet.serializer import DataSerializer
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
+
+ from deltacat.storage.rivulet.feather.serializer import FeatherDataSerializer
+
+
+ class DataSerializerFactory:
+     """
+     Simple factory class for getting the appropriate serializer given a schema.
+     TODO: make this more modular/pluggable like DatasetReaderRegistrar.
+     This will be more challenging to make pluggable, because we should not rely on a simple 1:1 mapping of type to serializer.
+     The actual logic for determining how to serialize a given schema may be complex,
+     e.g.: if the schema contains datatype X, you must use serializer Y; otherwise, default to serializer Z.
+     """
+
+     @classmethod
+     def get_serializer(
+         cls,
+         schema: Schema,
+         file_provider: FileProvider,
+         user_provided_format: str | None = None,
+     ) -> DataSerializer:
+         if user_provided_format == "parquet":
+             return ParquetDataSerializer(file_provider, schema)
+         elif user_provided_format == "feather":
+             return FeatherDataSerializer(file_provider, schema)
+         elif user_provided_format is not None:
+             raise ValueError("Unsupported format. Must be 'parquet' or 'feather'.")
+
+         # Default engine logic: for now, if the schema has an image or binary field, use feather
+         has_binary_or_image = any(
+             field.datatype.type_name.startswith(("binary", "image"))
+             for field in schema.values()
+         )
+         if has_binary_or_image:
+             return FeatherDataSerializer(file_provider, schema)
+         else:
+             return ParquetDataSerializer(file_provider, schema)
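
A sketch of how the factory's selection logic plays out, assuming a FileProvider instance is already available (its constructor lives in fs/file_provider.py and is not shown in this hunk):

from deltacat.storage.rivulet import Schema
from deltacat.storage.rivulet.schema.datatype import Datatype
from deltacat.storage.rivulet.serializer_factory import DataSerializerFactory

# file_provider: a FileProvider built elsewhere; construction is not shown here.
file_provider = ...

# Tabular-only schema -> the factory defaults to the Parquet serializer.
tabular = Schema(
    [("id", Datatype.int64()), ("name", Datatype.string())], merge_keys=["id"]
)
serializer = DataSerializerFactory.get_serializer(tabular, file_provider)

# A binary/image field flips the default to the Feather serializer.
multimodal = Schema(
    [("id", Datatype.int64()), ("photo", Datatype.image("jpg"))], merge_keys=["id"]
)
serializer = DataSerializerFactory.get_serializer(multimodal, file_provider)

# An explicit format always wins; anything other than "parquet"/"feather" raises ValueError.
serializer = DataSerializerFactory.get_serializer(tabular, file_provider, "feather")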
deltacat/storage/rivulet/writer/__init__.py (file without changes)
deltacat/storage/rivulet/writer/dataset_writer.py
@@ -0,0 +1,29 @@
+ from typing import Protocol, Iterable, Union, Any, Dict
+ import pyarrow as pa
+
+ DATA = Union[Iterable[Dict[str, Any]], Iterable[pa.RecordBatch], pa.RecordBatch]
+
+
+ class DatasetWriter(Protocol):
+     """
+     Top-level interface for writing records to a rivulet dataset. This is used by dataset.py.
+
+     This writes both data AND metadata (SSTs, manifests).
+
+     The general paradigm is that records are written iteratively through write or write_batch.
+     At configurable intervals (based on record count or size), data and metadata get flushed.
+
+     When the user either closes the dataset writer or calls commit(), this triggers all buffered data and metadata to be flushed.
+     """
+
+     def write(self, record: DATA) -> None:
+         ...
+
+     def flush(self) -> str:
+         """
+         Explicitly flush any buffered data and metadata and commit to the dataset.
+
+         This is a blocking operation.
+
+         :return: URI of the manifest written for the commit
+         """
+         ...
+ ...