deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/storage/model/shard.py (new file)
@@ -0,0 +1,47 @@
+ from abc import abstractmethod
+ from typing import Iterable, Optional, Protocol, TypeVar, Union
+
+ from deltacat.storage.rivulet.reader.dataset_metastore import DatasetMetastore
+
+ # TODO: Add type validation in dataset/schema classes
+ T = TypeVar("T", bound=Union[int, str])
+
+
+ class Shard(Protocol[T]):
+     """
+     Abstract base class representing a shard with defined inclusive boundaries.
+
+     A shard represents a logical partition of data, defined by its
+     minimum and maximum keys. These keys determine the range of data
+     within a dataset that the shard encompasses.
+     """
+
+     min_key: Optional[T]
+     max_key: Optional[T]
+
+
+ class ShardingStrategy(Protocol):
+     """
+     A sharding strategy determines how the dataset is divided into shards.
+     """
+
+     @staticmethod
+     def from_string(strategy: str) -> "ShardingStrategy":
+         """
+         Factory method to create the appropriate ShardingStrategy from a string.
+
+         param: strategy: The string representation of the sharding strategy.
+         return: ShardingStrategy class.
+         """
+         if strategy == "range":
+             from deltacat.storage.rivulet.shard.range_shard import RangeShardingStrategy
+
+             return RangeShardingStrategy()
+         else:
+             raise ValueError(f"Unsupported sharding strategy type: {strategy}")
+
+     @abstractmethod
+     def shards(self, num_shards: int, metastore: DatasetMetastore) -> Iterable[Shard]:
+         """
+         Generate the shards based on the chosen strategy.
+         """
deltacat/storage/model/sort_key.py
@@ -1,33 +1,190 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations

- from enum import Enum
+ from typing import Optional, Any, List, Tuple, Dict

+ from pyarrow.compute import SortOptions

- class SortOrder(str, Enum):
-     ASCENDING = "ascending"
-     DESCENDING = "descending"
+ from deltacat.storage.model.types import (
+     SortOrder,
+     NullOrder,
+ )
+ from deltacat.storage.model.schema import FieldLocator
+ from deltacat.storage.model.transform import Transform


  class SortKey(tuple):
      @staticmethod
-     def of(key_name: str, sort_order: SortOrder = SortOrder.ASCENDING) -> SortKey:
+     def of(
+         key: Optional[List[FieldLocator]],
+         sort_order: SortOrder = SortOrder.ASCENDING,
+         null_order: NullOrder = NullOrder.AT_END,
+         transform: Optional[Transform] = None,
+         native_object: Optional[Any] = None,
+     ) -> SortKey:
          """
          Create a sort key from a field name to use as the sort key, and
          the sort order for this key. If no sort order is specified, then the
-         data will be sorted in ascending order by default. Note that compaction
-         always keeps the LAST occurrence of this key post-sort. For example, if
-         you used an integer column as your sort key which contained the values
-         [2, 1, 3] specifying SortOrder.ASCENDING would ensure that the
-         value [3] is kept over [2, 1], and specifying SortOrder.DESCENDING
-         would ensure that [1] is kept over [2, 3].
+         data will be sorted in ascending order by default.
          """
-         return SortKey((key_name, sort_order.value))
+         return SortKey(
+             (
+                 key,
+                 sort_order.value if isinstance(sort_order, SortOrder) else sort_order,
+                 null_order.value if isinstance(null_order, NullOrder) else null_order,
+                 transform,
+                 native_object,
+             )
+         )
+
+     def equivalent_to(
+         self,
+         other: SortKey,
+     ):
+         if other is None:
+             return False
+         if not isinstance(other, tuple):
+             return False
+         if not isinstance(other, SortKey):
+             other = SortKey(other)
+         return (
+             self.key == other.key
+             and self.transform == other.transform
+             and self.sort_order == other.sort_order
+             and self.null_order == other.null_order
+         )

      @property
-     def key_name(self) -> str:
+     def key(self) -> Optional[List[FieldLocator]]:
          return self[0]

      @property
      def sort_order(self) -> SortOrder:
          return SortOrder(self[1])
+
+     @property
+     def null_order(self) -> NullOrder:
+         return NullOrder(self[2])
+
+     @property
+     def transform(self) -> Optional[Transform]:
+         val: Dict[str, Any] = (
+             Transform(self[3]) if len(self) >= 4 and self[3] is not None else None
+         )
+         return val
+
+     @property
+     def arrow(self) -> List[Tuple[str, str]]:
+         # TODO(pdames): Convert unsupported field locators to arrow field names,
+         # and transforms/multi-key-sorts to pyarrow compute expressions. Add
+         # null order via SortOptions when supported per field by Arrow.
+         return (
+             [(field_locator, self[1]) for field_locator in self[0]] if self[0] else []
+         )
+
+     @property
+     def native_object(self) -> Optional[Any]:
+         return self[4] if len(self) >= 5 else None
+
+
+ class SortKeyList(List[SortKey]):
+     @staticmethod
+     def of(items: List[SortKey]) -> SortKeyList:
+         typed_items = SortKeyList()
+         for item in items:
+             if item is not None and not isinstance(item, SortKey):
+                 item = SortKey(item)
+             typed_items.append(item)
+         return typed_items
+
+     def __getitem__(self, item):
+         val = super().__getitem__(item)
+         if val is not None and not isinstance(val, SortKey):
+             self[item] = val = SortKey(val)
+         return val
+
+
+ class SortScheme(dict):
+     @staticmethod
+     def of(
+         keys: Optional[SortKeyList],
+         name: Optional[str] = None,
+         scheme_id: Optional[str] = None,
+         native_object: Optional[Any] = None,
+     ) -> SortScheme:
+         return SortScheme(
+             {
+                 "keys": keys,
+                 "name": name,
+                 "id": scheme_id,
+                 "nativeObject": native_object,
+             }
+         )
+
+     def equivalent_to(
+         self,
+         other: SortScheme,
+         check_identifiers: bool = False,
+     ) -> bool:
+         if other is None:
+             return False
+         if not isinstance(other, dict):
+             return False
+         if not isinstance(other, SortScheme):
+             other = SortScheme(other)
+         for i in range(len(self.keys)):
+             if not self.keys[i].equivalent_to(other.keys[i]):
+                 return False
+         return not check_identifiers or (
+             self.name == other.name and self.id == other.id
+         )
+
+     @property
+     def keys(self) -> Optional[SortKeyList]:
+         val: List[SortKey] = self.get("keys")
+         if val is not None and not isinstance(val, SortKeyList):
+             self["keys"] = val = SortKeyList.of(val)
+         return val
+
+     @property
+     def name(self) -> Optional[str]:
+         return self.get("name")
+
+     @property
+     def id(self) -> Optional[str]:
+         return self.get("id")
+
+     @property
+     def arrow(self) -> SortOptions:
+         # TODO(pdames): Remove homogenous null ordering when supported by Arrow.
+         if self.keys:
+             if len(set([key.null_order for key in self.keys])) == 1:
+                 return SortOptions(
+                     sort_keys=[pa_key for k in self.keys for pa_key in k.arrow],
+                     null_placement=self.keys[0].null_order.value,
+                 )
+             else:
+                 err_msg = "All arrow sort keys must use the same null order."
+                 raise ValueError(err_msg)
+         return SortOptions()
+
+     @property
+     def native_object(self) -> Optional[Any]:
+         return self.get("nativeObject")
+
+
+ class SortSchemeList(List[SortScheme]):
+     @staticmethod
+     def of(items: List[SortScheme]) -> SortSchemeList:
+         typed_items = SortSchemeList()
+         for item in items:
+             if item is not None and not isinstance(item, SortScheme):
+                 item = SortScheme(item)
+             typed_items.append(item)
+         return typed_items
+
+     def __getitem__(self, item):
+         val = super().__getitem__(item)
+         if val is not None and not isinstance(val, SortScheme):
+             self[item] = val = SortScheme(val)
+         return val
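
The reworked SortKey/SortScheme API above takes a list of field locators per key plus an explicit null order, and SortScheme.arrow lowers the scheme to a pyarrow.compute.SortOptions whenever all keys agree on null placement. A rough usage sketch, assuming plain column-name strings are acceptable field locators and that the SortOrder/NullOrder enums in deltacat.storage.model.types keep lower-case string values:

from deltacat.storage.model.sort_key import SortKey, SortScheme
from deltacat.storage.model.types import NullOrder, SortOrder

# Each sort key now wraps a list of field locators (assumed here to be plain
# column names), a sort order, and a null order.
keys = [
    SortKey.of(key=["region"], sort_order=SortOrder.ASCENDING),
    SortKey.of(
        key=["event_time"],
        sort_order=SortOrder.DESCENDING,
        null_order=NullOrder.AT_END,
    ),
]
scheme = SortScheme.of(keys=keys, name="default", scheme_id="sort-scheme-1")

print(keys[0].arrow)                          # e.g. [('region', 'ascending')]
print(keys[0].equivalent_to(scheme.keys[0]))  # True: same key, order, and transform

# Lower the whole scheme to pyarrow.compute.SortOptions; this raises ValueError
# if the keys mix different null orders.
sort_options = scheme.arrow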
deltacat/storage/model/stream.py
@@ -1,31 +1,54 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations

- from typing import Any, Dict, List, Optional
+ import posixpath

- from deltacat.storage.model.locator import Locator
+ import pyarrow
+
+ import deltacat.storage.model.partition as partition
+
+ from typing import Any, Dict, Optional, List
+
+ from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
+ from deltacat.constants import TXN_DIR_NAME
+ from deltacat.storage.model.locator import (
+     Locator,
+     LocatorName,
+ )
  from deltacat.storage.model.namespace import NamespaceLocator
- from deltacat.storage.model.table import TableLocator
+ from deltacat.storage.model.table import (
+     TableLocator,
+     Table,
+ )
  from deltacat.storage.model.table_version import TableVersionLocator
- from deltacat.storage.model.types import CommitState
- from deltacat.storage.model.partition_spec import StreamPartitionSpec, PartitionValues
+ from deltacat.storage.model.types import (
+     CommitState,
+     StreamFormat,
+ )
+

+ class Stream(Metafile):
+     """
+     An unbounded stream of Deltas, where each delta's records are optionally
+     partitioned according to the given partition scheme.
+     """

- class Stream(dict):
      @staticmethod
      def of(
          locator: Optional[StreamLocator],
-         partition_keys: Optional[List[Dict[str, Any]]],
+         partition_scheme: Optional[partition.PartitionScheme],
          state: Optional[CommitState] = None,
-         previous_stream_digest: Optional[bytes] = None,
-         partition_spec: Optional[StreamPartitionSpec] = None,
+         previous_stream_id: Optional[str] = None,
+         watermark: Optional[int] = None,
+         native_object: Optional[Any] = None,
      ) -> Stream:
          stream = Stream()
          stream.locator = locator
-         stream.partition_keys = partition_keys
+         stream.partition_scheme = partition_scheme
          stream.state = state
-         stream.previous_stream_digest = previous_stream_digest
-         stream.partition_spec = partition_spec
+         stream.previous_stream_id = previous_stream_id
+         stream.watermark = watermark
+         stream.native_object = native_object
          return stream

      @property
@@ -40,31 +63,44 @@ class Stream(dict):
          self["streamLocator"] = stream_locator

      @property
-     def partition_keys(self) -> Optional[List[Dict[str, Any]]]:
-         """
-         Ordered list of unique column names in the table schema on
-         which the underlying data is partitioned. Either partition_spec
-         or partition_keys must be specified but not both.
+     def locator_alias(self) -> Optional[StreamLocatorAlias]:
+         return StreamLocatorAlias.of(self)

-         (Deprecated): Partition keys will be deprecated in the favor
-         of partition_spec in future releases.
+     @property
+     def partition_scheme(self) -> Optional[partition.PartitionScheme]:
          """
-         return self.get("partitionKeys")
+         A table's partition keys are defined within the context of a
+         Partition Scheme, which supports defining both fields to partition
+         a table by and optional transforms to apply to those fields to
+         derive the Partition Values that a given field, and its corresponding
+         record, belong to.
+         """
+         val: Dict[str, Any] = self.get("partitionScheme")
+         if val is not None and not isinstance(val, partition.PartitionScheme):
+             self.partition_scheme = val = partition.PartitionScheme(val)
+         return val

-     @partition_keys.setter
-     def partition_keys(self, partition_keys: Optional[List[Dict[str, Any]]]) -> None:
-         self["partitionKeys"] = partition_keys
+     @partition_scheme.setter
+     def partition_scheme(
+         self, partition_scheme: Optional[partition.PartitionScheme]
+     ) -> None:
+         self["partitionScheme"] = partition_scheme

      @property
-     def previous_stream_digest(self) -> Optional[str]:
-         """
-         Previous stream digest
-         """
-         return self.get("previousStreamDigest")
+     def previous_stream_id(self) -> Optional[str]:
+         return self.get("previousStreamId")

-     @previous_stream_digest.setter
-     def previous_stream_digest(self, previous_stream_digest: Optional[str]) -> None:
-         self["previousStreamDigest"] = previous_stream_digest
+     @previous_stream_id.setter
+     def previous_stream_id(self, previous_stream_id: Optional[str]) -> None:
+         self["previousStreamId"] = previous_stream_id
+
+     @property
+     def watermark(self) -> Optional[int]:
+         return self.get("watermark")
+
+     @watermark.setter
+     def watermark(self, watermark: Optional[int]) -> None:
+         self["watermark"] = watermark

      @property
      def state(self) -> Optional[CommitState]:
@@ -79,24 +115,12 @@ class Stream(dict):
          self["state"] = state

      @property
-     def partition_spec(self) -> Optional[StreamPartitionSpec]:
-         """
-         If a table uses complex partitioning instead of identity,
-         partition spec can be specified to define that strategy.
-         For example, a partition spec can define a bucketing strategy
-         on composite column values or can define iceberg compliant
-         bucketing.
+     def native_object(self) -> Optional[Any]:
+         return self.get("nativeObject")

-         Either partition_spec or partition_keys must be specified but not both.
-         """
-         val: Dict[str, Any] = self.get("partitionSpec")
-         if val is not None and not isinstance(val, StreamPartitionSpec):
-             self.partition_spec = val = StreamPartitionSpec(val)
-         return val
-
-     @partition_spec.setter
-     def partition_spec(self, spec: StreamPartitionSpec) -> None:
-         self["partitionSpec"] = spec
+     @native_object.setter
+     def native_object(self, native_object: Optional[Any]) -> None:
+         self["nativeObject"] = native_object

      @property
      def namespace_locator(self) -> Optional[NamespaceLocator]:
@@ -126,6 +150,13 @@ class Stream(dict):
              return stream_locator.stream_id
          return None

+     @property
+     def stream_format(self) -> Optional[str]:
+         stream_locator = self.locator
+         if stream_locator:
+             return stream_locator.format
+         return None
+
      @property
      def namespace(self) -> Optional[str]:
          stream_locator = self.locator
@@ -147,16 +178,65 @@ class Stream(dict):
              return stream_locator.table_version
          return None

-     def validate_partition_values(self, partition_values: Optional[PartitionValues]):
-         # TODO (pdames): ensure value data types match key data types
-         partition_keys = self.partition_keys
-         num_keys = len(partition_keys) if partition_keys else 0
-         num_values = len(partition_values) if partition_values else 0
-         if num_values != num_keys:
-             raise ValueError(
-                 f"Found {num_values} partition values but "
-                 f"{num_keys} partition keys: {self}"
+     def to_serializable(self) -> Stream:
+         serializable = self
+         if serializable.table_locator:
+             serializable: Stream = Stream.update_for(self)
+             # remove the mutable table locator
+             serializable.table_version_locator.table_locator = TableLocator.at(
+                 namespace=self.id,
+                 table_name=self.id,
              )
+         return serializable
+
+     def from_serializable(
+         self,
+         path: str,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+     ) -> Stream:
+         # restore the table locator from its mapped immutable metafile ID
+         if self.table_locator and self.table_locator.table_name == self.id:
+             parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
+                 base_metafile_path=path,
+                 parent_number=2,
+             )
+             txn_log_dir = posixpath.join(
+                 posixpath.dirname(
+                     posixpath.dirname(
+                         posixpath.dirname(parent_rev_dir_path),
+                     )
+                 ),
+                 TXN_DIR_NAME,
+             )
+             table = Table.read(
+                 MetafileRevisionInfo.latest_revision(
+                     revision_dir_path=parent_rev_dir_path,
+                     filesystem=filesystem,
+                     success_txn_log_dir=txn_log_dir,
+                 ).path,
+                 filesystem,
+             )
+             self.table_version_locator.table_locator = table.locator
+         return self
+
+
+ class StreamLocatorName(LocatorName):
+     def __init__(self, locator: StreamLocator):
+         self.locator = locator
+
+     @property
+     def immutable_id(self) -> Optional[str]:
+         return self.locator.stream_id
+
+     @immutable_id.setter
+     def immutable_id(self, immutable_id: Optional[str]):
+         self.locator.stream_id = immutable_id
+
+     def parts(self) -> List[str]:
+         return [
+             self.locator.stream_id,
+             self.locator.format,
+         ]


  class StreamLocator(Locator, dict):
@@ -164,7 +244,7 @@ class StreamLocator(Locator, dict):
      def of(
          table_version_locator: Optional[TableVersionLocator],
          stream_id: Optional[str],
-         storage_type: Optional[str],
+         stream_format: Optional[StreamFormat],
      ) -> StreamLocator:
          """
          Creates a table version Stream Locator. All input parameters are
@@ -173,7 +253,11 @@ class StreamLocator(Locator, dict):
          stream_locator = StreamLocator()
          stream_locator.table_version_locator = table_version_locator
          stream_locator.stream_id = stream_id
-         stream_locator.storage_type = storage_type
+         stream_locator.format = (
+             stream_format.value
+             if isinstance(stream_format, StreamFormat)
+             else stream_format
+         )
          return stream_locator

      @staticmethod
@@ -182,19 +266,31 @@ class StreamLocator(Locator, dict):
          table_name: Optional[str],
          table_version: Optional[str],
          stream_id: Optional[str],
-         storage_type: Optional[str],
+         stream_format: Optional[StreamFormat],
      ) -> StreamLocator:
-         table_version_locator = TableVersionLocator.at(
-             namespace,
-             table_name,
-             table_version,
+         table_version_locator = (
+             TableVersionLocator.at(
+                 namespace,
+                 table_name,
+                 table_version,
+             )
+             if table_version
+             else None
          )
          return StreamLocator.of(
              table_version_locator,
              stream_id,
-             storage_type,
+             stream_format,
          )

+     @property
+     def name(self) -> StreamLocatorName:
+         return StreamLocatorName(self)
+
+     @property
+     def parent(self) -> Optional[TableVersionLocator]:
+         return self.table_version_locator
+
      @property
      def table_version_locator(self) -> Optional[TableVersionLocator]:
          val: Dict[str, Any] = self.get("tableVersionLocator")
@@ -217,12 +313,12 @@ class StreamLocator(Locator, dict):
          self["streamId"] = stream_id

      @property
-     def storage_type(self) -> Optional[str]:
-         return self.get("storageType")
+     def format(self) -> Optional[str]:
+         return self.get("format")

-     @storage_type.setter
-     def storage_type(self, storage_type: Optional[str]) -> None:
-         self["storageType"] = storage_type
+     @format.setter
+     def format(self, stream_format: Optional[str]) -> None:
+         self["format"] = stream_format

      @property
      def namespace_locator(self) -> Optional[NamespaceLocator]:
@@ -259,13 +355,45 @@ class StreamLocator(Locator, dict):
              return table_version_locator.table_version
          return None

-     def canonical_string(self) -> str:
-         """
-         Returns a unique string for the given locator that can be used
-         for equality checks (i.e. two locators are equal if they have
-         the same canonical string).
-         """
-         tvl_hexdigest = self.table_version_locator.hexdigest()
-         stream_id = self.stream_id
-         storage_type = self.storage_type
-         return f"{tvl_hexdigest}|{stream_id}|{storage_type}"
+
+ class StreamLocatorAliasName(LocatorName):
+     def __init__(self, locator: StreamLocatorAlias):
+         self.locator = locator
+
+     @property
+     def immutable_id(self) -> Optional[str]:
+         return None
+
+     def parts(self) -> List[str]:
+         return [self.locator.format]
+
+
+ class StreamLocatorAlias(Locator, dict):
+     @staticmethod
+     def of(
+         parent_stream: Stream,
+     ) -> StreamLocatorAlias:
+         return (
+             StreamLocatorAlias(
+                 {
+                     "format": parent_stream.stream_format,
+                     "parent": (
+                         parent_stream.locator.parent if parent_stream.locator else None
+                     ),
+                 }
+             )
+             if parent_stream.state == CommitState.COMMITTED
+             else None  # only committed streams can be resolved by alias
+         )
+
+     @property
+     def format(self) -> Optional[str]:
+         return self.get("format")
+
+     @property
+     def name(self) -> StreamLocatorAliasName:
+         return StreamLocatorAliasName(self)
+
+     @property
+     def parent(self) -> Optional[Locator]:
+         return self.get("parent")
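
Taken together, the stream changes above replace partition_keys/partition_spec with a single partition_scheme, swap previous_stream_digest for previous_stream_id, and rename the locator's storage_type to a StreamFormat-backed format. A hedged sketch of constructing the new objects; the namespace/table names are placeholders, and StreamFormat.DELTACAT is assumed to be one of the enum's members:

from deltacat.storage.model.stream import Stream, StreamLocator
from deltacat.storage.model.types import CommitState, StreamFormat

# StreamLocator.at now takes a StreamFormat instead of the old storage_type
# string, and only builds a TableVersionLocator when a table_version is given.
locator = StreamLocator.at(
    namespace="my_namespace",                # placeholder names
    table_name="my_table",
    table_version="1",
    stream_id="stream-0001",
    stream_format=StreamFormat.DELTACAT,     # assumed enum member
)

# Stream.of replaces partition_keys/partition_spec with a single
# partition_scheme and previous_stream_digest with previous_stream_id.
stream = Stream.of(
    locator=locator,
    partition_scheme=None,                   # unpartitioned for this sketch
    state=CommitState.COMMITTED,
    previous_stream_id=None,
    watermark=None,
)

print(stream.stream_format)   # the locator's format string, e.g. "deltacat"
print(stream.locator_alias)   # only committed streams resolve to a locator alias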