deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,23 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Any, Dict, Optional
4
+ from typing import Any, Dict, Optional, List
5
5
 
6
- from deltacat.storage.model.locator import Locator
6
+ from deltacat.storage.model.metafile import Metafile
7
+ from deltacat.storage.model.locator import Locator, LocatorName
7
8
 
9
+ NamespaceProperties = dict[str, Any]
8
10
 
9
- class Namespace(dict):
11
+
12
+ class Namespace(Metafile):
10
13
  @staticmethod
11
14
  def of(
12
- locator: Optional[NamespaceLocator], permissions: Optional[Dict[str, Any]]
15
+ locator: Optional[NamespaceLocator],
16
+ properties: Optional[NamespaceProperties] = None,
13
17
  ) -> Namespace:
14
18
  namespace = Namespace()
15
19
  namespace.locator = locator
16
- namespace.permissions = permissions
20
+ namespace.properties = properties
17
21
  return namespace
18
22
 
19
23
  @property
@@ -35,12 +39,24 @@ class Namespace(dict):
35
39
  return None
36
40
 
37
41
  @property
38
- def permissions(self) -> Optional[Dict[str, Any]]:
39
- return self.get("permissions")
42
+ def properties(self) -> Optional[NamespaceProperties]:
43
+ return self.get("properties")
44
+
45
+ @properties.setter
46
+ def properties(self, properties: Optional[NamespaceProperties]) -> None:
47
+ self["properties"] = properties
48
+
49
+
50
+ class NamespaceLocatorName(LocatorName):
51
+ def __init__(self, locator: NamespaceLocator):
52
+ self.locator = locator
53
+
54
+ @property
55
+ def immutable_id(self) -> Optional[str]:
56
+ return None
40
57
 
41
- @permissions.setter
42
- def permissions(self, permissions: Optional[Dict[str, Any]]) -> None:
43
- self["permissions"] = permissions
58
+ def parts(self) -> List[str]:
59
+ return [self.locator.namespace]
44
60
 
45
61
 
46
62
  class NamespaceLocator(Locator, dict):
@@ -50,6 +66,14 @@ class NamespaceLocator(Locator, dict):
50
66
  namespace_locator.namespace = namespace
51
67
  return namespace_locator
52
68
 
69
+ @property
70
+ def name(self) -> NamespaceLocatorName:
71
+ return NamespaceLocatorName(self)
72
+
73
+ @property
74
+ def parent(self) -> Optional[Locator]:
75
+ return None
76
+
53
77
  @property
54
78
  def namespace(self) -> Optional[str]:
55
79
  return self.get("namespace")
@@ -57,11 +81,3 @@ class NamespaceLocator(Locator, dict):
57
81
  @namespace.setter
58
82
  def namespace(self, namespace: Optional[str]) -> None:
59
83
  self["namespace"] = namespace
60
-
61
- def canonical_string(self) -> str:
62
- """
63
- Returns a unique string for the given locator that can be used
64
- for equality checks (i.e. two locators are equal if they have
65
- the same canonical string).
66
- """
67
- return self.namespace
@@ -1,29 +1,58 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
- from typing import Any, Dict, List, Optional, Union
4
3
 
4
+ import base64
5
+ import posixpath
6
+
7
+ import pyarrow
5
8
  import pyarrow as pa
6
- from deltacat.storage.model.partition_spec import PartitionValues
7
- from deltacat.storage.model.locator import Locator
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
13
+ from deltacat.constants import METAFILE_FORMAT, METAFILE_FORMAT_JSON, TXN_DIR_NAME
14
+ from deltacat.storage.model.schema import (
15
+ FieldLocator,
16
+ Schema,
17
+ )
18
+ from deltacat.storage.model.locator import (
19
+ Locator,
20
+ LocatorName,
21
+ )
8
22
  from deltacat.storage.model.namespace import NamespaceLocator
9
23
  from deltacat.storage.model.stream import StreamLocator
10
- from deltacat.storage.model.table import TableLocator
24
+ from deltacat.storage.model.table import (
25
+ TableLocator,
26
+ Table,
27
+ )
11
28
  from deltacat.storage.model.table_version import TableVersionLocator
12
- from deltacat.storage.model.types import CommitState
29
+ from deltacat.storage.model.transform import Transform
30
+ from deltacat.storage.model.types import (
31
+ CommitState,
32
+ StreamFormat,
33
+ )
13
34
  from deltacat.types.media import ContentType
14
35
 
15
36
 
16
- class Partition(dict):
37
+ """
38
+ An ordered list of partition values. Partition values are typically derived
39
+ by applying one or more transforms to a table's fields.
40
+ """
41
+ PartitionValues = List[Any]
42
+ UNPARTITIONED_SCHEME_ID = "deadbeef-7277-49a4-a195-fdc8ed235d42"
43
+
44
+
45
+ class Partition(Metafile):
17
46
  @staticmethod
18
47
  def of(
19
48
  locator: Optional[PartitionLocator],
20
- schema: Optional[Union[pa.Schema, str, bytes]],
49
+ schema: Optional[Schema],
21
50
  content_types: Optional[List[ContentType]],
22
51
  state: Optional[CommitState] = None,
23
52
  previous_stream_position: Optional[int] = None,
24
53
  previous_partition_id: Optional[str] = None,
25
54
  stream_position: Optional[int] = None,
26
- next_partition_id: Optional[str] = None,
55
+ partition_scheme_id: Optional[str] = None,
27
56
  ) -> Partition:
28
57
  partition = Partition()
29
58
  partition.locator = locator
@@ -33,7 +62,9 @@ class Partition(dict):
33
62
  partition.previous_stream_position = previous_stream_position
34
63
  partition.previous_partition_id = previous_partition_id
35
64
  partition.stream_position = stream_position
36
- partition.next_partition_id = next_partition_id
65
+ partition.partition_scheme_id = (
66
+ partition_scheme_id if locator.partition_values else UNPARTITIONED_SCHEME_ID
67
+ )
37
68
  return partition
38
69
 
39
70
  @property
@@ -48,11 +79,18 @@ class Partition(dict):
48
79
  self["partitionLocator"] = partition_locator
49
80
 
50
81
  @property
51
- def schema(self) -> Optional[Union[pa.Schema, str, bytes]]:
52
- return self.get("schema")
82
+ def locator_alias(self) -> Optional[PartitionLocatorAlias]:
83
+ return PartitionLocatorAlias.of(self)
84
+
85
+ @property
86
+ def schema(self) -> Optional[Schema]:
87
+ val: Dict[str, Any] = self.get("schema")
88
+ if val is not None and not isinstance(val, Schema):
89
+ self.schema = val = Schema(val)
90
+ return val
53
91
 
54
92
  @schema.setter
55
- def schema(self, schema: Optional[Union[pa.Schema, str, bytes]]) -> None:
93
+ def schema(self, schema: Optional[Schema]) -> None:
56
94
  self["schema"] = schema
57
95
 
58
96
  @property
@@ -104,12 +142,12 @@ class Partition(dict):
104
142
  self["streamPosition"] = stream_position
105
143
 
106
144
  @property
107
- def next_partition_id(self) -> Optional[str]:
108
- return self.get("nextPartitionId")
145
+ def partition_scheme_id(self) -> Optional[str]:
146
+ return self.get("partitionSchemeId")
109
147
 
110
- @next_partition_id.setter
111
- def next_partition_id(self, next_partition_id: Optional[str]):
112
- self["nextPartitionId"] = next_partition_id
148
+ @partition_scheme_id.setter
149
+ def partition_scheme_id(self, partition_scheme_id: Optional[str]) -> None:
150
+ self["partitionSchemeId"] = partition_scheme_id
113
151
 
114
152
  @property
115
153
  def partition_id(self) -> Optional[str]:
@@ -125,6 +163,13 @@ class Partition(dict):
125
163
  return partition_locator.stream_id
126
164
  return None
127
165
 
166
+ @property
167
+ def stream_format(self) -> Optional[str]:
168
+ partition_locator = self.locator
169
+ if partition_locator:
170
+ return partition_locator.stream_format
171
+ return None
172
+
128
173
  @property
129
174
  def partition_values(self) -> Optional[PartitionValues]:
130
175
  partition_locator = self.locator
@@ -163,7 +208,7 @@ class Partition(dict):
163
208
  def storage_type(self) -> Optional[str]:
164
209
  partition_locator = self.locator
165
210
  if partition_locator:
166
- return partition_locator.storage_type
211
+ return partition_locator.stream_format
167
212
  return None
168
213
 
169
214
  @property
@@ -193,6 +238,84 @@ class Partition(dict):
193
238
  content_type in supported_content_types
194
239
  )
195
240
 
241
+ def to_serializable(self) -> Partition:
242
+ serializable: Partition = Partition.update_for(self)
243
+ if serializable.schema:
244
+ schema_bytes = serializable.schema.serialize().to_pybytes()
245
+ serializable.schema = (
246
+ base64.b64encode(schema_bytes).decode("utf-8")
247
+ if METAFILE_FORMAT == METAFILE_FORMAT_JSON
248
+ else schema_bytes
249
+ )
250
+
251
+ if serializable.table_locator:
252
+ # replace the mutable table locator
253
+ serializable.table_version_locator.table_locator = TableLocator.at(
254
+ namespace=self.id,
255
+ table_name=self.id,
256
+ )
257
+ return serializable
258
+
259
+ def from_serializable(
260
+ self,
261
+ path: str,
262
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
263
+ ) -> Partition:
264
+ if self.get("schema"):
265
+ schema_data = self["schema"]
266
+ schema_bytes = (
267
+ base64.b64decode(schema_data)
268
+ if METAFILE_FORMAT == METAFILE_FORMAT_JSON
269
+ else schema_data
270
+ )
271
+ self["schema"] = Schema.deserialize(pa.py_buffer(schema_bytes))
272
+ else:
273
+ self["schema"] = None
274
+
275
+ # restore the table locator from its mapped immutable metafile ID
276
+ if self.table_locator and self.table_locator.table_name == self.id:
277
+ parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
278
+ base_metafile_path=path,
279
+ parent_number=3,
280
+ )
281
+ txn_log_dir = posixpath.join(
282
+ posixpath.dirname(
283
+ posixpath.dirname(
284
+ posixpath.dirname(parent_rev_dir_path),
285
+ )
286
+ ),
287
+ TXN_DIR_NAME,
288
+ )
289
+ table = Table.read(
290
+ MetafileRevisionInfo.latest_revision(
291
+ revision_dir_path=parent_rev_dir_path,
292
+ filesystem=filesystem,
293
+ success_txn_log_dir=txn_log_dir,
294
+ ).path,
295
+ filesystem,
296
+ )
297
+ self.table_version_locator.table_locator = table.locator
298
+ return self
299
+
300
+
301
class PartitionLocatorName(LocatorName):
    """Locator name view over a :class:`PartitionLocator`.

    The name is made of the stringified partition values followed by the
    partition ID; the partition ID doubles as the immutable ID.
    """

    def __init__(self, locator: PartitionLocator):
        # Keep a reference (not a copy) so that writes through
        # ``immutable_id`` land on the wrapped locator.
        self.locator = locator

    @property
    def immutable_id(self) -> Optional[str]:
        """Return the wrapped locator's partition ID."""
        wrapped = self.locator
        return wrapped.partition_id

    @immutable_id.setter
    def immutable_id(self, immutable_id: Optional[str]):
        """Store ``immutable_id`` as the wrapped locator's partition ID."""
        wrapped = self.locator
        wrapped.partition_id = immutable_id

    def parts(self) -> List[str]:
        """Return ``[str(partition_values), partition_id]`` for the locator."""
        wrapped = self.locator
        values_part = str(wrapped.partition_values)
        return [values_part, wrapped.partition_id]
318
+
196
319
 
197
320
  class PartitionLocator(Locator, dict):
198
321
  @staticmethod
@@ -223,16 +346,20 @@ class PartitionLocator(Locator, dict):
223
346
  table_name: Optional[str],
224
347
  table_version: Optional[str],
225
348
  stream_id: Optional[str],
226
- storage_type: Optional[str],
349
+ stream_format: Optional[StreamFormat],
227
350
  partition_values: Optional[PartitionValues],
228
351
  partition_id: Optional[str],
229
352
  ) -> PartitionLocator:
230
- stream_locator = StreamLocator.at(
231
- namespace,
232
- table_name,
233
- table_version,
234
- stream_id,
235
- storage_type,
353
+ stream_locator = (
354
+ StreamLocator.at(
355
+ namespace,
356
+ table_name,
357
+ table_version,
358
+ stream_id,
359
+ stream_format,
360
+ )
361
+ if stream_id and stream_format
362
+ else None
236
363
  )
237
364
  return PartitionLocator.of(
238
365
  stream_locator,
@@ -240,6 +367,14 @@ class PartitionLocator(Locator, dict):
240
367
  partition_id,
241
368
  )
242
369
 
370
+ @property
371
+ def name(self) -> PartitionLocatorName:
372
+ return PartitionLocatorName(self)
373
+
374
+ @property
375
+ def parent(self) -> Optional[StreamLocator]:
376
+ return self.stream_locator
377
+
243
378
  @property
244
379
  def stream_locator(self) -> Optional[StreamLocator]:
245
380
  val: Dict[str, Any] = self.get("streamLocator")
@@ -296,10 +431,10 @@ class PartitionLocator(Locator, dict):
296
431
  return None
297
432
 
298
433
  @property
299
- def storage_type(self) -> Optional[str]:
434
+ def stream_format(self) -> Optional[str]:
300
435
  stream_locator = self.stream_locator
301
436
  if stream_locator:
302
- return stream_locator.storage_type
437
+ return stream_locator.format
303
438
  return None
304
439
 
305
440
  @property
@@ -323,13 +458,203 @@ class PartitionLocator(Locator, dict):
323
458
  return stream_locator.table_version
324
459
  return None
325
460
 
326
- def canonical_string(self) -> str:
327
- """
328
- Returns a unique string for the given locator that can be used
329
- for equality checks (i.e. two locators are equal if they have
330
- the same canonical string).
331
- """
332
- sl_hexdigest = self.stream_locator.hexdigest()
333
- partition_vals = str(self.partition_values)
334
- partition_id = self.partition_id
335
- return f"{sl_hexdigest}|{partition_vals}|{partition_id}"
461
+
462
class PartitionKey(dict):
    """Dict-backed description of one partition key.

    Stored entries: ``key`` (list of field locators), ``name``, ``fieldId``,
    ``transform`` and ``nativeObject``.
    """

    @staticmethod
    def of(
        key: List[FieldLocator],
        name: Optional[str] = None,
        field_id: Optional[int] = None,
        transform: Optional[Transform] = None,
        native_object: Optional[Any] = None,
    ) -> PartitionKey:
        """Build a partition key from its components."""
        return PartitionKey(
            {
                "key": key,
                "name": name,
                "fieldId": field_id,
                "transform": transform,
                "nativeObject": native_object,
            }
        )

    def equivalent_to(
        self,
        other: PartitionKey,
        check_identifiers: bool = False,
    ) -> bool:
        """Return True if ``other`` partitions by the same fields and transform.

        Args:
            other: Partition key (or plain dict holding one) to compare with.
            check_identifiers: When True, ``name`` and ``id`` must match too.

        Returns:
            False for ``None`` or non-dict input, otherwise the result of the
            comparison.
        """
        if other is None or not isinstance(other, dict):
            return False
        if not isinstance(other, PartitionKey):
            other = PartitionKey(other)
        # The fields and the transform must always match. Identifiers only
        # tighten the comparison; they never make up for a key/transform
        # mismatch.
        if self.key != other.key or self.transform != other.transform:
            return False
        if not check_identifiers:
            return True
        return self.name == other.name and self.id == other.id

    @property
    def key(self) -> List[FieldLocator]:
        return self.get("key")

    @property
    def name(self) -> Optional[str]:
        return self.get("name")

    @property
    def id(self) -> Optional[int]:
        return self.get("fieldId")

    @property
    def transform(self) -> Optional[Transform]:
        val: Dict[str, Any] = self.get("transform")
        if val is not None and not isinstance(val, Transform):
            # Lazily wrap a raw dict and cache the typed object.
            self["transform"] = val = Transform(val)
        return val

    @property
    def native_object(self) -> Optional[Any]:
        return self.get("nativeObject")


class PartitionKeyList(List[PartitionKey]):
    """List of partition keys that wraps raw dict items as PartitionKey."""

    @staticmethod
    def of(items: List[PartitionKey]) -> PartitionKeyList:
        """Build a typed list, wrapping every non-PartitionKey item."""
        typed_items = PartitionKeyList()
        for item in items:
            if item is not None and not isinstance(item, PartitionKey):
                item = PartitionKey(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        val = super().__getitem__(item)
        if isinstance(item, slice):
            # A slice yields a plain list; wrap its items instead of trying
            # to turn the whole list into a single PartitionKey.
            return PartitionKeyList.of(val)
        if val is not None and not isinstance(val, PartitionKey):
            # Lazily wrap a raw dict and store the typed object back.
            self[item] = val = PartitionKey(val)
        return val


class PartitionScheme(dict):
    """Dict-backed partition scheme.

    Stored entries: ``keys`` (partition key list), ``name``, ``id`` and
    ``nativeObject``.
    """

    @staticmethod
    def of(
        keys: Optional[PartitionKeyList],
        name: Optional[str] = None,
        scheme_id: Optional[str] = None,
        native_object: Optional[Any] = None,
    ) -> PartitionScheme:
        """Build a partition scheme from its components."""
        return PartitionScheme(
            {
                "keys": keys,
                "name": name,
                "id": scheme_id,
                "nativeObject": native_object,
            }
        )

    def equivalent_to(
        self,
        other: PartitionScheme,
        check_identifiers: bool = False,
    ) -> bool:
        """Return True if ``other`` has pairwise-equivalent partition keys.

        Args:
            other: Partition scheme (or plain dict holding one) to compare.
            check_identifiers: When True, key and scheme ``name``/``id`` must
                match as well.

        Returns:
            False for ``None`` or non-dict input, for schemes with a
            different number of keys, or when any key pair differs.
        """
        if other is None or not isinstance(other, dict):
            return False
        if not isinstance(other, PartitionScheme):
            other = PartitionScheme(other)
        # A missing key list is compared as an empty one.
        own_keys = self.keys or []
        other_keys = other.keys or []
        if len(own_keys) != len(other_keys):
            return False
        # Index access (not iteration) keeps PartitionKeyList's lazy wrapping.
        for i in range(len(own_keys)):
            own_key, other_key = own_keys[i], other_keys[i]
            if own_key is None:
                if other_key is not None:
                    return False
                continue
            if not own_key.equivalent_to(other_key, check_identifiers):
                return False
        return not check_identifiers or (
            self.name == other.name and self.id == other.id
        )

    @property
    def keys(self) -> Optional[PartitionKeyList]:
        val: List[PartitionKey] = self.get("keys")
        if val is not None and not isinstance(val, PartitionKeyList):
            # Lazily wrap a raw list and cache the typed list.
            self["keys"] = val = PartitionKeyList.of(val)
        return val

    @property
    def name(self) -> Optional[str]:
        return self.get("name")

    @property
    def id(self) -> Optional[str]:
        return self.get("id")

    @property
    def native_object(self) -> Optional[Any]:
        return self.get("nativeObject")


class PartitionSchemeList(List[PartitionScheme]):
    """List of partition schemes that wraps raw dict items as PartitionScheme."""

    @staticmethod
    def of(items: List[PartitionScheme]) -> PartitionSchemeList:
        """Build a typed list, wrapping every non-PartitionScheme item."""
        typed_items = PartitionSchemeList()
        for item in items:
            if item is not None and not isinstance(item, PartitionScheme):
                item = PartitionScheme(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        val = super().__getitem__(item)
        if isinstance(item, slice):
            # A slice yields a plain list; wrap its items instead of trying
            # to turn the whole list into a single PartitionScheme.
            return PartitionSchemeList.of(val)
        if val is not None and not isinstance(val, PartitionScheme):
            # Lazily wrap a raw dict and store the typed object back.
            self[item] = val = PartitionScheme(val)
        return val
610
+
611
+
612
class PartitionLocatorAliasName(LocatorName):
    """Locator name view over a :class:`PartitionLocatorAlias`.

    The name is made of the stringified partition values followed by the
    partition scheme ID; an alias name never has an immutable ID.
    """

    def __init__(self, locator: PartitionLocatorAlias):
        # Keep a reference to the alias whose name this object represents.
        self.locator = locator

    @property
    def immutable_id(self) -> Optional[str]:
        """Always ``None`` for an alias name."""
        return None

    def parts(self) -> List[str]:
        """Return ``[str(partition_values), partition_scheme_id]``."""
        alias = self.locator
        values_part = str(alias.partition_values)
        return [values_part, alias.partition_scheme_id]
625
+
626
+
627
class PartitionLocatorAlias(Locator, dict):
    """Dict-backed alias locator for a partition.

    Holds the partition's values, its partition scheme ID and the parent of
    the partition's own locator.
    """

    @staticmethod
    def of(parent_partition: Partition):
        """Build the alias for ``parent_partition``.

        Returns ``None`` unless the partition's state is
        ``CommitState.COMMITTED``, since only committed partitions can be
        resolved by alias.
        """
        if parent_partition.state == CommitState.COMMITTED:
            values = parent_partition.partition_values
            scheme_id = parent_partition.partition_scheme_id
            # The alias has the same parent as the partition's locator, when
            # the partition has a locator at all.
            if parent_partition.locator:
                parent = parent_partition.locator.parent
            else:
                parent = None
            return PartitionLocatorAlias(
                {
                    "partition_values": values,
                    "partition_scheme_id": scheme_id,
                    "parent": parent,
                }
            )
        return None

    @property
    def partition_values(self) -> Optional[PartitionValues]:
        """Partition values stored under ``"partition_values"``."""
        return self.get("partition_values")

    @property
    def partition_scheme_id(self) -> Optional[str]:
        """Partition scheme ID stored under ``"partition_scheme_id"``."""
        return self.get("partition_scheme_id")

    @property
    def name(self) -> PartitionLocatorAliasName:
        """A fresh name view wrapping this alias."""
        return PartitionLocatorAliasName(self)

    @property
    def parent(self) -> Optional[Locator]:
        """Parent locator stored under ``"parent"``."""
        return self.get("parent")
File without changes
@@ -0,0 +1,19 @@
1
from dataclasses import dataclass
from typing import Optional


class RowFilter:
    """Placeholder type for a row-level filter; it defines no members yet."""


class ColumnFilter:
    """Placeholder type for a column filter; it defines no members yet."""


class PartitionFilter:
    """Placeholder type for a partition filter; it defines no members yet."""


@dataclass
class Pushdown:
    """Represents pushdown predicates to be applied for DeltaCAT Tables.

    Every field is optional and defaults to ``None`` so that ``Pushdown()``
    keeps working; previously the fields were annotation-only and could not
    be populated through the constructor.
    """

    # Filter on rows, if any.
    row_filter: Optional[RowFilter] = None
    # Filter on columns, if any.
    column_filter: Optional[ColumnFilter] = None
    # Filter on partitions, if any.
    partition_filter: Optional[PartitionFilter] = None
    # Row limit, if any.
    limit: Optional[int] = None
@@ -0,0 +1,10 @@
1
+ from dataclasses import dataclass
2
+
3
+ from deltacat.storage.model.scan.scan_task import ScanTask
4
+
5
+
6
@dataclass
class ScanPlan:
    """Collection of ScanTasks handed to a compute engine for query planning."""

    # The scan tasks that make up this plan.
    scan_tasks: list[ScanTask]
@@ -0,0 +1,34 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+
4
+
5
@dataclass
class DataFile:
    """A single data file, such as an S3 object or a file on local disk."""

    # Path of the file.
    file_path: str
10
+
11
+
12
class ScanTask(ABC):
    """Abstract unit of data that a compute worker reads."""

    @abstractmethod
    def data_files(self) -> list[DataFile]:
        """Return the data files of this task; subclasses must override."""
18
+
19
+
20
@dataclass
class FileScanTask(ScanTask):
    """Scan task whose unit of data is an explicit list of data files."""

    # The files this task covers.
    data_file_list: list[DataFile]

    def data_files(self) -> list[DataFile]:
        """Return the stored ``data_file_list`` as-is (no copy)."""
        files = self.data_file_list
        return files
28
+
29
+
30
class ShardedScanTask(ScanTask):
    """Scan task whose unit of data is a set of shards.

    For example, shards 1-10 could each represent 1/10 of all data in a Table.
    """

    def data_files(self) -> list[DataFile]:
        """Not supported for sharded tasks; always raises NotImplementedError."""
        message = "data_files is not implemented for ShardedScanTask"
        raise NotImplementedError(message)