deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235):
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,45 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Any, Dict, Optional
4
+ import posixpath
5
+ from typing import Any, Dict, Optional, List
5
6
 
6
- from deltacat.storage.model.locator import Locator
7
- from deltacat.storage.model.namespace import NamespaceLocator
7
+ import pyarrow
8
8
 
9
+ from deltacat.storage.model.locator import Locator, LocatorName
10
+ from deltacat.storage.model.namespace import (
11
+ NamespaceLocator,
12
+ Namespace,
13
+ )
14
+ from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
15
+ from deltacat.constants import TXN_DIR_NAME
16
+
17
+ TableProperties = dict[str, Any]
18
+
19
+
20
+ class Table(Metafile):
21
+ """
22
+ Tables store properties common to every table version including the
23
+ table's name, a high-level description of all table versions, and
24
+ properties shared by all table versions.
25
+ """
9
26
 
10
- class Table(dict):
11
27
  @staticmethod
12
28
  def of(
13
29
  locator: Optional[TableLocator],
14
- permissions: Optional[Dict[str, Any]] = None,
15
30
  description: Optional[str] = None,
16
- properties: Optional[Dict[str, str]] = None,
31
+ properties: Optional[TableProperties] = None,
32
+ latest_active_table_version: Optional[str] = None,
33
+ latest_table_version: Optional[str] = None,
34
+ native_object: Optional[Any] = None,
17
35
  ) -> Table:
18
36
  table = Table()
19
37
  table.locator = locator
20
- table.permissions = permissions
21
38
  table.description = description
22
39
  table.properties = properties
40
+ table.latest_active_table_version = latest_active_table_version
41
+ table.latest_table_version = latest_table_version
42
+ table.native_object = native_object
23
43
  return table
24
44
 
25
45
  @property
@@ -33,14 +53,6 @@ class Table(dict):
33
53
  def locator(self, table_locator: Optional[TableLocator]) -> None:
34
54
  self["tableLocator"] = table_locator
35
55
 
36
- @property
37
- def permissions(self) -> Optional[Dict[str, Any]]:
38
- return self.get("permissions")
39
-
40
- @permissions.setter
41
- def permissions(self, permissions: Optional[Dict[str, Any]]) -> None:
42
- self["permissions"] = permissions
43
-
44
56
  @property
45
57
  def description(self) -> Optional[str]:
46
58
  return self.get("description")
@@ -50,13 +62,43 @@ class Table(dict):
50
62
  self["description"] = description
51
63
 
52
64
  @property
53
- def properties(self) -> Optional[Dict[str, str]]:
65
+ def properties(self) -> Optional[TableProperties]:
54
66
  return self.get("properties")
55
67
 
56
68
  @properties.setter
57
- def properties(self, properties: Optional[Dict[str, str]]) -> None:
69
+ def properties(self, properties: Optional[TableProperties]) -> None:
58
70
  self["properties"] = properties
59
71
 
72
+ @property
73
+ def latest_active_table_version(self) -> Optional[str]:
74
+ return self.get("latest_active_table_version")
75
+
76
+ @latest_active_table_version.setter
77
+ def latest_active_table_version(
78
+ self,
79
+ latest_active_table_version: Optional[str],
80
+ ) -> None:
81
+ self["latest_active_table_version"] = latest_active_table_version
82
+
83
+ @property
84
+ def latest_table_version(self) -> Optional[str]:
85
+ return self.get("latest_table_version")
86
+
87
+ @latest_table_version.setter
88
+ def latest_table_version(
89
+ self,
90
+ latest_table_version: Optional[str],
91
+ ) -> None:
92
+ self["latest_table_version"] = latest_table_version
93
+
94
+ @property
95
+ def native_object(self) -> Optional[Any]:
96
+ return self.get("nativeObject")
97
+
98
+ @native_object.setter
99
+ def native_object(self, native_object: Optional[Any]) -> None:
100
+ self["nativeObject"] = native_object
101
+
60
102
  @property
61
103
  def namespace_locator(self) -> Optional[NamespaceLocator]:
62
104
  table_locator = self.locator
@@ -78,6 +120,60 @@ class Table(dict):
78
120
  return table_locator.table_name
79
121
  return None
80
122
 
123
+ @table_name.setter
124
+ def table_name(self, table_name: Optional[str]) -> None:
125
+ table_locator = self.locator
126
+ if table_locator:
127
+ table_locator.table_name = table_name
128
+
129
+ def to_serializable(self) -> Table:
130
+ serializable = self
131
+ if serializable.namespace_locator:
132
+ serializable: Table = Table.update_for(self)
133
+ # remove the mutable namespace locator
134
+ serializable.locator.namespace_locator = NamespaceLocator.of(self.id)
135
+ return serializable
136
+
137
+ def from_serializable(
138
+ self,
139
+ path: str,
140
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
141
+ ) -> Table:
142
+ # restore the namespace locator from its mapped immutable metafile ID
143
+ if self.namespace_locator and self.namespace_locator.namespace == self.id:
144
+ parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
145
+ base_metafile_path=path,
146
+ parent_number=1,
147
+ )
148
+ txn_log_dir = posixpath.join(
149
+ posixpath.dirname(
150
+ posixpath.dirname(parent_rev_dir_path),
151
+ ),
152
+ TXN_DIR_NAME,
153
+ )
154
+ namespace = Namespace.read(
155
+ MetafileRevisionInfo.latest_revision(
156
+ revision_dir_path=parent_rev_dir_path,
157
+ filesystem=filesystem,
158
+ success_txn_log_dir=txn_log_dir,
159
+ ).path,
160
+ filesystem,
161
+ )
162
+ self.locator.namespace_locator = namespace.locator
163
+ return self
164
+
165
+
166
+ class TableLocatorName(LocatorName):
167
+ def __init__(self, locator: TableLocator):
168
+ self.locator = locator
169
+
170
+ @property
171
+ def immutable_id(self) -> Optional[str]:
172
+ return None
173
+
174
+ def parts(self) -> List[str]:
175
+ return [self.locator.table_name]
176
+
81
177
 
82
178
  class TableLocator(Locator, dict):
83
179
  @staticmethod
@@ -91,11 +187,19 @@ class TableLocator(Locator, dict):
91
187
 
92
188
  @staticmethod
93
189
  def at(namespace: Optional[str], table_name: Optional[str]) -> TableLocator:
94
- namespace_locator = NamespaceLocator.of(namespace)
190
+ namespace_locator = NamespaceLocator.of(namespace) if namespace else None
95
191
  return TableLocator.of(namespace_locator, table_name)
96
192
 
97
193
  @property
98
- def namespace_locator(self) -> NamespaceLocator:
194
+ def name(self) -> TableLocatorName:
195
+ return TableLocatorName(self)
196
+
197
+ @property
198
+ def parent(self) -> Optional[NamespaceLocator]:
199
+ return self.namespace_locator
200
+
201
+ @property
202
+ def namespace_locator(self) -> Optional[NamespaceLocator]:
99
203
  val: Dict[str, Any] = self.get("namespaceLocator")
100
204
  if val is not None and not isinstance(val, NamespaceLocator):
101
205
  self.namespace_locator = val = NamespaceLocator(val)
@@ -119,13 +223,3 @@ class TableLocator(Locator, dict):
119
223
  if namespace_locator:
120
224
  return namespace_locator.namespace
121
225
  return None
122
-
123
- def canonical_string(self) -> str:
124
- """
125
- Returns a unique string for the given locator that can be used
126
- for equality checks (i.e. two locators are equal if they have
127
- the same canonical string).
128
- """
129
- nl_hexdigest = self.namespace_locator.hexdigest()
130
- table_name = self.table_name
131
- return f"{nl_hexdigest}|{table_name}"
@@ -1,38 +1,76 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Any, Dict, List, Optional, Union
4
+ import base64
5
+ import re
6
+ import posixpath
7
+ from typing import Any, Dict, List, Optional, Tuple
5
8
 
9
+ import pyarrow
6
10
  import pyarrow as pa
7
11
 
8
- from deltacat.storage.model.locator import Locator
12
+ import deltacat.storage.model.partition as partition
13
+
14
+ from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
15
+ from deltacat.constants import (
16
+ METAFILE_FORMAT,
17
+ METAFILE_FORMAT_JSON,
18
+ TXN_DIR_NAME,
19
+ BYTES_PER_KIBIBYTE,
20
+ )
21
+ from deltacat.storage.model.schema import (
22
+ Schema,
23
+ SchemaList,
24
+ )
25
+ from deltacat.storage.model.locator import (
26
+ Locator,
27
+ LocatorName,
28
+ )
9
29
  from deltacat.storage.model.namespace import NamespaceLocator
10
- from deltacat.storage.model.table import TableLocator
30
+ from deltacat.storage.model.table import (
31
+ TableLocator,
32
+ Table,
33
+ )
11
34
  from deltacat.types.media import ContentType
12
- from deltacat.storage.model.sort_key import SortKey
35
+ from deltacat.storage.model.sort_key import SortScheme, SortSchemeList
36
+ from deltacat.storage.model.types import LifecycleState
13
37
 
38
+ TableVersionProperties = Dict[str, Any]
14
39
 
15
- class TableVersion(dict):
40
+
41
+ class TableVersion(Metafile):
16
42
  @staticmethod
17
43
  def of(
18
44
  locator: Optional[TableVersionLocator],
19
- schema: Optional[Union[pa.Schema, str, bytes]],
20
- partition_keys: Optional[List[Dict[str, Any]]] = None,
21
- primary_key_columns: Optional[List[str]] = None,
45
+ schema: Optional[Schema],
46
+ partition_scheme: Optional[partition.PartitionScheme] = None,
22
47
  description: Optional[str] = None,
23
- properties: Optional[Dict[str, str]] = None,
48
+ properties: Optional[TableVersionProperties] = None,
24
49
  content_types: Optional[List[ContentType]] = None,
25
- sort_keys: Optional[List[SortKey]] = None,
50
+ sort_scheme: Optional[SortScheme] = None,
51
+ watermark: Optional[int] = None,
52
+ lifecycle_state: Optional[LifecycleState] = None,
53
+ schemas: Optional[SchemaList] = None,
54
+ partition_schemes: Optional[partition.PartitionSchemeList] = None,
55
+ sort_schemes: Optional[SortSchemeList] = None,
56
+ previous_table_version: Optional[str] = None,
57
+ native_object: Optional[Any] = None,
26
58
  ) -> TableVersion:
27
59
  table_version = TableVersion()
28
60
  table_version.locator = locator
29
61
  table_version.schema = schema
30
- table_version.partition_keys = partition_keys
31
- table_version.primary_keys = primary_key_columns
62
+ table_version.partition_scheme = partition_scheme
32
63
  table_version.description = description
33
64
  table_version.properties = properties
34
65
  table_version.content_types = content_types
35
- table_version.sort_keys = sort_keys
66
+ table_version.sort_scheme = sort_scheme
67
+ table_version.watermark = watermark
68
+ table_version.state = lifecycle_state
69
+ table_version.schemas = schemas
70
+ table_version.partition_schemes = partition_schemes
71
+ table_version.sort_schemes = sort_schemes
72
+ table_version.previous_table_version = previous_table_version
73
+ table_version.native_object = native_object
36
74
  return table_version
37
75
 
38
76
  @property
@@ -47,36 +85,91 @@ class TableVersion(dict):
47
85
  self["tableVersionLocator"] = table_version_locator
48
86
 
49
87
  @property
50
- def schema(self) -> Optional[Union[pa.Schema, str, bytes]]:
51
- return self.get("schema")
88
+ def schema(self) -> Optional[Schema]:
89
+ val: Dict[str, Any] = self.get("schema")
90
+ if val is not None and not isinstance(val, Schema):
91
+ self.schema = val = Schema(val)
92
+ return val
52
93
 
53
94
  @schema.setter
54
- def schema(self, schema: Optional[Union[pa.Schema, str, bytes]]) -> None:
95
+ def schema(self, schema: Optional[Schema]) -> None:
55
96
  self["schema"] = schema
56
97
 
57
98
  @property
58
- def sort_keys(self) -> Optional[List[SortKey]]:
59
- return self.get("sortKeys")
99
+ def schemas(self) -> Optional[SchemaList]:
100
+ val: Optional[SchemaList] = self.get("schemas")
101
+ if val is not None and not isinstance(val, SchemaList):
102
+ self["schemas"] = val = SchemaList.of(val)
103
+ return val
104
+
105
+ @schemas.setter
106
+ def schemas(self, schemas: Optional[SchemaList]) -> None:
107
+ self["schemas"] = schemas
108
+
109
+ @property
110
+ def sort_scheme(self) -> Optional[SortScheme]:
111
+ val: Dict[str, Any] = self.get("sortScheme")
112
+ if val is not None and not isinstance(val, SortScheme):
113
+ self["sortScheme"] = val = SortScheme(val)
114
+ return val
60
115
 
61
- @sort_keys.setter
62
- def sort_keys(self, sort_keys: Optional[List[SortKey]]) -> None:
63
- self["sortKeys"] = sort_keys
116
+ @sort_scheme.setter
117
+ def sort_scheme(self, sort_scheme: Optional[SortScheme]) -> None:
118
+ self["sortScheme"] = sort_scheme
64
119
 
65
120
  @property
66
- def partition_keys(self) -> Optional[List[Dict[str, Any]]]:
67
- return self.get("partitionKeys")
121
+ def sort_schemes(self) -> Optional[SortSchemeList]:
122
+ val: Dict[str, Any] = self.get("sortSchemes")
123
+ if val is not None and not isinstance(val, SortSchemeList):
124
+ self["sortSchemes"] = val = SortSchemeList.of(val)
125
+ return val
68
126
 
69
- @partition_keys.setter
70
- def partition_keys(self, partition_keys: Optional[List[Dict[str, Any]]]) -> None:
71
- self["partitionKeys"] = partition_keys
127
+ @sort_schemes.setter
128
+ def sort_schemes(self, sort_schemes: Optional[SortSchemeList]) -> None:
129
+ self["sortSchemes"] = sort_schemes
72
130
 
73
131
  @property
74
- def primary_keys(self) -> Optional[List[str]]:
75
- return self.get("primaryKeys")
132
+ def watermark(self) -> Optional[int]:
133
+ return self.get("watermark")
134
+
135
+ @watermark.setter
136
+ def watermark(self, watermark: Optional[int]) -> None:
137
+ self["watermark"] = watermark
138
+
139
+ @property
140
+ def state(self) -> Optional[LifecycleState]:
141
+ state = self.get("state")
142
+ return None if state is None else LifecycleState(state)
143
+
144
+ @state.setter
145
+ def state(self, state: Optional[LifecycleState]) -> None:
146
+ self["state"] = state
147
+
148
+ @property
149
+ def partition_scheme(self) -> Optional[partition.PartitionScheme]:
150
+ val: Dict[str, Any] = self.get("partitionScheme")
151
+ if val is not None and not isinstance(val, partition.PartitionScheme):
152
+ self["partitionScheme"] = val = partition.PartitionScheme(val)
153
+ return val
76
154
 
77
- @primary_keys.setter
78
- def primary_keys(self, primary_keys: Optional[List[str]]) -> None:
79
- self["primaryKeys"] = primary_keys
155
+ @partition_scheme.setter
156
+ def partition_scheme(
157
+ self, partition_scheme: Optional[partition.PartitionScheme]
158
+ ) -> None:
159
+ self["partitionScheme"] = partition_scheme
160
+
161
+ @property
162
+ def partition_schemes(self) -> Optional[partition.PartitionSchemeList]:
163
+ val: Dict[str, Any] = self.get("partitionSchemes")
164
+ if val is not None and not isinstance(val, partition.PartitionSchemeList):
165
+ self["partitionSchemes"] = val = partition.PartitionSchemeList.of(val)
166
+ return val
167
+
168
+ @partition_schemes.setter
169
+ def partition_schemes(
170
+ self, partition_schemes: Optional[partition.PartitionSchemeList]
171
+ ) -> None:
172
+ self["partitionSchemes"] = partition_schemes
80
173
 
81
174
  @property
82
175
  def description(self) -> Optional[str]:
@@ -87,11 +180,19 @@ class TableVersion(dict):
87
180
  self["description"] = description
88
181
 
89
182
  @property
90
- def properties(self) -> Optional[Dict[str, str]]:
183
+ def previous_table_version(self) -> Optional[str]:
184
+ return self.get("previous_table_version")
185
+
186
+ @previous_table_version.setter
187
+ def previous_table_version(self, previous_table_version: Optional[str]) -> None:
188
+ self["previous_table_version"] = previous_table_version
189
+
190
+ @property
191
+ def properties(self) -> Optional[TableVersionProperties]:
91
192
  return self.get("properties")
92
193
 
93
194
  @properties.setter
94
- def properties(self, properties: Optional[Dict[str, str]]) -> None:
195
+ def properties(self, properties: Optional[TableVersionProperties]) -> None:
95
196
  self["properties"] = properties
96
197
 
97
198
  @property
@@ -107,6 +208,14 @@ class TableVersion(dict):
107
208
  def content_types(self, content_types: Optional[List[ContentType]]) -> None:
108
209
  self["contentTypes"] = content_types
109
210
 
211
+ @property
212
+ def native_object(self) -> Optional[Any]:
213
+ return self.get("nativeObject")
214
+
215
+ @native_object.setter
216
+ def native_object(self, native_object: Optional[Any]) -> None:
217
+ self["nativeObject"] = native_object
218
+
110
219
  @property
111
220
  def namespace_locator(self) -> Optional[NamespaceLocator]:
112
221
  table_version_locator = self.locator
@@ -148,11 +257,174 @@ class TableVersion(dict):
148
257
  content_type in supported_content_types
149
258
  )
150
259
 
260
+ def to_serializable(self) -> TableVersion:
261
+ serializable: TableVersion = TableVersion.update_for(self)
262
+ if serializable.schema:
263
+ schema_bytes = serializable.schema.serialize().to_pybytes()
264
+ serializable.schema = (
265
+ base64.b64encode(schema_bytes).decode("utf-8")
266
+ if METAFILE_FORMAT == METAFILE_FORMAT_JSON
267
+ else schema_bytes
268
+ )
269
+
270
+ if serializable.schemas:
271
+ serializable.schemas = [
272
+ base64.b64encode(schema.serialize().to_pybytes()).decode("utf-8")
273
+ if METAFILE_FORMAT == METAFILE_FORMAT_JSON
274
+ else schema.serialize().to_pybytes()
275
+ for schema in serializable.schemas
276
+ ]
277
+ if serializable.table_locator:
278
+ # remove the mutable table locator
279
+ serializable.locator.table_locator = TableLocator.at(
280
+ namespace=self.id,
281
+ table_name=self.id,
282
+ )
283
+ return serializable
284
+
285
+ def from_serializable(
286
+ self,
287
+ path: str,
288
+ filesystem: Optional[pyarrow.fs.FileSystem] = None,
289
+ ) -> TableVersion:
290
+ if self.get("schema"):
291
+ schema_data = self["schema"]
292
+ schema_bytes = (
293
+ base64.b64decode(schema_data)
294
+ if METAFILE_FORMAT == "json"
295
+ else schema_data
296
+ )
297
+ self["schema"] = Schema.deserialize(pa.py_buffer(schema_bytes))
298
+ else:
299
+ self["schema"] = None
300
+
301
+ if self.get("schemas"):
302
+ self.schemas = [
303
+ Schema.deserialize(
304
+ pa.py_buffer(
305
+ base64.b64decode(schema)
306
+ if METAFILE_FORMAT == METAFILE_FORMAT_JSON
307
+ else schema
308
+ )
309
+ )
310
+ for schema in self["schemas"]
311
+ ]
312
+ else:
313
+ self.schemas = None
314
+
315
+ if self.sort_scheme:
316
+ # force list-to-tuple conversion of sort keys via property invocation
317
+ self.sort_scheme.keys
318
+ [sort_scheme.keys for sort_scheme in self.sort_schemes]
319
+ # restore the table locator from its mapped immutable metafile ID
320
+ if self.table_locator and self.table_locator.table_name == self.id:
321
+ parent_rev_dir_path = Metafile._parent_metafile_rev_dir_path(
322
+ base_metafile_path=path,
323
+ parent_number=1,
324
+ )
325
+ txn_log_dir = posixpath.join(
326
+ posixpath.dirname(
327
+ posixpath.dirname(
328
+ posixpath.dirname(parent_rev_dir_path),
329
+ )
330
+ ),
331
+ TXN_DIR_NAME,
332
+ )
333
+ table = Table.read(
334
+ MetafileRevisionInfo.latest_revision(
335
+ revision_dir_path=parent_rev_dir_path,
336
+ filesystem=filesystem,
337
+ success_txn_log_dir=txn_log_dir,
338
+ ).path,
339
+ filesystem,
340
+ )
341
+ self.locator.table_locator = table.locator
342
+ return self
343
+
344
+ def current_version_number(self) -> Optional[int]:
345
+ """
346
+ Returns the current table version number as an integer, or None if
347
+ a table version has not yet been assigned.
348
+ """
349
+ prefix, version_number = (
350
+ TableVersion.parse_table_version(
351
+ self.table_version,
352
+ )
353
+ if self.table_version is not None
354
+ else (None, None)
355
+ )
356
+ return int(version_number) if version_number is not None else None
357
+
358
+ @staticmethod
359
+ def next_version(previous_version: Optional[str] = None) -> str:
360
+ """
361
+ Assigns the next table version string given the previous table version
362
+ by incrementing the version number of the given previous table version
363
+ identifier. Returns "1" if the previous version is undefined.
364
+ """
365
+ prefix, previous_version_number = (
366
+ TableVersion.parse_table_version(
367
+ previous_version,
368
+ )
369
+ if previous_version is not None
370
+ else (None, None)
371
+ )
372
+ new_version_number = (
373
+ int(previous_version_number) + 1
374
+ if previous_version_number is not None
375
+ else 1
376
+ )
377
+ new_prefix = prefix if prefix is not None else ""
378
+ return f"{new_prefix}{new_version_number}"
379
+
380
+ @staticmethod
381
+ def parse_table_version(table_version: str) -> Tuple[Optional[str], int]:
382
+ """
383
+ Parses a table version string into its prefix and version number.
384
+ Returns a tuple of the prefix and version number.
385
+ """
386
+ if not table_version:
387
+ raise ValueError(f"Table version to parse is undefined.")
388
+ if len(table_version) > BYTES_PER_KIBIBYTE:
389
+ raise ValueError(
390
+ f"Invalid table version {table_version}. Table version "
391
+ f"identifier cannot be greater than {BYTES_PER_KIBIBYTE} "
392
+ f"characters."
393
+ )
394
+ version_match = re.match(
395
+ rf"^(\w*\.)?(\d+)$",
396
+ table_version,
397
+ )
398
+ if version_match:
399
+ prefix, version_number = version_match.groups()
400
+ return prefix, int(version_number)
401
+ raise ValueError(
402
+ f"Invalid table version {table_version}. Valid table versions "
403
+ f"are of the form `TableVersionName.1` or simply `1`.",
404
+ )
405
+
406
+
407
class TableVersionLocatorName(LocatorName):
    """
    LocatorName backed by a TableVersionLocator: both the immutable ID and
    the single name part are the wrapped locator's table version.
    """

    def __init__(self, locator: TableVersionLocator):
        # The wrapped locator is read from and written to directly; no copy
        # is made.
        self.locator = locator

    @property
    def immutable_id(self) -> Optional[str]:
        """Returns the wrapped locator's table version."""
        version = self.locator.table_version
        return version

    @immutable_id.setter
    def immutable_id(self, immutable_id: Optional[str]):
        # Assignment goes through the locator's own `table_version` property.
        self.locator.table_version = immutable_id

    def parts(self) -> List[str]:
        """Returns a one-element list holding the locator's table version."""
        version = self.locator.table_version
        return [version]
421
+
151
422
 
152
423
  class TableVersionLocator(Locator, dict):
153
424
  @staticmethod
154
425
  def of(
155
- table_locator: Optional[TableLocator], table_version: Optional[str]
426
+ table_locator: Optional[TableLocator],
427
+ table_version: Optional[str],
156
428
  ) -> TableVersionLocator:
157
429
  table_version_locator = TableVersionLocator()
158
430
  table_version_locator.table_locator = table_locator
@@ -165,9 +437,17 @@ class TableVersionLocator(Locator, dict):
165
437
  table_name: Optional[str],
166
438
  table_version: Optional[str],
167
439
  ) -> TableVersionLocator:
168
- table_locator = TableLocator.at(namespace, table_name)
440
+ table_locator = TableLocator.at(namespace, table_name) if table_name else None
169
441
  return TableVersionLocator.of(table_locator, table_version)
170
442
 
443
+ @property
444
+ def name(self):
445
+ return TableVersionLocatorName(self)
446
+
447
+ @property
448
+ def parent(self) -> Optional[TableLocator]:
449
+ return self.table_locator
450
+
171
451
  @property
172
452
  def table_locator(self) -> Optional[TableLocator]:
173
453
  val: Dict[str, Any] = self.get("tableLocator")
@@ -185,7 +465,13 @@ class TableVersionLocator(Locator, dict):
185
465
 
186
466
  @table_version.setter
187
467
  def table_version(self, table_version: Optional[str]) -> None:
188
- self["tableVersion"] = table_version
468
+ # ensure that the table version is valid
469
+ prefix, version_number = TableVersion.parse_table_version(table_version)
470
+ # restate the table version number in its canonical form
471
+ # (e.g., ensure that "MyVersion.0001" is saved as "MyVersion.1")
472
+ self["tableVersion"] = (
473
+ f"{prefix}{version_number}" if prefix else str(version_number)
474
+ )
189
475
 
190
476
  @property
191
477
  def namespace_locator(self) -> Optional[NamespaceLocator]:
@@ -207,13 +493,3 @@ class TableVersionLocator(Locator, dict):
207
493
  if table_locator:
208
494
  return table_locator.table_name
209
495
  return None
210
-
211
- def canonical_string(self) -> str:
212
- """
213
- Returns a unique string for the given locator that can be used
214
- for equality checks (i.e. two locators are equal if they have
215
- the same canonical string).
216
- """
217
- tl_hexdigest = self.table_locator.hexdigest()
218
- table_version = self.table_version
219
- return f"{tl_hexdigest}|{table_version}"