deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -1,38 +1,40 @@
1
- from typing import Any, Callable, Dict, List, Optional, Set, Union
2
-
3
- import pyarrow as pa
1
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
4
2
 
5
3
  from deltacat.storage import (
6
- DeleteParameters,
4
+ EntryParams,
7
5
  Delta,
8
6
  DeltaLocator,
7
+ DeltaProperties,
9
8
  DeltaType,
10
9
  DistributedDataset,
11
10
  LifecycleState,
12
11
  ListResult,
13
12
  LocalDataset,
14
13
  LocalTable,
15
- Manifest,
16
14
  ManifestAuthor,
17
15
  Namespace,
16
+ NamespaceProperties,
18
17
  Partition,
19
- SchemaConsistencyType,
18
+ PartitionLocator,
19
+ PartitionScheme,
20
+ PartitionValues,
21
+ Schema,
22
+ SortScheme,
20
23
  Stream,
24
+ StreamFormat,
21
25
  StreamLocator,
22
26
  Table,
27
+ TableProperties,
23
28
  TableVersion,
24
- SortKey,
25
- PartitionLocator,
26
- PartitionFilter,
27
- PartitionValues,
28
- DeltaPartitionSpec,
29
- StreamPartitionSpec,
29
+ TableVersionLocator,
30
+ TableVersionProperties,
30
31
  )
32
+ from deltacat.storage.model.manifest import Manifest
31
33
  from deltacat.types.media import (
32
34
  ContentType,
35
+ DistributedDatasetType,
33
36
  StorageType,
34
37
  TableType,
35
- DistributedDatasetType,
36
38
  )
37
39
  from deltacat.utils.common import ReadKwargsProvider
38
40
 
@@ -64,12 +66,26 @@ def list_table_versions(
64
66
  raise NotImplementedError("list_table_versions not implemented")
65
67
 
66
68
 
69
+ def list_streams(
70
+ namespace: str,
71
+ table_name: str,
72
+ table_version: str,
73
+ *args,
74
+ **kwargs,
75
+ ) -> ListResult[Stream]:
76
+ """
77
+ Lists a page of streams for the given table version.
78
+ Raises an error if the table version does not exist.
79
+ """
80
+ raise NotImplementedError("list_streams not implemented")
81
+
82
+
67
83
  def list_partitions(
68
84
  namespace: str,
69
85
  table_name: str,
70
86
  table_version: Optional[str] = None,
71
87
  *args,
72
- **kwargs
88
+ **kwargs,
73
89
  ) -> ListResult[Partition]:
74
90
  """
75
91
  Lists a page of partitions for the given table version. Partitions are
@@ -96,9 +112,9 @@ def list_deltas(
96
112
  last_stream_position: Optional[int] = None,
97
113
  ascending_order: Optional[bool] = None,
98
114
  include_manifest: bool = False,
99
- partition_filter: Optional[PartitionFilter] = None,
115
+ partition_scheme_id: Optional[str] = None,
100
116
  *args,
101
- **kwargs
117
+ **kwargs,
102
118
  ) -> ListResult[Delta]:
103
119
  """
104
120
  Lists a page of deltas for the given table version and committed partition.
@@ -106,15 +122,13 @@ def list_deltas(
106
122
  limited to inclusive first and last stream positions. Deltas are returned by
107
123
  descending stream position by default. Table version resolves to the latest
108
124
  active table version if not specified. Partition values should not be
109
- specified for unpartitioned tables. Raises an error if the given table
110
- version or partition does not exist.
125
+ specified for unpartitioned tables. Partition scheme ID resolves to the
126
+ table version's current partition scheme by default. Raises an error if the
127
+ given table version or partition does not exist.
111
128
 
112
129
  To conserve memory, the deltas returned do not include manifests by
113
130
  default. The manifests can either be optionally retrieved as part of this
114
131
  call or lazily loaded via subsequent calls to `get_delta_manifest`.
115
-
116
- Note: partition_values is deprecated and will be removed in future releases.
117
- Use partition_filter instead.
118
132
  """
119
133
  raise NotImplementedError("list_deltas not implemented")
120
134
 
@@ -126,7 +140,7 @@ def list_partition_deltas(
126
140
  ascending_order: bool = False,
127
141
  include_manifest: bool = False,
128
142
  *args,
129
- **kwargs
143
+ **kwargs,
130
144
  ) -> ListResult[Delta]:
131
145
  """
132
146
  Lists a page of deltas committed to the given partition.
@@ -145,22 +159,21 @@ def get_delta(
145
159
  partition_values: Optional[PartitionValues] = None,
146
160
  table_version: Optional[str] = None,
147
161
  include_manifest: bool = False,
148
- partition_filter: Optional[PartitionFilter] = None,
162
+ partition_scheme_id: Optional[str] = None,
149
163
  *args,
150
- **kwargs
164
+ **kwargs,
151
165
  ) -> Optional[Delta]:
152
166
  """
153
167
  Gets the delta for the given table version, partition, and stream position.
154
168
  Table version resolves to the latest active table version if not specified.
155
- Partition values should not be specified for unpartitioned tables. Raises
156
- an error if the given table version or partition does not exist.
169
+ Partition values should not be specified for unpartitioned tables. Partition
170
+ scheme ID resolves to the table version's current partition scheme by
171
+ default. Raises an error if the given table version or partition does not
172
+ exist.
157
173
 
158
174
  To conserve memory, the delta returned does not include a manifest by
159
175
  default. The manifest can either be optionally retrieved as part of this
160
176
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
161
-
162
- Note: partition_values is deprecated and will be removed in future releases.
163
- Use partition_filter instead.
164
177
  """
165
178
  raise NotImplementedError("get_delta not implemented")
166
179
 
@@ -171,23 +184,21 @@ def get_latest_delta(
171
184
  partition_values: Optional[PartitionValues] = None,
172
185
  table_version: Optional[str] = None,
173
186
  include_manifest: bool = False,
174
- partition_filter: Optional[PartitionFilter] = None,
187
+ partition_scheme_id: Optional[str] = None,
175
188
  *args,
176
- **kwargs
189
+ **kwargs,
177
190
  ) -> Optional[Delta]:
178
191
  """
179
192
  Gets the latest delta (i.e. the delta with the greatest stream position) for
180
193
  the given table version and partition. Table version resolves to the latest
181
194
  active table version if not specified. Partition values should not be
182
- specified for unpartitioned tables. Raises an error if the given table
183
- version or partition does not exist.
195
+ specified for unpartitioned tables. Partition scheme ID resolves to the
196
+ table version's current partition scheme by default. Raises an error if the
197
+ given table version or partition does not exist.
184
198
 
185
199
  To conserve memory, the delta returned does not include a manifest by
186
200
  default. The manifest can either be optionally retrieved as part of this
187
201
  call or lazily loaded via a subsequent call to `get_delta_manifest`.
188
-
189
- Note: partition_values is deprecated and will be removed in future releases.
190
- Use partition_filter instead.
191
202
  """
192
203
  raise NotImplementedError("get_latest_delta not implemented")
193
204
 
@@ -201,9 +212,8 @@ def download_delta(
201
212
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
202
213
  ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
203
214
  distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
204
- partition_filter: Optional[PartitionFilter] = None,
205
215
  *args,
206
- **kwargs
216
+ **kwargs,
207
217
  ) -> Union[LocalDataset, DistributedDataset]: # type: ignore
208
218
  """
209
219
  Download the given delta or delta locator into either a list of
@@ -211,10 +221,6 @@ def download_delta(
211
221
  across this Ray cluster's object store memory. Ordered table N of a local
212
222
  table list, or ordered block N of a distributed dataset, always contain
213
223
  the contents of ordered delta manifest entry N.
214
-
215
- partition_filter is an optional parameter which determines which files to
216
- download from the delta manifest. A delta manifest contains all the data files
217
- for a given delta.
218
224
  """
219
225
  raise NotImplementedError("download_delta not implemented")
220
226
 
@@ -226,7 +232,7 @@ def download_delta_manifest_entry(
226
232
  columns: Optional[List[str]] = None,
227
233
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
228
234
  *args,
229
- **kwargs
235
+ **kwargs,
230
236
  ) -> LocalTable:
231
237
  """
232
238
  Downloads a single manifest entry into the specified table type for the
@@ -244,17 +250,21 @@ def get_delta_manifest(
244
250
  ) -> Manifest:
245
251
  """
246
252
  Get the manifest associated with the given delta or delta locator. This
247
- always retrieves the authoritative remote copy of the delta manifest, and
248
- never the local manifest defined for any input delta.
253
+ always retrieves the authoritative durable copy of the delta manifest, and
254
+ never the local manifest defined for any input delta. Raises an error if
255
+ the delta can't be found, or if it doesn't contain a manifest.
249
256
  """
250
257
  raise NotImplementedError("get_delta_manifest not implemented")
251
258
 
252
259
 
253
260
  def create_namespace(
254
- namespace: str, permissions: Dict[str, Any], *args, **kwargs
261
+ namespace: str,
262
+ properties: Optional[NamespaceProperties] = None,
263
+ *args,
264
+ **kwargs,
255
265
  ) -> Namespace:
256
266
  """
257
- Creates a table namespace with the given name and permissions. Returns
267
+ Creates a table namespace with the given name and properties. Returns
258
268
  the created namespace.
259
269
  """
260
270
  raise NotImplementedError("create_namespace not implemented")
@@ -262,13 +272,13 @@ def create_namespace(
262
272
 
263
273
  def update_namespace(
264
274
  namespace: str,
265
- permissions: Optional[Dict[str, Any]] = None,
275
+ properties: Optional[NamespaceProperties] = None,
266
276
  new_namespace: Optional[str] = None,
267
277
  *args,
268
- **kwargs
278
+ **kwargs,
269
279
  ) -> None:
270
280
  """
271
- Updates a table namespace's name and/or permissions. Raises an error if the
281
+ Updates a table namespace's name and/or properties. Raises an error if the
272
282
  given namespace does not exist.
273
283
  """
274
284
  raise NotImplementedError("update_namespace not implemented")
@@ -278,52 +288,28 @@ def create_table_version(
278
288
  namespace: str,
279
289
  table_name: str,
280
290
  table_version: Optional[str] = None,
281
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
282
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
283
- partition_keys: Optional[List[Dict[str, Any]]] = None,
284
- primary_key_column_names: Optional[Set[str]] = None,
285
- sort_keys: Optional[List[SortKey]] = None,
291
+ schema: Optional[Schema] = None,
292
+ partition_scheme: Optional[PartitionScheme] = None,
293
+ # TODO(pdames): rename to `sort_scheme`
294
+ sort_keys: Optional[SortScheme] = None,
286
295
  table_version_description: Optional[str] = None,
287
- table_version_properties: Optional[Dict[str, str]] = None,
288
- table_permissions: Optional[Dict[str, Any]] = None,
296
+ table_version_properties: Optional[TableVersionProperties] = None,
289
297
  table_description: Optional[str] = None,
290
- table_properties: Optional[Dict[str, str]] = None,
298
+ table_properties: Optional[TableProperties] = None,
291
299
  supported_content_types: Optional[List[ContentType]] = None,
292
- partition_spec: Optional[StreamPartitionSpec] = None,
293
300
  *args,
294
- **kwargs
295
- ) -> Stream:
301
+ **kwargs,
302
+ ) -> Tuple[Optional[Table], TableVersion, Stream]:
296
303
  """
297
304
  Create a table version with an unreleased lifecycle state and an empty delta
298
- stream. Table versions may be schemaless and unpartitioned, or partitioned
299
- according to a list of partition key names and types. Note that partition
300
- keys are not required to exist in the table's schema, and can thus still be
301
- used with schemaless tables. This can be useful for creating logical shards
302
- of a delta stream where partition keys are known but not projected onto each
303
- row of the table (e.g. all rows of a customer orders table are known to
304
- correspond to a given order day, even if this column doesn't exist in the
305
- table). Primary and sort keys must exist within the table's schema.
306
- Permissions specified at the table level override any conflicting
307
- permissions specified at the table namespace level. Returns the stream
308
- for the created table version. Raises an error if the given namespace does
309
- not exist.
310
-
311
- Schemas are optional for DeltaCAT tables and can be used to inform the data
312
- consistency checks run for each field. If a schema is present, it can be
313
- used to enforce the following column-level data consistency policies at
314
- table load time:
315
-
316
- None: No consistency checks are run. May be mixed with the below two
317
- policies by specifying column names to pass through together with
318
- column names to coerce/validate.
305
+ stream. Table versions may be schemaless and unpartitioned to improve write
306
+ performance, or have their writes governed by a schema and partition scheme
307
+ to improve data consistency and read performance.
319
308
 
320
- Coerce: Coerce fields to fit the schema whenever possible. An explicit
321
- subset of column names to coerce may optionally be specified.
309
+ Returns a tuple containing the created/updated table, table version, and
310
+ stream (respectively).
322
311
 
323
- Validate: Raise an error for any fields that don't fit the schema. An
324
- explicit subset of column names to validate may optionally be specified.
325
-
326
- Either partition_keys or partition_spec must be specified but not both.
312
+ Raises an error if the given namespace does not exist.
327
313
  """
328
314
  raise NotImplementedError("create_table_version not implemented")
329
315
 
@@ -331,18 +317,17 @@ def create_table_version(
331
317
  def update_table(
332
318
  namespace: str,
333
319
  table_name: str,
334
- permissions: Optional[Dict[str, Any]] = None,
335
320
  description: Optional[str] = None,
336
- properties: Optional[Dict[str, str]] = None,
321
+ properties: Optional[TableProperties] = None,
337
322
  new_table_name: Optional[str] = None,
338
323
  *args,
339
- **kwargs
324
+ **kwargs,
340
325
  ) -> None:
341
326
  """
342
327
  Update table metadata describing the table versions it contains. By default,
343
- a table's properties are empty, and its description and permissions are
344
- equal to those given when its first table version was created. Raises an
345
- error if the given table does not exist.
328
+ a table's properties are empty, and its description is equal to that given
329
+ when its first table version was created. Raises an error if the given
330
+ table does not exist.
346
331
  """
347
332
  raise NotImplementedError("update_table not implemented")
348
333
 
@@ -352,12 +337,14 @@ def update_table_version(
352
337
  table_name: str,
353
338
  table_version: str,
354
339
  lifecycle_state: Optional[LifecycleState] = None,
355
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
356
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
340
+ schema: Optional[Schema] = None,
357
341
  description: Optional[str] = None,
358
- properties: Optional[Dict[str, str]] = None,
342
+ properties: Optional[TableVersionProperties] = None,
343
+ partition_scheme: Optional[PartitionScheme] = None,
344
+ # TODO(pdames): rename to `sort_scheme`
345
+ sort_keys: Optional[SortScheme] = None,
359
346
  *args,
360
- **kwargs
347
+ **kwargs,
361
348
  ) -> None:
362
349
  """
363
350
  Update a table version. Notably, updating an unreleased table version's
@@ -375,18 +362,27 @@ def stage_stream(
375
362
  namespace: str,
376
363
  table_name: str,
377
364
  table_version: Optional[str] = None,
365
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
378
366
  *args,
379
- **kwargs
367
+ **kwargs,
380
368
  ) -> Stream:
381
369
  """
382
370
  Stages a new delta stream for the given table version. Resolves to the
383
- latest active table version if no table version is given. Returns the
384
- staged stream. Raises an error if the table version does not exist.
371
+ latest active table version if no table version is given. Resolves to the
372
+ DeltaCAT stream format if no stream format is given. If this stream
373
+ will replace another stream with the same format and scheme, then it will
374
+ have its previous stream ID set to the ID of the stream being replaced.
375
+ Returns the staged stream. Raises an error if the table version does not
376
+ exist.
385
377
  """
386
378
  raise NotImplementedError("stage_stream not implemented")
387
379
 
388
380
 
389
- def commit_stream(stream: Stream, *args, **kwargs) -> Stream:
381
+ def commit_stream(
382
+ stream: Stream,
383
+ *args,
384
+ **kwargs,
385
+ ) -> Stream:
390
386
  """
391
387
  Registers a delta stream with a target table version, replacing any
392
388
  previous stream registered for the same table version. Returns the
@@ -399,43 +395,112 @@ def delete_stream(
399
395
  namespace: str,
400
396
  table_name: str,
401
397
  table_version: Optional[str] = None,
398
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
402
399
  *args,
403
- **kwargs
400
+ **kwargs,
404
401
  ) -> None:
405
402
  """
406
403
  Deletes the delta stream currently registered with the given table version.
407
404
  Resolves to the latest active table version if no table version is given.
408
- Raises an error if the table version does not exist.
405
+ Resolves to the DeltaCAT stream format if no stream format is given.
406
+ Raises an error if the stream does not exist.
409
407
  """
410
408
  raise NotImplementedError("delete_stream not implemented")
411
409
 
412
410
 
411
+ def delete_table(
412
+ namespace: str,
413
+ name: str,
414
+ purge: bool = False,
415
+ *args,
416
+ **kwargs,
417
+ ) -> None:
418
+ """
419
+ Drops the given table and all its contents (table versions, streams, partitions,
420
+ and deltas). If purge is True, also removes all data files associated with the table.
421
+ Raises an error if the given table does not exist.
422
+ """
423
+ raise NotImplementedError("delete_table not implemented")
424
+
425
+
426
+ def delete_namespace(
427
+ namespace: str,
428
+ purge: bool = False,
429
+ *args,
430
+ **kwargs,
431
+ ) -> None:
432
+ """
433
+ Drops a table namespace and all its contents. If purge is True, then all
434
+ tables, table versions, and deltas will be deleted. Otherwise, the namespace
435
+ will be dropped only if it is empty. Raises an error if the given namespace
436
+ does not exist.
437
+ """
438
+ raise NotImplementedError("drop_namespace not implemented")
439
+
440
+
441
+ def get_stream_by_id(
442
+ table_version_locator: TableVersionLocator,
443
+ stream_id: str,
444
+ *args,
445
+ **kwargs,
446
+ ) -> Optional[Partition]:
447
+ """
448
+ Gets the stream for the given table version locator and stream ID.
449
+ Returns None if the stream does not exist. Raises an error if the given
450
+ table version locator does not exist.
451
+ """
452
+ raise NotImplementedError("get_stream_by_id not implemented")
453
+
454
+
413
455
  def get_stream(
414
456
  namespace: str,
415
457
  table_name: str,
416
458
  table_version: Optional[str] = None,
459
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
417
460
  *args,
418
- **kwargs
461
+ **kwargs,
419
462
  ) -> Optional[Stream]:
420
463
  """
421
- Gets the most recently committed stream for the given table version and
422
- partition key values. Resolves to the latest active table version if no
423
- table version is given. Returns None if the table version does not exist.
464
+ Gets the most recently committed stream for the given table version.
465
+ Resolves to the latest active table version if no table version is given.
466
+ Resolves to the DeltaCAT stream format if no stream format is given.
467
+ Returns None if the table version or stream format does not exist.
424
468
  """
425
469
  raise NotImplementedError("get_stream not implemented")
426
470
 
427
471
 
472
+ def stream_exists(
473
+ namespace: str,
474
+ table_name: str,
475
+ table_version: Optional[str] = None,
476
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
477
+ *args,
478
+ **kwargs,
479
+ ) -> bool:
480
+ """
481
+ Returns True if the given Stream exists, False if not.
482
+ Resolves to the latest active table version if no table version is given.
483
+ Resolves to the DeltaCAT stream format if no stream format is given.
484
+ Returns False if the table version or stream format does not exist.
485
+ """
486
+ raise NotImplementedError("stream_exists not implemented")
487
+
488
+
428
489
  def stage_partition(
429
- stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
490
+ stream: Stream,
491
+ partition_values: Optional[PartitionValues] = None,
492
+ partition_scheme_id: Optional[str] = None,
493
+ *args,
494
+ **kwargs,
430
495
  ) -> Partition:
431
496
  """
432
497
  Stages a new partition for the given stream and partition values. Returns
433
498
  the staged partition. If this partition will replace another partition
434
- with the same partition values, then it will have its previous partition ID
435
- set to the ID of the partition being replaced. Partition keys should not be
436
- specified for unpartitioned tables.
499
+ with the same partition values and scheme, then it will have its previous
500
+ partition ID set to the ID of the partition being replaced. Partition values
501
+ should not be specified for unpartitioned tables.
437
502
 
438
- The partition_values must represents the results of transforms in a partition
503
+ The partition_values must represent the results of transforms in a partition
439
504
  spec specified in the stream.
440
505
  """
441
506
  raise NotImplementedError("stage_partition not implemented")
@@ -445,13 +510,18 @@ def commit_partition(
445
510
  partition: Partition,
446
511
  previous_partition: Optional[Partition] = None,
447
512
  *args,
448
- **kwargs
513
+ **kwargs,
449
514
  ) -> Partition:
450
515
  """
451
- Commits the given partition to its associated table version stream,
452
- replacing any previous partition (i.e., "partition being replaced") registered for the same stream and
516
+ Commits the staged partition to its associated table version stream,
517
+ replacing any previous partition registered for the same stream and
453
518
  partition values.
454
- If the previous_partition is passed as an argument, the specified previous_partition will be the partition being replaced, otherwise it will be retrieved.
519
+
520
+ If previous partition is given then it will be replaced with its deltas
521
+ prepended to the new partition being committed. Otherwise the latest
522
+ committed partition with the same keys and partition scheme ID will be
523
+ retrieved.
524
+
455
525
  Returns the registered partition. If the partition's
456
526
  previous delta stream position is specified, then the commit will
457
527
  be rejected if it does not match the actual previous stream position of
@@ -463,33 +533,48 @@ def commit_partition(
463
533
 
464
534
 
465
535
  def delete_partition(
466
- namespace: str,
467
- table_name: str,
468
- table_version: Optional[str] = None,
536
+ stream_locator: StreamLocator,
469
537
  partition_values: Optional[PartitionValues] = None,
538
+ partition_scheme_id: Optional[str] = None,
470
539
  *args,
471
- **kwargs
540
+ **kwargs,
472
541
  ) -> None:
473
542
  """
474
- Deletes the given partition from the specified table version. Resolves to
475
- the latest active table version if no table version is given. Partition
543
+ Deletes the given partition from the specified stream. Partition
476
544
  values should not be specified for unpartitioned tables. Raises an error
477
- if the table version or partition does not exist.
545
+ if the partition does not exist.
478
546
  """
479
547
  raise NotImplementedError("delete_partition not implemented")
480
548
 
481
549
 
550
+ def get_partition_by_id(
551
+ stream_locator: StreamLocator,
552
+ partition_id: str,
553
+ *args,
554
+ **kwargs,
555
+ ) -> Optional[Partition]:
556
+ """
557
+ Gets the partition for the given stream locator and partition ID.
558
+ Returns None if the partition does not exist. Raises an error if the
559
+ given stream locator does not exist.
560
+ """
561
+ raise NotImplementedError("get_partition_by_id not implemented")
562
+
563
+
482
564
  def get_partition(
483
565
  stream_locator: StreamLocator,
484
566
  partition_values: Optional[PartitionValues] = None,
567
+ partition_scheme_id: Optional[str] = None,
485
568
  *args,
486
- **kwargs
569
+ **kwargs,
487
570
  ) -> Optional[Partition]:
488
571
  """
489
572
  Gets the most recently committed partition for the given stream locator and
490
573
  partition key values. Returns None if no partition has been committed for
491
574
  the given table version and/or partition key values. Partition values
492
- should not be specified for unpartitioned tables.
575
+ should not be specified for unpartitioned tables. Partition scheme ID
576
+ resolves to the table version's current partition scheme by default.
577
+ Raises an error if the given stream locator does not exist.
493
578
  """
494
579
  raise NotImplementedError("get_partition not implemented")
495
580
 
@@ -500,14 +585,12 @@ def stage_delta(
500
585
  delta_type: DeltaType = DeltaType.UPSERT,
501
586
  max_records_per_entry: Optional[int] = None,
502
587
  author: Optional[ManifestAuthor] = None,
503
- properties: Optional[Dict[str, str]] = None,
588
+ properties: Optional[DeltaProperties] = None,
504
589
  s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
505
590
  content_type: ContentType = ContentType.PARQUET,
506
- delete_parameters: Optional[DeleteParameters] = None,
507
- partition_spec: Optional[DeltaPartitionSpec] = None,
508
- partition_values: Optional[PartitionValues] = None,
591
+ entry_params: Optional[EntryParams] = None,
509
592
  *args,
510
- **kwargs
593
+ **kwargs,
511
594
  ) -> Delta:
512
595
  """
513
596
  Writes the given table to 1 or more S3 files. Returns an unregistered
@@ -601,7 +684,7 @@ def get_table_version_column_names(
601
684
  table_name: str,
602
685
  table_version: Optional[str] = None,
603
686
  *args,
604
- **kwargs
687
+ **kwargs,
605
688
  ) -> Optional[List[str]]:
606
689
  """
607
690
  Gets a list of column names for the specified table version, or for the
@@ -619,8 +702,8 @@ def get_table_version_schema(
619
702
  table_name: str,
620
703
  table_version: Optional[str] = None,
621
704
  *args,
622
- **kwargs
623
- ) -> Optional[Union[pa.Schema, str, bytes]]:
705
+ **kwargs,
706
+ ) -> Optional[Schema]:
624
707
  """
625
708
  Gets the schema for the specified table version, or for the latest active
626
709
  table version if none is specified. Returns None if the table version is
File without changes