deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -1,1236 +0,0 @@
1
- from typing import Any, Callable, Dict, List, Optional, Union, Tuple
2
-
3
- import pyarrow as pa
4
- import daft
5
- import json
6
- import sqlite3
7
- from sqlite3 import Cursor, Connection
8
- import uuid
9
- import ray
10
-
11
- import io
12
-
13
- from deltacat.tests.test_utils.storage import create_empty_delta
14
- from deltacat.utils.common import current_time_ms
15
-
16
-
17
- from deltacat.storage import (
18
- Delta,
19
- DeltaLocator,
20
- DeltaProperties,
21
- DeltaType,
22
- DistributedDataset,
23
- LifecycleState,
24
- ListResult,
25
- LocalDataset,
26
- LocalTable,
27
- ManifestAuthor,
28
- Namespace,
29
- NamespaceLocator,
30
- NamespaceProperties,
31
- Partition,
32
- PartitionScheme,
33
- Schema,
34
- Stream,
35
- StreamLocator,
36
- Table,
37
- TableVersion,
38
- TableVersionLocator,
39
- TableVersionProperties,
40
- TableLocator,
41
- TableProperties,
42
- CommitState,
43
- SortScheme,
44
- PartitionLocator,
45
- ManifestEntry,
46
- ManifestEntryList,
47
- EntryParams,
48
- PartitionValues,
49
- TransformName,
50
- StreamFormat,
51
- )
52
- from deltacat.storage.model.manifest import Manifest, ManifestMeta, EntryType
53
- from deltacat.types.media import (
54
- ContentType,
55
- StorageType,
56
- TableType,
57
- ContentEncoding,
58
- DistributedDatasetType,
59
- )
60
- from deltacat.utils.common import ReadKwargsProvider
61
- from deltacat.tests.local_deltacat_storage.exceptions import (
62
- InvalidNamespaceError,
63
- LocalStorageValidationError,
64
- )
65
-
66
# Kwarg names used to thread SQLite connection state through every storage call.
SQLITE_CUR_ARG = "sqlite3_cur"
SQLITE_CON_ARG = "sqlite3_con"
DB_FILE_PATH_ARG = "db_file_path"

# Stream format tag for this local storage, plus the table-version property key
# under which the generated stream id is recorded.
STREAM_FORMAT = StreamFormat.SQLITE3
STREAM_ID_PROPERTY = "stream_id"

# DDL for the metadata hierarchy: namespaces -> tables -> table_versions ->
# streams -> partitions -> deltas, plus a flat uri/value table for entry payloads.
CREATE_NAMESPACES_TABLE = (
    "CREATE TABLE IF NOT EXISTS namespaces(locator, value, PRIMARY KEY (locator))"
)
CREATE_TABLES_TABLE = (
    "CREATE TABLE IF NOT EXISTS tables(locator, namespace_locator, value, PRIMARY KEY (locator), "
    "FOREIGN KEY (namespace_locator) REFERENCES namespaces(locator))"
)
CREATE_TABLE_VERSIONS_TABLE = (
    "CREATE TABLE IF NOT EXISTS table_versions(locator, table_locator, value, PRIMARY KEY (locator), "
    "FOREIGN KEY (table_locator) REFERENCES tables(locator))"
)
CREATE_STREAMS_TABLE = (
    "CREATE TABLE IF NOT EXISTS streams(locator, table_version_locator, value, PRIMARY KEY(locator), "
    "FOREIGN KEY (table_version_locator) REFERENCES table_versions(locator))"
)
CREATE_PARTITIONS_TABLE = (
    "CREATE TABLE IF NOT EXISTS partitions(locator, stream_locator, value, PRIMARY KEY(locator), "
    "FOREIGN KEY (stream_locator) REFERENCES streams(locator))"
)
CREATE_DELTAS_TABLE = (
    "CREATE TABLE IF NOT EXISTS deltas(locator, partition_locator, value, PRIMARY KEY(locator), "
    "FOREIGN KEY (partition_locator) REFERENCES partitions(locator))"
)
CREATE_DATA_TABLE = "CREATE TABLE IF NOT EXISTS data(uri, value, PRIMARY KEY(uri))"
96
-
97
-
98
def _get_sqlite3_cursor_con(kwargs) -> Tuple[Cursor, Connection]:
    """Resolve a SQLite (cursor, connection) pair from storage kwargs.

    Prefers an explicitly supplied cursor/connection pair; otherwise opens a
    fresh connection against the configured database file path.

    Raises:
        ValueError: If neither a cursor/connection pair nor a db file path
            is present in kwargs.
    """
    if SQLITE_CUR_ARG in kwargs and SQLITE_CON_ARG in kwargs:
        return kwargs[SQLITE_CUR_ARG], kwargs[SQLITE_CON_ARG]
    if DB_FILE_PATH_ARG in kwargs:
        connection = sqlite3.connect(kwargs[DB_FILE_PATH_ARG])
        return connection.cursor(), connection
    raise ValueError(f"Invalid local db connection kwargs: {kwargs}")
107
-
108
-
109
- def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
110
- return f"cloudpickle://{manifest_entry_id}"
111
-
112
-
113
def _merge_and_promote(
    partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
):
    """Merge deltas from a previous partition into the current partition's deltas.

    Previous-partition deltas whose stream position is strictly greater than the
    first current delta's stream position are prepended, so they are not lost
    when the current partition is promoted.
    """
    cutoff = partition_deltas[0].stream_position
    newer_previous: List[Delta] = [
        d for d in previous_partition_deltas if d.stream_position > cutoff
    ]
    return newer_previous + partition_deltas
124
-
125
-
126
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """List every namespace stored in the local database."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)
    rows = cur.execute("SELECT * FROM namespaces").fetchall()
    # Column 1 holds the JSON-serialized Namespace value.
    namespaces = [Namespace(json.loads(row[1])) for row in rows]
    return ListResult.of(namespaces, None, None)
136
-
137
-
138
def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
    """List all tables under the given namespace."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)
    namespace_key = NamespaceLocator.of(namespace).canonical_string()
    rows = cur.execute(
        "SELECT * FROM tables WHERE namespace_locator = ?", (namespace_key,)
    ).fetchall()
    # Column 2 holds the JSON-serialized Table value.
    tables = [Table(json.loads(row[2])) for row in rows]
    return ListResult.of(tables, None, None)
149
-
150
-
151
def list_table_versions(
    namespace: str, table_name: str, *args, **kwargs
) -> ListResult[TableVersion]:
    """List all versions recorded for the given table."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)
    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    rows = cur.execute(
        "SELECT * FROM table_versions WHERE table_locator = ?",
        (table_locator.canonical_string(),),
    ).fetchall()
    versions = [TableVersion(json.loads(row[2])) for row in rows]
    return ListResult.of(versions, None, None)
168
-
169
-
170
def list_partitions(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> ListResult[Partition]:
    """List the committed partitions of a table version's stream."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)
    stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    rows = cur.execute(
        "SELECT * FROM partitions WHERE stream_locator = ?",
        (stream.locator.canonical_string(),),
    ).fetchall()
    # Only partitions whose commit completed are visible to readers.
    committed = [
        partition
        for partition in (Partition(json.loads(row[2])) for row in rows)
        if partition.state == CommitState.COMMITTED
    ]
    return ListResult.of(committed, None, None)
194
-
195
-
196
def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
    """List committed partitions for a stream by delegating to list_partitions."""
    return list_partitions(
        stream.namespace,
        stream.table_name,
        stream.table_version,
        *args,
        **kwargs,
    )
200
-
201
-
202
def list_deltas(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: Optional[bool] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """List deltas for a table partition, optionally bounded by stream position.

    NOTE(review): the lower bound here is exclusive (`first_stream_position <
    delta.stream_position`) while list_partition_deltas is inclusive — this
    preserves the original behavior exactly.
    """
    stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    if stream is None:
        return ListResult.of([], None, None)

    partition = get_partition(stream.locator, partition_values, *args, **kwargs)

    all_deltas = list_partition_deltas(
        partition,
        first_stream_position=first_stream_position,
        last_stream_position=last_stream_position,
        ascending_order=ascending_order,
        include_manifest=include_manifest,
        *args,
        **kwargs,
    ).all_items()

    kept = []
    for delta in all_deltas:
        above_lower = (
            not first_stream_position
            or first_stream_position < delta.stream_position
        )
        below_upper = (
            not last_stream_position
            or delta.stream_position <= last_stream_position
        )
        if above_lower and below_upper:
            kept.append(delta)
        # Strip manifests unless the caller asked for them.
        if not include_manifest:
            delta.manifest = None

    kept.sort(key=lambda d: d.stream_position, reverse=not ascending_order)
    return ListResult.of(kept, None, None)
246
-
247
-
248
def list_partition_deltas(
    partition_like: Union[Partition, PartitionLocator],
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: bool = False,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """List a partition's deltas whose stream positions fall in the inclusive
    range [first_stream_position, last_stream_position]."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)

    if partition_like is None:
        return ListResult.of([], None, None)

    # Default to an unbounded inclusive range.
    lower = 0 if first_stream_position is None else first_stream_position
    upper = float("inf") if last_stream_position is None else last_stream_position

    assert isinstance(
        partition_like, (Partition, PartitionLocator)
    ), f"Expected a Partition or PartitionLocator as an input argument but found {partition_like}"

    locator = (
        partition_like.locator
        if isinstance(partition_like, Partition)
        else partition_like
    )

    rows = cur.execute(
        "SELECT * FROM deltas WHERE partition_locator = ?",
        (locator.canonical_string(),),
    ).fetchall()

    if not rows:
        return ListResult.of([], None, None)

    deltas = []
    for row in rows:
        delta = Delta(json.loads(row[2]))
        if lower <= delta.stream_position <= upper:
            deltas.append(delta)
        # Strip manifests unless the caller asked for them.
        if not include_manifest:
            delta.manifest = None

    deltas.sort(key=lambda d: d.stream_position, reverse=not ascending_order)
    return ListResult.of(deltas, None, None)
303
-
304
-
305
def get_delta(
    namespace: str,
    table_name: str,
    stream_position: int,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Fetch the delta at an exact stream position, or None if absent.

    Note: partition_scheme_id is accepted for interface parity but unused here.
    """
    cur, _ = _get_sqlite3_cursor_con(kwargs)

    stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    partition = get_partition(stream.locator, partition_values, *args, **kwargs)
    delta_locator = DeltaLocator.of(partition.locator, stream_position)

    row = cur.execute(
        "SELECT * FROM deltas WHERE locator = ?", (delta_locator.canonical_string(),)
    ).fetchone()
    if row is None:
        return None

    delta = Delta(json.loads(row[2]))
    if not include_manifest:
        delta.manifest = None
    return delta
337
-
338
-
339
def get_latest_delta(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Return the delta with the highest stream position, or None if the
    partition has no deltas."""
    # Descending order puts the highest stream position first.
    deltas = list_deltas(
        namespace=namespace,
        table_name=table_name,
        partition_values=partition_values,
        table_version=table_version,
        first_stream_position=None,
        last_stream_position=None,
        ascending_order=False,
        include_manifest=include_manifest,
        partition_scheme_id=partition_scheme_id,
        *args,
        **kwargs,
    ).all_items()
    return deltas[0] if deltas else None
368
-
369
-
370
def download_delta(
    delta_like: Union[Delta, DeltaLocator],
    table_type: TableType = TableType.PYARROW,
    storage_type: StorageType = StorageType.DISTRIBUTED,
    max_parallelism: Optional[int] = None,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
    *args,
    **kwargs,
) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
    """Download every manifest entry of a delta as a local or distributed dataset.

    Note: max_parallelism and ray_options_provider are accepted for interface
    parity but unused by this local implementation.
    """
    # Use the inline manifest when present; otherwise fetch it from storage.
    if isinstance(delta_like, Delta) and delta_like.manifest is not None:
        manifest = Delta(delta_like).manifest
    else:
        manifest = get_delta_manifest(delta_like, *args, **kwargs)

    tables = [
        download_delta_manifest_entry(
            delta_like=delta_like,
            entry_index=index,
            table_type=table_type,
            columns=columns,
            file_reader_kwargs_provider=file_reader_kwargs_provider,
            *args,
            **kwargs,
        )
        for index in range(len(manifest.entries))
    ]

    if storage_type != StorageType.DISTRIBUTED:
        return tables

    if distributed_dataset_type is DistributedDatasetType.DAFT:
        return daft.from_arrow(tables)
    if distributed_dataset_type is DistributedDatasetType.RAY_DATASET:
        return ray.data.from_arrow(tables)
    raise ValueError(f"Dataset type {distributed_dataset_type} not supported!")
409
-
410
-
411
def download_delta_manifest_entry(
    delta_like: Union[Delta, DeltaLocator],
    entry_index: int,
    table_type: TableType = TableType.PYARROW,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    **kwargs,
) -> LocalTable:
    """Download and deserialize a single manifest entry of a delta.

    Args:
        delta_like: Delta (with or without an inline manifest) or DeltaLocator.
        entry_index: Zero-based index of the manifest entry to read.
        table_type: Target in-memory table representation.
        columns: Optional column projection (ignored for PYARROW_PARQUET).
        file_reader_kwargs_provider: Accepted for interface parity; unused here.

    Returns:
        The entry's data as the requested local table type.

    Raises:
        IndexError: If entry_index is out of range.
        ValueError: If the entry's payload is missing or its content type is
            unsupported.
        NotImplementedError: If table_type is NUMPY.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    # Use the inline manifest when present; otherwise fetch it from storage.
    if isinstance(delta_like, Delta) and delta_like.manifest is not None:
        manifest = Delta(delta_like).manifest
    else:
        manifest = get_delta_manifest(delta_like, *args, **kwargs)
    if entry_index >= len(manifest.entries):
        # BUG FIX: the reported upper bound was previously len(entries), which
        # is not a valid index; the last valid index is len(entries) - 1.
        raise IndexError(
            f"Manifest entry index {entry_index} does not exist. "
            f"Valid values: [0, {len(manifest.entries) - 1}]"
        )

    entry = manifest.entries[entry_index]

    res = cur.execute("SELECT value FROM data WHERE uri = ?", (entry.uri,))
    serialized_data = res.fetchone()

    if serialized_data is None:
        raise ValueError(
            f"Invalid value of delta locator: {delta_like.canonical_string()}"
        )

    serialized_data = serialized_data[0]
    if entry.meta.content_type == ContentType.PARQUET:
        if table_type == TableType.PYARROW_PARQUET:
            # Lazy parquet handle; no column projection at this stage.
            table = pa.parquet.ParquetFile(io.BytesIO(serialized_data))
        else:
            table = pa.parquet.read_table(io.BytesIO(serialized_data), columns=columns)
    elif entry.meta.content_type == ContentType.UNESCAPED_TSV:
        assert (
            table_type != TableType.PYARROW_PARQUET
        ), f"uTSV table cannot be read as {table_type}"
        parse_options = pa.csv.ParseOptions(delimiter="\t")
        convert_options = pa.csv.ConvertOptions(
            null_values=[""], strings_can_be_null=True, include_columns=columns
        )
        table = pa.csv.read_csv(
            io.BytesIO(serialized_data),
            parse_options=parse_options,
            convert_options=convert_options,
        )
    else:
        raise ValueError(f"Content type: {entry.meta.content_type} not supported.")

    if table_type == TableType.NUMPY:
        raise NotImplementedError(f"Table type={table_type} not supported")
    if table_type == TableType.PANDAS:
        return table.to_pandas()
    # PYARROW and PYARROW_PARQUET return the pyarrow object as-is.
    return table
473
-
474
-
475
def get_delta_manifest(
    delta_like: Union[Delta, DeltaLocator], *args, **kwargs
) -> Optional[Manifest]:
    """Fetch the manifest for a delta (or delta locator), or None if the delta
    does not exist."""
    delta = get_delta(
        namespace=delta_like.namespace,
        table_name=delta_like.table_name,
        stream_position=delta_like.stream_position,
        partition_values=delta_like.partition_values,
        table_version=delta_like.table_version,
        include_manifest=True,
        *args,
        **kwargs,
    )
    return delta.manifest if delta else None
492
-
493
-
494
def create_namespace(
    namespace: str, properties: NamespaceProperties, *args, **kwargs
) -> Namespace:
    """Create a namespace (idempotently), bootstrapping all metadata tables."""
    cur, con = _get_sqlite3_cursor_con(kwargs)
    locator = NamespaceLocator.of(namespace)
    created = Namespace.of(locator, properties)
    # Bootstrap the full metadata schema so later calls can assume it exists.
    for ddl in (
        CREATE_NAMESPACES_TABLE,
        CREATE_TABLES_TABLE,
        CREATE_TABLE_VERSIONS_TABLE,
        CREATE_STREAMS_TABLE,
        CREATE_PARTITIONS_TABLE,
        CREATE_DELTAS_TABLE,
        CREATE_DATA_TABLE,
    ):
        cur.execute(ddl)
    cur.execute(
        "INSERT OR IGNORE INTO namespaces VALUES(?, ?)",
        (locator.canonical_string(), json.dumps(created)),
    )
    con.commit()
    return created
511
-
512
-
513
def update_namespace(
    namespace: str,
    properties: NamespaceProperties = None,
    new_namespace: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Replace a namespace's stored properties; renaming is not supported."""
    assert new_namespace is None, "namespace name cannot be changed"
    cur, con = _get_sqlite3_cursor_con(kwargs)
    locator = NamespaceLocator.of(namespace)
    updated = Namespace.of(locator, properties)
    cur.execute(
        "UPDATE namespaces SET value = ? WHERE locator = ?",
        (json.dumps(updated), locator.canonical_string()),
    )
    con.commit()
527
-
528
-
529
def create_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    schema: Optional[Union[pa.Schema, Any]] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    table_version_description: Optional[str] = None,
    table_version_properties: Optional[TableVersionProperties] = None,
    table_description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    supported_content_types: Optional[List[ContentType]] = None,
    *args,
    **kwargs,
) -> Stream:
    """Create a new table version (and parent table/stream records if needed).

    Returns:
        The committed Stream created for the new table version.

    Raises:
        AssertionError: If the requested table_version is not exactly one
            greater than the latest existing version, or if a partition/sort
            spec uses a non-identity transform.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    # Local storage only supports identity transforms in partition/sort specs.
    if partition_scheme is not None:
        assert (
            partition_scheme.keys is not None
        ), "Partition keys must be specified with partition scheme"
        for key in partition_scheme.keys:
            assert (
                key.transform is None or key.transform.name == TransformName.IDENTITY
            ), (
                "Local DeltaCAT storage does not support creating table versions "
                "with non identity transform partition spec"
            )
    if sort_keys is not None:
        assert (
            sort_keys.keys is not None
        ), "Sort keys must be specified with sort scheme"
        for key in sort_keys.keys:
            assert (
                key.transform is None or key.transform.name == TransformName.IDENTITY
            ), (
                "Local DeltaCAT storage does not support creating table versions "
                "with non identity transform sort spec"
            )

    latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
    if (
        table_version is not None
        and latest_version
        and int(latest_version.table_version) + 1 != int(table_version)
    ):
        raise AssertionError(
            f"Table version can only be incremented. Last version={latest_version.table_version}"
        )
    elif table_version is None:
        # BUG FIX: this previously produced an int when a prior version existed
        # but a str ("1") otherwise; normalize to str so locator canonical
        # strings serialize consistently across both paths.
        table_version = (
            str(int(latest_version.table_version) + 1) if latest_version else "1"
        )

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    table_obj = Table.of(table_locator, table_description, table_properties)
    table_version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    stream_id = uuid.uuid4().__str__()

    if table_version_properties is None:
        table_version_properties = {}

    # Record the generated stream id on the table version's properties.
    properties = {**table_version_properties, STREAM_ID_PROPERTY: stream_id}
    table_version_obj = TableVersion.of(
        table_version_locator,
        schema=Schema.of(schema) if schema else None,
        partition_scheme=partition_scheme,
        description=table_version_description,
        properties=properties,
        sort_scheme=sort_keys,
        content_types=supported_content_types,
    )
    stream_locator = StreamLocator.of(
        table_version_obj.locator, stream_id=stream_id, stream_format=STREAM_FORMAT
    )
    result_stream = Stream.of(
        stream_locator, partition_scheme=partition_scheme, state=CommitState.COMMITTED
    )

    # Insert table, table version, and stream records; IGNORE keeps the calls
    # idempotent when the table/version already exists.
    params = (
        table_locator.canonical_string(),
        table_locator.namespace_locator.canonical_string(),
        json.dumps(table_obj),
    )
    cur.execute("INSERT OR IGNORE INTO tables VALUES (?, ?, ?)", params)
    params = (
        table_version_locator.canonical_string(),
        table_locator.canonical_string(),
        json.dumps(table_version_obj),
    )
    cur.execute("INSERT OR IGNORE INTO table_versions VALUES (?, ?, ?)", params)

    params = (
        stream_locator.canonical_string(),
        table_version_locator.canonical_string(),
        json.dumps(result_stream),
    )
    cur.execute("INSERT OR IGNORE INTO streams VALUES (?, ?, ?)", params)
    con.commit()
    return result_stream
632
-
633
-
634
- def update_table(
635
- namespace: str,
636
- table_name: str,
637
- description: Optional[str] = None,
638
- properties: Optional[TableProperties] = None,
639
- new_table_name: Optional[str] = None,
640
- *args,
641
- **kwargs,
642
- ) -> None:
643
- cur, con = _get_sqlite3_cursor_con(kwargs)
644
- table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
645
- table_obj = Table.of(table_locator, description, properties)
646
-
647
- params = (table_locator.canonical_string(),)
648
- cur.execute("DELETE FROM tables WHERE locator = ?", params)
649
- params = (
650
- table_locator.canonical_string(),
651
- table_locator.namespace_locator.canonical_string(),
652
- json.dumps(table_obj),
653
- )
654
- cur.execute("INSERT INTO tables VALUES (?, ?, ?)", params)
655
- con.commit()
656
-
657
-
658
- def update_table_version(
659
- namespace: str,
660
- table_name: str,
661
- table_version: str,
662
- lifecycle_state: Optional[LifecycleState] = None,
663
- schema: Optional[Union[pa.Schema, Any]] = None,
664
- description: Optional[str] = None,
665
- properties: Optional[TableVersionProperties] = None,
666
- *args,
667
- **kwargs,
668
- ) -> None:
669
- cur, con = _get_sqlite3_cursor_con(kwargs)
670
- table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
671
- table_version_locator = TableVersionLocator.of(
672
- table_locator=table_locator, table_version=table_version
673
- )
674
-
675
- res = cur.execute(
676
- "SELECT * from table_versions WHERE locator = ?",
677
- (table_version_locator.canonical_string(),),
678
- )
679
- serialized_table_version = res.fetchone()
680
- assert (
681
- serialized_table_version is not None
682
- ), f"Table version not found with locator={table_version_locator.canonical_string()}"
683
- current_table_version_obj = TableVersion(json.loads(serialized_table_version[2]))
684
-
685
- if properties is None:
686
- properties = {}
687
-
688
- current_props = (
689
- current_table_version_obj.properties
690
- if current_table_version_obj.properties
691
- else {}
692
- )
693
-
694
- tv_properties = {**properties, **current_props}
695
- table_version_obj = TableVersion.of(
696
- table_version_locator,
697
- schema=Schema.of(schema) if schema else None,
698
- partition_scheme=current_table_version_obj.partition_scheme,
699
- description=description,
700
- properties=tv_properties,
701
- sort_scheme=current_table_version_obj.sort_scheme,
702
- content_types=current_table_version_obj.content_types,
703
- )
704
-
705
- params = (
706
- table_locator.canonical_string(),
707
- json.dumps(table_version_obj),
708
- table_version_locator.canonical_string(),
709
- )
710
- cur.execute(
711
- "UPDATE table_versions SET table_locator = ?, value = ? WHERE locator = ?",
712
- params,
713
- )
714
- con.commit()
715
-
716
-
717
- def stage_stream(
718
- namespace: str,
719
- table_name: str,
720
- table_version: Optional[str] = None,
721
- *args,
722
- **kwargs,
723
- ) -> Stream:
724
- cur, con = _get_sqlite3_cursor_con(kwargs)
725
-
726
- existing_table_version = get_table_version(
727
- namespace, table_name, table_version, *args, **kwargs
728
- )
729
- existing_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
730
-
731
- stream_id = uuid.uuid4().__str__()
732
- new_stream_locator = StreamLocator.of(
733
- existing_table_version.locator, stream_id, STREAM_FORMAT
734
- )
735
- new_stream = Stream.of(
736
- new_stream_locator,
737
- existing_stream.partition_scheme,
738
- CommitState.STAGED,
739
- existing_stream.locator.canonical_string(),
740
- )
741
-
742
- params = (
743
- new_stream_locator.canonical_string(),
744
- existing_table_version.locator.canonical_string(),
745
- json.dumps(new_stream),
746
- )
747
- cur.execute("INSERT INTO streams VALUES (?, ?, ?)", params)
748
- con.commit()
749
-
750
- return new_stream
751
-
752
-
753
- def commit_stream(stream: Stream, *args, **kwargs) -> Stream:
754
- cur, con = _get_sqlite3_cursor_con(kwargs)
755
-
756
- existing_table_version = get_table_version(
757
- stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
758
- )
759
- stream_to_commit = Stream.of(
760
- stream.locator,
761
- stream.partition_scheme,
762
- CommitState.COMMITTED,
763
- stream.previous_stream_id,
764
- )
765
-
766
- existing_table_version.properties[
767
- STREAM_ID_PROPERTY
768
- ] = stream_to_commit.locator.stream_id
769
-
770
- params = (
771
- json.dumps(existing_table_version),
772
- existing_table_version.locator.canonical_string(),
773
- )
774
- cur.execute("UPDATE table_versions SET value = ? WHERE locator = ?", params)
775
- params = (json.dumps(stream_to_commit), stream_to_commit.locator.canonical_string())
776
- cur.execute("UPDATE streams SET value = ? WHERE locator = ?", params)
777
- con.commit()
778
-
779
- return stream_to_commit
780
-
781
-
782
- def delete_stream(
783
- namespace: str,
784
- table_name: str,
785
- table_version: Optional[str] = None,
786
- *args,
787
- **kwargs,
788
- ) -> None:
789
- cur, con = _get_sqlite3_cursor_con(kwargs)
790
-
791
- table_version_locator = TableVersionLocator.of(
792
- TableLocator.of(NamespaceLocator.of(namespace), table_name), table_version
793
- )
794
-
795
- res = cur.execute(
796
- "SELECT locator FROM streams WHERE table_version_locator = ?",
797
- (table_version_locator.canonical_string(),),
798
- )
799
- locators = res.fetchall()
800
- cur.executemany("DELETE FROM streams WHERE locator = ?", locators)
801
- cur.execute(
802
- "DELETE FROM table_versions WHERE locator = ?",
803
- (table_version_locator.canonical_string(),),
804
- )
805
-
806
- con.commit()
807
-
808
-
809
- def stage_partition(
810
- stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
811
- ) -> Partition:
812
- cur, con = _get_sqlite3_cursor_con(kwargs)
813
- partition_id = uuid.uuid4().__str__()
814
- partition_locator = PartitionLocator.of(
815
- stream.locator, partition_values=partition_values, partition_id=partition_id
816
- )
817
-
818
- tv = get_table_version(
819
- stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
820
- )
821
-
822
- pv_partition = get_partition(
823
- stream.locator, partition_values=partition_values, *args, **kwargs
824
- )
825
-
826
- stream_position = current_time_ms()
827
- partition = Partition.of(
828
- partition_locator,
829
- schema=tv.schema,
830
- content_types=tv.content_types,
831
- state=CommitState.STAGED,
832
- previous_stream_position=pv_partition.stream_position if pv_partition else None,
833
- previous_partition_id=pv_partition.partition_id if pv_partition else None,
834
- stream_position=stream_position,
835
- )
836
-
837
- params = (
838
- partition.locator.canonical_string(),
839
- partition.stream_locator.canonical_string(),
840
- json.dumps(partition),
841
- )
842
- cur.execute("INSERT INTO partitions VALUES (?, ?, ?)", params)
843
- con.commit()
844
-
845
- return partition
846
-
847
-
848
- def commit_partition(
849
- partition: Partition,
850
- previous_partition: Optional[Partition] = None,
851
- *args,
852
- **kwargs,
853
- ) -> Partition:
854
- cur, con = _get_sqlite3_cursor_con(kwargs)
855
- pv_partition: Optional[Partition] = previous_partition or get_partition(
856
- partition.stream_locator,
857
- partition_values=partition.partition_values,
858
- *args,
859
- **kwargs,
860
- )
861
- # deprecate old partition and commit new one
862
- if pv_partition:
863
- pv_partition.state = CommitState.DEPRECATED
864
- params = (json.dumps(pv_partition), pv_partition.locator.canonical_string())
865
- cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
866
- previous_partition_deltas = (
867
- list_partition_deltas(
868
- pv_partition, ascending_order=False, *args, **kwargs
869
- ).all_items()
870
- or []
871
- )
872
-
873
- partition_deltas: Optional[List[Delta]] = (
874
- list_partition_deltas(
875
- partition, ascending_order=False, *args, **kwargs
876
- ).all_items()
877
- or []
878
- )
879
-
880
- # if previous_partition is passed in, table is in-place compacted and we need to run merge-and-promote
881
- if previous_partition:
882
- partition_deltas = _merge_and_promote(
883
- partition_deltas, previous_partition_deltas
884
- )
885
-
886
- stream_position = (
887
- partition_deltas[0].stream_position
888
- if partition_deltas
889
- else partition.stream_position
890
- )
891
-
892
- partition.stream_position = stream_position
893
- if partition_deltas:
894
- partition.locator = partition_deltas[0].partition_locator
895
-
896
- partition.state = CommitState.COMMITTED
897
- partition.previous_stream_position = (
898
- pv_partition.stream_position if pv_partition else None
899
- )
900
- params = (json.dumps(partition), partition.locator.canonical_string())
901
- cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
902
- con.commit()
903
-
904
- return partition
905
-
906
-
907
- def delete_partition(
908
- namespace: str,
909
- table_name: str,
910
- table_version: Optional[str] = None,
911
- partition_values: Optional[PartitionValues] = None,
912
- *args,
913
- **kwargs,
914
- ) -> None:
915
- cur, con = _get_sqlite3_cursor_con(kwargs)
916
- stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
917
- partition = get_partition(stream.locator, partition_values, *args, **kwargs)
918
-
919
- partition.state = CommitState.DEPRECATED
920
- params = (json.dumps(partition), partition.locator.canonical_string())
921
-
922
- cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
923
- con.commit()
924
-
925
-
926
- def get_partition(
927
- stream_locator: StreamLocator,
928
- partition_values: Optional[PartitionValues] = None,
929
- *args,
930
- **kwargs,
931
- ) -> Optional[Partition]:
932
- cur, con = _get_sqlite3_cursor_con(kwargs)
933
-
934
- res = cur.execute(
935
- "SELECT * FROM partitions WHERE stream_locator = ?",
936
- (stream_locator.canonical_string(),),
937
- )
938
-
939
- serialized_partitions = res.fetchall()
940
-
941
- if not serialized_partitions:
942
- return None
943
-
944
- if partition_values is None:
945
- partition_values = []
946
-
947
- prior_pv = ",".join(partition_values)
948
-
949
- for item in serialized_partitions:
950
- partition = Partition(json.loads(item[2]))
951
- pv = ",".join(partition.partition_values if partition.partition_values else [])
952
-
953
- if pv == prior_pv and partition.state == CommitState.COMMITTED:
954
- return partition
955
-
956
- return None
957
-
958
-
959
- def stage_delta(
960
- data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
961
- partition: Partition,
962
- delta_type: DeltaType = DeltaType.UPSERT,
963
- max_records_per_entry: Optional[int] = None,
964
- author: Optional[ManifestAuthor] = None,
965
- properties: Optional[DeltaProperties] = None,
966
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
967
- content_type: ContentType = ContentType.PARQUET,
968
- entry_params: Optional[EntryParams] = None,
969
- *args,
970
- **kwargs,
971
- ) -> Delta:
972
- cur, con = _get_sqlite3_cursor_con(kwargs)
973
- manifest_id = uuid.uuid4().__str__()
974
- uri = _get_manifest_entry_uri(manifest_id)
975
-
976
- if data is None:
977
- delta = create_empty_delta(
978
- partition,
979
- delta_type,
980
- author,
981
- properties=properties,
982
- manifest_entry_id=manifest_id,
983
- )
984
- cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, None))
985
- params = (delta.locator.canonical_string(), "staged_delta", json.dumps(delta))
986
- cur.execute("INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)", params)
987
- con.commit()
988
- return delta
989
-
990
- serialized_data = None
991
- if content_type == ContentType.PARQUET:
992
- buffer = io.BytesIO()
993
- pa.parquet.write_table(data, buffer)
994
- serialized_data = buffer.getvalue()
995
- elif content_type == ContentType.UNESCAPED_TSV:
996
- buffer = io.BytesIO()
997
- write_options = pa.csv.WriteOptions(
998
- include_header=True, delimiter="\t", quoting_style="none"
999
- )
1000
- pa.csv.write_csv(data, buffer, write_options=write_options)
1001
- serialized_data = buffer.getvalue()
1002
- else:
1003
- raise ValueError(f"Unsupported content type: {content_type}")
1004
-
1005
- stream_position = current_time_ms()
1006
- delta_locator = DeltaLocator.of(partition.locator, stream_position=stream_position)
1007
-
1008
- entry_type = (
1009
- EntryType.EQUALITY_DELETE if delta_type is DeltaType.DELETE else EntryType.DATA
1010
- )
1011
- meta = ManifestMeta.of(
1012
- len(data),
1013
- len(serialized_data),
1014
- content_type=content_type,
1015
- content_encoding=ContentEncoding.IDENTITY,
1016
- source_content_length=data.nbytes,
1017
- entry_type=entry_type,
1018
- entry_params=entry_params,
1019
- )
1020
-
1021
- manifest = Manifest.of(
1022
- entries=ManifestEntryList.of(
1023
- [
1024
- ManifestEntry.of(
1025
- uri=uri,
1026
- url=uri,
1027
- meta=meta,
1028
- mandatory=True,
1029
- uuid=manifest_id,
1030
- )
1031
- ]
1032
- ),
1033
- author=author,
1034
- uuid=manifest_id,
1035
- entry_type=entry_type,
1036
- entry_params=entry_params,
1037
- )
1038
-
1039
- delta = Delta.of(
1040
- delta_locator,
1041
- delta_type=delta_type,
1042
- meta=meta,
1043
- properties=properties,
1044
- manifest=manifest,
1045
- previous_stream_position=partition.stream_position,
1046
- )
1047
-
1048
- params = (uri, serialized_data)
1049
- cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", params)
1050
-
1051
- params = (delta_locator.canonical_string(), "staged_delta", json.dumps(delta))
1052
- cur.execute("INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)", params)
1053
-
1054
- con.commit()
1055
- return delta
1056
-
1057
-
1058
- def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
1059
- cur, con = _get_sqlite3_cursor_con(kwargs)
1060
- delta_stream_position: Optional[int] = delta.stream_position
1061
- delta.locator.stream_position = delta_stream_position or current_time_ms()
1062
-
1063
- params = (
1064
- delta.locator.canonical_string(),
1065
- delta.partition_locator.canonical_string(),
1066
- json.dumps(delta),
1067
- )
1068
-
1069
- cur.execute("INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)", params)
1070
-
1071
- params = (
1072
- delta.partition_locator.canonical_string(),
1073
- json.dumps(delta),
1074
- delta.locator.canonical_string(),
1075
- )
1076
- cur.execute(
1077
- "UPDATE deltas SET partition_locator = ?, value = ? WHERE locator = ?", params
1078
- )
1079
- con.commit()
1080
- return delta
1081
-
1082
-
1083
- def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
1084
- cur, con = _get_sqlite3_cursor_con(kwargs)
1085
- locator = NamespaceLocator.of(namespace)
1086
-
1087
- res = cur.execute(
1088
- "SELECT * FROM namespaces WHERE locator = ?", (locator.canonical_string(),)
1089
- )
1090
- serialized_result = res.fetchone()
1091
-
1092
- if serialized_result is None:
1093
- return None
1094
-
1095
- return Namespace(json.loads(serialized_result[1]))
1096
-
1097
-
1098
- def namespace_exists(namespace: str, *args, **kwargs) -> bool:
1099
- obj = get_namespace(namespace, *args, **kwargs)
1100
-
1101
- return obj is not None
1102
-
1103
-
1104
- def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
1105
- cur, con = _get_sqlite3_cursor_con(kwargs)
1106
- locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
1107
-
1108
- res = cur.execute(
1109
- "SELECT * FROM tables WHERE locator = ?", (locator.canonical_string(),)
1110
- )
1111
- serialized_result = res.fetchone()
1112
-
1113
- if serialized_result is None:
1114
- return None
1115
-
1116
- return Table(json.loads(serialized_result[2]))
1117
-
1118
-
1119
- def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
1120
- obj = get_table(namespace, table_name, *args, **kwargs)
1121
-
1122
- return obj is not None
1123
-
1124
-
1125
- def get_table_version(
1126
- namespace: str, table_name: str, table_version: str, *args, **kwargs
1127
- ) -> Optional[TableVersion]:
1128
- cur, con = _get_sqlite3_cursor_con(kwargs)
1129
- locator = TableVersionLocator.of(
1130
- TableLocator.of(NamespaceLocator.of(namespace), table_name), table_version
1131
- )
1132
-
1133
- res = cur.execute(
1134
- "SELECT * FROM table_versions WHERE locator = ?", (locator.canonical_string(),)
1135
- )
1136
- serialized_table_version = res.fetchone()
1137
-
1138
- if serialized_table_version is None:
1139
- return None
1140
-
1141
- return TableVersion(json.loads(serialized_table_version[2]))
1142
-
1143
-
1144
- def get_latest_table_version(
1145
- namespace: str, table_name: str, *args, **kwargs
1146
- ) -> Optional[TableVersion]:
1147
- table_versions = list_table_versions(
1148
- namespace, table_name, *args, **kwargs
1149
- ).all_items()
1150
- if not table_versions:
1151
- return None
1152
-
1153
- table_versions.sort(reverse=True, key=lambda v: int(v.table_version))
1154
- return table_versions[0]
1155
-
1156
-
1157
- def get_latest_active_table_version(
1158
- namespace: str, table_name: str, *args, **kwargs
1159
- ) -> Optional[TableVersion]:
1160
-
1161
- # This module does not support table version lifecycle state
1162
- return get_latest_table_version(namespace, table_name, *args, **kwargs)
1163
-
1164
-
1165
- def get_table_version_schema(
1166
- namespace: str,
1167
- table_name: str,
1168
- table_version: Optional[str] = None,
1169
- *args,
1170
- **kwargs,
1171
- ) -> Optional[Union[pa.Schema, Any]]:
1172
- obj = get_table_version(namespace, table_name, table_version, *args, **kwargs)
1173
-
1174
- return obj.schema
1175
-
1176
-
1177
- def table_version_exists(
1178
- namespace: str, table_name: str, table_version: str, *args, **kwargs
1179
- ) -> bool:
1180
- obj = get_table_version(namespace, table_name, table_version, *args, **kwargs)
1181
-
1182
- return obj is not None
1183
-
1184
-
1185
- def get_stream(
1186
- namespace: str,
1187
- table_name: str,
1188
- table_version: Optional[str] = None,
1189
- *args,
1190
- **kwargs,
1191
- ) -> Optional[Stream]:
1192
- assert not isinstance(table_version, int), f"Passed an integer as the table version"
1193
- obj = get_table_version(namespace, table_name, table_version, *args, **kwargs)
1194
-
1195
- if obj is None:
1196
- return None
1197
-
1198
- stream_id = obj.properties.get(STREAM_ID_PROPERTY)
1199
- if stream_id is None:
1200
- return None
1201
-
1202
- cur, con = _get_sqlite3_cursor_con(kwargs)
1203
- stream_locator = StreamLocator.of(
1204
- obj.locator, stream_id=stream_id, stream_format=STREAM_FORMAT
1205
- )
1206
- res = cur.execute(
1207
- "SELECT * FROM streams WHERE locator = ?", (stream_locator.canonical_string(),)
1208
- )
1209
-
1210
- serialized_stream = res.fetchone()
1211
- if serialized_stream is None:
1212
- return None
1213
-
1214
- return Stream(json.loads(serialized_stream[2]))
1215
-
1216
-
1217
- def get_table_version_column_names(
1218
- namespace: str,
1219
- table_name: str,
1220
- table_version: Optional[str] = None,
1221
- *args,
1222
- **kwargs,
1223
- ) -> Optional[List[str]]:
1224
- raise NotImplementedError("Fetching column names is not supported")
1225
-
1226
-
1227
- def can_categorize(e: BaseException, **kwargs) -> bool:
1228
- if isinstance(e, InvalidNamespaceError):
1229
- return True
1230
- else:
1231
- return False
1232
-
1233
-
1234
- def raise_categorized_error(e: BaseException, **kwargs):
1235
- if isinstance(e, InvalidNamespaceError):
1236
- raise LocalStorageValidationError("Namespace provided is invalid!")