deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1109 @@
|
|
1
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Union, Tuple
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
import json
|
5
|
+
import sqlite3
|
6
|
+
from sqlite3 import Cursor, Connection
|
7
|
+
import uuid
|
8
|
+
import io
|
9
|
+
from deltacat.utils.common import current_time_ms
|
10
|
+
|
11
|
+
from deltacat.storage import (
|
12
|
+
Delta,
|
13
|
+
DeltaLocator,
|
14
|
+
DeltaType,
|
15
|
+
DistributedDataset,
|
16
|
+
LifecycleState,
|
17
|
+
ListResult,
|
18
|
+
LocalDataset,
|
19
|
+
LocalTable,
|
20
|
+
Manifest,
|
21
|
+
ManifestAuthor,
|
22
|
+
Namespace,
|
23
|
+
NamespaceLocator,
|
24
|
+
Partition,
|
25
|
+
SchemaConsistencyType,
|
26
|
+
Stream,
|
27
|
+
StreamLocator,
|
28
|
+
Table,
|
29
|
+
TableVersion,
|
30
|
+
TableVersionLocator,
|
31
|
+
TableLocator,
|
32
|
+
CommitState,
|
33
|
+
SortKey,
|
34
|
+
PartitionLocator,
|
35
|
+
ManifestMeta,
|
36
|
+
ManifestEntry,
|
37
|
+
ManifestEntryList,
|
38
|
+
)
|
39
|
+
from deltacat.types.media import ContentType, StorageType, TableType, ContentEncoding
|
40
|
+
from deltacat.utils.common import ReadKwargsProvider
|
41
|
+
|
42
|
+
# Names of the keyword arguments through which callers hand this module its
# SQLite handles: either an existing cursor/connection pair, or the path of a
# database file to connect to.
SQLITE_CUR_ARG = "sqlite3_cur"
SQLITE_CON_ARG = "sqlite3_con"
DB_FILE_PATH_ARG = "db_file_path"

# Storage type recorded on the stream locators created by this module.
STORAGE_TYPE = "SQLITE3"
# Table-version property key under which a stream id is stored.
STREAM_ID_PROPERTY = "stream_id"
# DDL for the catalog tables. Every table is keyed by a `locator` column and
# keeps its payload in a `value` column; each child table also stores its
# parent's locator and declares it as a foreign key.
CREATE_NAMESPACES_TABLE = (
    "CREATE TABLE IF NOT EXISTS namespaces(locator, value, PRIMARY KEY (locator))"
)
CREATE_TABLES_TABLE = (
    "CREATE TABLE IF NOT EXISTS tables(locator, namespace_locator, value, PRIMARY KEY (locator), "
    "FOREIGN KEY (namespace_locator) REFERENCES namespaces(locator))"
)
CREATE_TABLE_VERSIONS_TABLE = (
    "CREATE TABLE IF NOT EXISTS table_versions(locator, table_locator, value, PRIMARY KEY (locator), "
    "FOREIGN KEY (table_locator) REFERENCES tables(locator))"
)
CREATE_STREAMS_TABLE = (
    "CREATE TABLE IF NOT EXISTS streams(locator, table_version_locator, value, PRIMARY KEY(locator), "
    "FOREIGN KEY (table_version_locator) REFERENCES table_versions(locator))"
)
CREATE_PARTITIONS_TABLE = (
    "CREATE TABLE IF NOT EXISTS partitions(locator, stream_locator, value, PRIMARY KEY(locator), "
    "FOREIGN KEY (stream_locator) REFERENCES streams(locator))"
)
CREATE_DELTAS_TABLE = (
    "CREATE TABLE IF NOT EXISTS deltas(locator, partition_locator, value, PRIMARY KEY(locator), "
    "FOREIGN KEY (partition_locator) REFERENCES partitions(locator))"
)
# Raw manifest-entry payloads, keyed by entry URI.
CREATE_DATA_TABLE = "CREATE TABLE IF NOT EXISTS data(uri, value, PRIMARY KEY(uri))"
|
72
|
+
|
73
|
+
|
74
|
+
def _get_sqlite3_cursor_con(kwargs) -> Tuple[Cursor, Connection]:
    """Resolve the SQLite cursor and connection described by ``kwargs``.

    An explicit cursor/connection pair (both ``SQLITE_CUR_ARG`` and
    ``SQLITE_CON_ARG`` present) wins. Otherwise a new connection is opened on
    the file named by ``DB_FILE_PATH_ARG``.

    Args:
        kwargs: Keyword-argument mapping forwarded by a storage function.

    Returns:
        A ``(cursor, connection)`` tuple.

    Raises:
        ValueError: If ``kwargs`` carries neither a cursor/connection pair nor
            a database file path.
    """
    has_handles = SQLITE_CUR_ARG in kwargs and SQLITE_CON_ARG in kwargs
    if has_handles:
        return kwargs[SQLITE_CUR_ARG], kwargs[SQLITE_CON_ARG]

    if DB_FILE_PATH_ARG not in kwargs:
        raise ValueError(f"Invalid local db connection kwargs: {kwargs}")

    connection = sqlite3.connect(kwargs[DB_FILE_PATH_ARG])
    cursor = connection.cursor()
    return cursor, connection
|
83
|
+
|
84
|
+
|
85
|
+
def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
|
86
|
+
return f"cloudpickle://{manifest_entry_id}"
|
87
|
+
|
88
|
+
|
89
|
+
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """Return every namespace stored in the ``namespaces`` table.

    Args:
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A single-page ``ListResult`` of ``Namespace`` objects.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    rows = cursor.execute("SELECT * FROM namespaces").fetchall()

    # Column 1 is the JSON-serialized namespace (column 0 is its locator).
    namespaces = [Namespace(json.loads(row[1])) for row in rows]
    return ListResult.of(namespaces, None, None)
|
99
|
+
|
100
|
+
|
101
|
+
def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
    """Return all tables registered under ``namespace``.

    Args:
        namespace: Name of the namespace whose tables are listed.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A single-page ``ListResult`` of ``Table`` objects.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    namespace_key = NamespaceLocator.of(namespace).canonical_string()
    rows = cursor.execute(
        "SELECT * FROM tables WHERE namespace_locator = ?", (namespace_key,)
    ).fetchall()

    # Column 2 is the JSON-serialized table.
    tables = [Table(json.loads(row[2])) for row in rows]
    return ListResult.of(tables, None, None)
|
112
|
+
|
113
|
+
|
114
|
+
def list_table_versions(
    namespace: str, table_name: str, *args, **kwargs
) -> ListResult[TableVersion]:
    """Return all versions stored for the table ``namespace.table_name``.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A single-page ``ListResult`` of ``TableVersion`` objects.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    parent_table = TableLocator.of(NamespaceLocator.of(namespace), table_name)

    query_params = (parent_table.canonical_string(),)
    rows = cursor.execute(
        "SELECT * FROM table_versions WHERE table_locator = ?",
        query_params,
    ).fetchall()

    # Column 2 is the JSON-serialized table version.
    versions = [TableVersion(json.loads(row[2])) for row in rows]
    return ListResult.of(versions, None, None)
|
131
|
+
|
132
|
+
|
133
|
+
def list_partitions(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> ListResult[Partition]:
    """Return the committed partitions of a table version's stream.

    The stream is resolved through ``get_stream``; partitions whose state is
    not ``CommitState.COMMITTED`` are left out of the result.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        table_version: Table version to inspect; forwarded to ``get_stream``.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A single-page ``ListResult`` of committed ``Partition`` objects.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)

    owning_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)

    query_params = (owning_stream.locator.canonical_string(),)
    rows = cursor.execute(
        "SELECT * FROM partitions WHERE stream_locator = ?",
        query_params,
    ).fetchall()

    # Column 2 is the JSON-serialized partition.
    candidates = (Partition(json.loads(row[2])) for row in rows)
    committed = [
        candidate
        for candidate in candidates
        if candidate.state == CommitState.COMMITTED
    ]
    return ListResult.of(committed, None, None)
|
157
|
+
|
158
|
+
|
159
|
+
def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
    """List partitions for the table version that ``stream`` belongs to.

    Delegates to ``list_partitions`` using the stream's namespace, table name
    and table version.
    """
    namespace = stream.namespace
    table_name = stream.table_name
    table_version = stream.table_version
    return list_partitions(namespace, table_name, table_version, *args, **kwargs)
|
163
|
+
|
164
|
+
|
165
|
+
def list_deltas(
    namespace: str,
    table_name: str,
    partition_values: Optional[List[Any]] = None,
    table_version: Optional[str] = None,
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: Optional[bool] = None,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """List the deltas of one partition, filtered by stream position.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        partition_values: Values identifying the partition; forwarded to
            ``get_partition``.
        table_version: Table version to inspect; forwarded to ``get_stream``.
        first_stream_position: When truthy, only deltas with a stream position
            strictly greater than this value are returned.
        last_stream_position: When truthy, only deltas with a stream position
            less than or equal to this value are returned.
        ascending_order: Sort by ascending stream position when truthy;
            descending otherwise (including the default ``None``).
        include_manifest: When false, the manifest of every returned delta is
            set to ``None``.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A single-page ``ListResult`` of ``Delta`` objects. It is empty when
        ``get_stream`` returns ``None``.
    """
    stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    if stream is None:
        return ListResult.of([], None, None)

    partition = get_partition(stream.locator, partition_values, *args, **kwargs)

    # The named parameters are passed positionally on purpose: spelling them
    # as keywords ahead of ``*args`` makes any forwarded positional argument
    # collide with ``first_stream_position`` and raise TypeError.
    all_deltas = list_partition_deltas(
        partition,
        first_stream_position,
        last_stream_position,
        ascending_order,
        include_manifest,
        *args,
        **kwargs,
    ).all_items()

    result = []
    for delta in all_deltas:
        if (
            not first_stream_position or first_stream_position < delta.stream_position
        ) and (
            not last_stream_position or delta.stream_position <= last_stream_position
        ):
            if not include_manifest:
                delta.manifest = None
            result.append(delta)

    result.sort(key=lambda d: d.stream_position, reverse=not ascending_order)
    return ListResult.of(result, None, None)
|
208
|
+
|
209
|
+
|
210
|
+
def list_partition_deltas(
    partition_like: Union[Partition, PartitionLocator],
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: bool = False,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """List the deltas stored for a partition within a stream-position range.

    Args:
        partition_like: The partition, or its locator, whose deltas are
            listed. ``None`` yields an empty result.
        first_stream_position: Inclusive lower bound; ``None`` means 0.
        last_stream_position: Inclusive upper bound; ``None`` means unbounded.
        ascending_order: Sort by ascending stream position when truthy,
            descending otherwise.
        include_manifest: When false, the manifest of every returned delta is
            set to ``None``.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A single-page ``ListResult`` of ``Delta`` objects.

    Raises:
        AssertionError: If ``partition_like`` is neither a ``Partition`` nor a
            ``PartitionLocator``.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)

    if partition_like is None:
        return ListResult.of([], None, None)

    lower_bound = 0 if first_stream_position is None else first_stream_position
    upper_bound = (
        float("inf") if last_stream_position is None else last_stream_position
    )

    assert isinstance(
        partition_like, (Partition, PartitionLocator)
    ), f"Expected a Partition or PartitionLocator as an input argument but found {partition_like}"

    if isinstance(partition_like, Partition):
        target_locator = partition_like.locator
    else:
        target_locator = partition_like

    rows = cursor.execute(
        "SELECT * FROM deltas WHERE partition_locator = ?",
        (target_locator.canonical_string(),),
    ).fetchall()

    in_range = []
    for row in rows:
        # Column 2 is the JSON-serialized delta.
        candidate = Delta(json.loads(row[2]))
        if lower_bound <= candidate.stream_position <= upper_bound:
            if not include_manifest:
                candidate.manifest = None
            in_range.append(candidate)

    in_range.sort(key=lambda d: d.stream_position, reverse=not ascending_order)
    return ListResult.of(in_range, None, None)
|
267
|
+
|
268
|
+
|
269
|
+
def get_delta(
    namespace: str,
    table_name: str,
    stream_position: int,
    partition_values: Optional[List[Any]] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Fetch the delta at ``stream_position`` in the addressed partition.

    The partition is resolved through ``get_stream`` and ``get_partition``.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        stream_position: Stream position of the delta to fetch.
        partition_values: Values identifying the partition.
        table_version: Table version to inspect; forwarded to ``get_stream``.
        include_manifest: When false, the returned delta's manifest is set to
            ``None``.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        The stored ``Delta``, or ``None`` when no row matches its locator.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)

    owning_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    owning_partition = get_partition(
        owning_stream.locator, partition_values, *args, **kwargs
    )
    wanted = DeltaLocator.of(owning_partition.locator, stream_position)

    row = cursor.execute(
        "SELECT * FROM deltas WHERE locator = ?", (wanted.canonical_string(),)
    ).fetchone()
    if row is None:
        return None

    # Column 2 is the JSON-serialized delta.
    found = Delta(json.loads(row[2]))
    if not include_manifest:
        found.manifest = None
    return found
|
299
|
+
|
300
|
+
|
301
|
+
def get_latest_delta(
    namespace: str,
    table_name: str,
    partition_values: Optional[List[Any]] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Return the delta with the highest stream position, if any.

    Lists the partition's deltas through ``list_deltas`` in descending order,
    with no stream-position bounds, and returns the first one.

    Returns:
        The newest ``Delta``, or ``None`` when the listing is empty.
    """
    newest_first = list_deltas(
        namespace,
        table_name,
        partition_values,
        table_version,
        None,  # first_stream_position: unbounded
        None,  # last_stream_position: unbounded
        False,  # ascending_order: newest delta first
        include_manifest,
        *args,
        **kwargs,
    ).all_items()

    return newest_first[0] if newest_first else None
|
328
|
+
|
329
|
+
|
330
|
+
def download_delta(
    delta_like: Union[Delta, DeltaLocator],
    table_type: TableType = TableType.PYARROW,
    storage_type: StorageType = StorageType.DISTRIBUTED,
    max_parallelism: Optional[int] = None,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    *args,
    **kwargs,
) -> Union[LocalDataset, DistributedDataset]:
    """Download every manifest entry of a delta as a list of local tables.

    This implementation always returns a plain list; ``storage_type``,
    ``max_parallelism`` and ``ray_options_provider`` are accepted for
    interface compatibility but not used.

    Args:
        delta_like: The delta, or its locator, to download.
        table_type: Table representation requested for each entry.
        columns: Optional subset of columns to read.
        file_reader_kwargs_provider: Forwarded to
            ``download_delta_manifest_entry``.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        One table per manifest entry, in manifest order.
    """
    manifest = get_delta_manifest(delta_like, *args, **kwargs)

    # The named parameters are passed positionally on purpose: spelling them
    # as keywords ahead of ``*args`` makes any forwarded positional argument
    # collide with ``delta_like`` and raise TypeError.
    return [
        download_delta_manifest_entry(
            delta_like,
            entry_index,
            table_type,
            columns,
            file_reader_kwargs_provider,
            *args,
            **kwargs,
        )
        for entry_index in range(len(manifest.entries))
    ]
|
358
|
+
|
359
|
+
|
360
|
+
def download_delta_manifest_entry(
    delta_like: Union[Delta, DeltaLocator],
    entry_index: int,
    table_type: TableType = TableType.PYARROW,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    **kwargs,
) -> LocalTable:
    """Read one manifest entry of a delta from the ``data`` table.

    Args:
        delta_like: The delta, or its locator, that owns the manifest.
        entry_index: Index of the manifest entry to read.
        table_type: Requested table representation.
        columns: Optional subset of columns to read.
        file_reader_kwargs_provider: Accepted for interface compatibility;
            not used by this implementation.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        A ``pyarrow.parquet.ParquetFile`` for ``TableType.PYARROW_PARQUET``,
        a pandas DataFrame for ``TableType.PANDAS``, and a ``pyarrow.Table``
        otherwise.

    Raises:
        IndexError: If ``entry_index`` is past the end of the manifest.
        ValueError: If no data is stored under the entry's URI, or the entry's
            content type is neither Parquet nor unescaped TSV.
        AssertionError: If an unescaped-TSV entry is requested as
            ``TableType.PYARROW_PARQUET``.
        NotImplementedError: If ``table_type`` is ``TableType.NUMPY``.
    """
    # ``import pyarrow`` does not load these submodules on its own, so
    # ``pa.parquet`` / ``pa.csv`` only resolve if some other module imported
    # them first. Import them explicitly instead of relying on that.
    import pyarrow.csv as pa_csv
    import pyarrow.parquet as pa_parquet

    cur, _ = _get_sqlite3_cursor_con(kwargs)

    manifest = get_delta_manifest(delta_like, *args, **kwargs)
    entry_count = len(manifest.entries)
    if entry_index >= entry_count:
        raise IndexError(
            f"Manifest entry index {entry_index} does not exist. "
            f"Valid values: [0, {entry_count})"
        )

    entry = manifest.entries[entry_index]

    row = cur.execute("SELECT value FROM data WHERE uri = ?", (entry.uri,)).fetchone()
    if row is None:
        # NOTE(review): assumes delta_like exposes canonical_string(); confirm
        # that this also holds when a Delta (not a DeltaLocator) is passed.
        raise ValueError(
            f"Invalid value of delta locator: {delta_like.canonical_string()}"
        )

    serialized_data = row[0]
    content_type = entry.meta.content_type
    if content_type == ContentType.PARQUET:
        if table_type == TableType.PYARROW_PARQUET:
            table = pa_parquet.ParquetFile(io.BytesIO(serialized_data))
        else:
            table = pa_parquet.read_table(io.BytesIO(serialized_data), columns=columns)
    elif content_type == ContentType.UNESCAPED_TSV:
        assert (
            table_type != TableType.PYARROW_PARQUET
        ), f"uTSV table cannot be read as {table_type}"
        parse_options = pa_csv.ParseOptions(delimiter="\t")
        convert_options = pa_csv.ConvertOptions(
            null_values=[""], strings_can_be_null=True, include_columns=columns
        )
        table = pa_csv.read_csv(
            io.BytesIO(serialized_data),
            parse_options=parse_options,
            convert_options=convert_options,
        )
    else:
        raise ValueError(f"Content type: {entry.meta.content_type} not supported.")

    if table_type == TableType.NUMPY:
        raise NotImplementedError(f"Table type={table_type} not supported")
    if table_type == TableType.PANDAS:
        return table.to_pandas()

    # Every remaining table type receives the object read above unchanged.
    return table
|
420
|
+
|
421
|
+
|
422
|
+
def get_delta_manifest(
    delta_like: Union[Delta, DeltaLocator], *args, **kwargs
) -> Optional[Manifest]:
    """Load the manifest of the delta identified by ``delta_like``.

    The delta is re-read from storage through ``get_delta`` with its manifest
    included, addressed by the namespace, table name, stream position,
    partition values and table version exposed by ``delta_like``.

    Returns:
        The delta's manifest, or ``None`` when ``get_delta`` finds no delta.
    """
    # The named parameters are passed positionally on purpose: spelling them
    # as keywords ahead of ``*args`` makes any forwarded positional argument
    # collide with ``namespace`` and raise TypeError.
    delta = get_delta(
        delta_like.namespace,
        delta_like.table_name,
        delta_like.stream_position,
        delta_like.partition_values,
        delta_like.table_version,
        True,  # include_manifest
        *args,
        **kwargs,
    )
    if not delta:
        return None

    return delta.manifest
|
439
|
+
|
440
|
+
|
441
|
+
def create_namespace(
    namespace: str, permissions: Dict[str, Any], *args, **kwargs
) -> Namespace:
    """Create the catalog schema if needed and register ``namespace``.

    All catalog tables are created with ``CREATE TABLE IF NOT EXISTS`` before
    the namespace row is inserted. The insert is ``INSERT OR IGNORE``, so a
    row already stored under the same locator is left untouched.

    Args:
        namespace: Name of the namespace to create.
        permissions: Permissions recorded on the namespace.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        The ``Namespace`` built from the given arguments.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    namespace_locator = NamespaceLocator.of(namespace)
    created = Namespace.of(namespace_locator, permissions)
    row = (namespace_locator.canonical_string(), json.dumps(created))

    schema_statements = (
        CREATE_NAMESPACES_TABLE,
        CREATE_TABLES_TABLE,
        CREATE_TABLE_VERSIONS_TABLE,
        CREATE_STREAMS_TABLE,
        CREATE_PARTITIONS_TABLE,
        CREATE_DELTAS_TABLE,
        CREATE_DATA_TABLE,
    )
    for statement in schema_statements:
        cursor.execute(statement)

    cursor.execute("INSERT OR IGNORE INTO namespaces VALUES(?, ?)", row)
    connection.commit()
    return created
|
458
|
+
|
459
|
+
|
460
|
+
def update_namespace(
    namespace: str,
    permissions: Optional[Dict[str, Any]] = None,
    new_namespace: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Overwrite the stored value of ``namespace`` with new permissions.

    Args:
        namespace: Name of the namespace to update.
        permissions: Permissions to record on the namespace.
        new_namespace: Must be ``None``; renaming is not supported.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Raises:
        AssertionError: If ``new_namespace`` is not ``None``.
    """
    assert new_namespace is None, "namespace name cannot be changed"

    cursor, connection = _get_sqlite3_cursor_con(kwargs)
    namespace_locator = NamespaceLocator.of(namespace)
    replacement = Namespace.of(namespace_locator, permissions)

    cursor.execute(
        "UPDATE namespaces SET value = ? WHERE locator = ?",
        (json.dumps(replacement), namespace_locator.canonical_string()),
    )
    connection.commit()
|
474
|
+
|
475
|
+
|
476
|
+
def create_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    schema: Optional[Union[pa.Schema, str, bytes]] = None,
    schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
    partition_keys: Optional[List[Dict[str, Any]]] = None,
    primary_key_column_names: Optional[Set[str]] = None,
    sort_keys: Optional[List[SortKey]] = None,
    table_version_description: Optional[str] = None,
    table_version_properties: Optional[Dict[str, str]] = None,
    table_permissions: Optional[Dict[str, Any]] = None,
    table_description: Optional[str] = None,
    table_properties: Optional[Dict[str, str]] = None,
    supported_content_types: Optional[List[ContentType]] = None,
    *args,
    **kwargs,
) -> Stream:
    """Create a table version, its table if needed, and a committed stream.

    Rows are written with ``INSERT OR IGNORE``, so an already existing table,
    table version or stream row under the same locator is left untouched.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        table_version: Version to create. When omitted, the version after the
            latest existing one is used, or ``"1"`` for a new table. When
            given and a latest version exists, it must be exactly one greater
            than that version.
        schema: Schema recorded on the table version.
        schema_consistency: Accepted for interface compatibility; not used.
        partition_keys: Partition keys recorded on the table version and the
            stream.
        primary_key_column_names: Primary key columns of the table version.
        sort_keys: Sort keys of the table version.
        table_version_description: Description of the table version.
        table_version_properties: Extra table-version properties. The new
            stream id is added under ``STREAM_ID_PROPERTY``.
        table_permissions: Permissions recorded on the table.
        table_description: Description of the table.
        table_properties: Properties of the table.
        supported_content_types: Content types recorded on the table version.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        The committed ``Stream`` created for the table version.

    Raises:
        AssertionError: If ``table_version`` is given but does not directly
            follow the latest existing version.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
    if table_version is None:
        # Always produce a string so auto-assigned versions have the same type
        # as the initial "1" and as explicitly supplied versions.
        next_version = int(latest_version.table_version) + 1 if latest_version else 1
        table_version = str(next_version)
    elif latest_version and int(latest_version.table_version) + 1 != int(
        table_version
    ):
        raise AssertionError(
            f"Table version can only be incremented. Last version={latest_version.table_version}"
        )

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    table_obj = Table.of(
        table_locator, table_permissions, table_description, table_properties
    )
    table_version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    stream_id = str(uuid.uuid4())

    if table_version_properties is None:
        table_version_properties = {}

    # The stream id is stored on the table version as a property.
    properties = {**table_version_properties, STREAM_ID_PROPERTY: stream_id}
    table_version_obj = TableVersion.of(
        table_version_locator,
        schema=schema,
        partition_keys=partition_keys,
        primary_key_columns=primary_key_column_names,
        description=table_version_description,
        properties=properties,
        sort_keys=sort_keys,
        content_types=supported_content_types,
    )
    stream_locator = StreamLocator.of(
        table_version_obj.locator, stream_id=stream_id, storage_type=STORAGE_TYPE
    )
    result_stream = Stream.of(
        stream_locator, partition_keys=partition_keys, state=CommitState.COMMITTED
    )

    params = (
        table_locator.canonical_string(),
        table_locator.namespace_locator.canonical_string(),
        json.dumps(table_obj),
    )
    cur.execute("INSERT OR IGNORE INTO tables VALUES (?, ?, ?)", params)

    params = (
        table_version_locator.canonical_string(),
        table_locator.canonical_string(),
        json.dumps(table_version_obj),
    )
    cur.execute("INSERT OR IGNORE INTO table_versions VALUES (?, ?, ?)", params)

    params = (
        stream_locator.canonical_string(),
        table_version_locator.canonical_string(),
        json.dumps(result_stream),
    )
    cur.execute("INSERT OR IGNORE INTO streams VALUES (?, ?, ?)", params)
    con.commit()
    return result_stream
|
562
|
+
|
563
|
+
|
564
|
+
def update_table(
    namespace: str,
    table_name: str,
    permissions: Optional[Dict[str, Any]] = None,
    description: Optional[str] = None,
    properties: Optional[Dict[str, str]] = None,
    new_table_name: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Replace the stored row of a table with one built from the arguments.

    The existing row is deleted and a fresh one inserted under the same
    locator, so the new value consists solely of the given permissions,
    description and properties.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table to update.
        permissions: Permissions recorded on the table.
        description: Description recorded on the table.
        properties: Properties recorded on the table.
        new_table_name: Accepted for interface compatibility; not used.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    target = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    replacement = Table.of(target, permissions, description, properties)

    cursor.execute(
        "DELETE FROM tables WHERE locator = ?", (target.canonical_string(),)
    )

    new_row = (
        target.canonical_string(),
        target.namespace_locator.canonical_string(),
        json.dumps(replacement),
    )
    cursor.execute("INSERT INTO tables VALUES (?, ?, ?)", new_row)
    connection.commit()
|
587
|
+
|
588
|
+
|
589
|
+
def update_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Union[pa.Schema, str, bytes]] = None,
    schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
    description: Optional[str] = None,
    properties: Optional[Dict[str, str]] = None,
    *args,
    **kwargs,
) -> None:
    """Rewrite a stored table version with a new schema, description and properties.

    Partition keys, primary keys, sort keys and content types are carried over
    from the stored version. ``schema`` and ``description`` are taken from the
    arguments as given. Properties are merged, with values already stored
    taking precedence over the ones passed in.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        table_version: Version to update.
        lifecycle_state: Accepted for interface compatibility; not used.
        schema: Schema to record on the table version.
        schema_consistency: Accepted for interface compatibility; not used.
        description: Description to record on the table version.
        properties: Properties to merge into the stored ones.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Raises:
        AssertionError: If no table version is stored under the locator.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    table_version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    stored_row = cursor.execute(
        "SELECT * from table_versions WHERE locator = ?",
        (table_version_locator.canonical_string(),),
    ).fetchone()
    assert (
        stored_row is not None
    ), f"Table version not found with locator={table_version_locator.canonical_string()}"

    # Column 2 is the JSON-serialized table version.
    stored_version = TableVersion(json.loads(stored_row[2]))

    requested_props = {} if properties is None else properties
    stored_props = stored_version.properties or {}

    # Stored properties are applied last, so they win on key conflicts.
    merged_props = dict(requested_props)
    merged_props.update(stored_props)

    updated_version = TableVersion.of(
        table_version_locator,
        schema=schema,
        partition_keys=stored_version.partition_keys,
        primary_key_columns=stored_version.primary_keys,
        description=description,
        properties=merged_props,
        sort_keys=stored_version.sort_keys,
        content_types=stored_version.content_types,
    )

    cursor.execute(
        "UPDATE table_versions SET table_locator = ?, value = ? WHERE locator = ?",
        (
            table_locator.canonical_string(),
            json.dumps(updated_version),
            table_version_locator.canonical_string(),
        ),
    )
    connection.commit()
|
648
|
+
|
649
|
+
|
650
|
+
def stage_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Stream:
    """Stage a new stream for a table version.

    The new stream gets a fresh random id, copies the partition keys of the
    stream returned by ``get_stream``, is stored in state
    ``CommitState.STAGED``, and records that stream's canonical locator
    string as its previous stream digest.

    Args:
        namespace: Namespace that owns the table.
        table_name: Name of the table.
        table_version: Table version to stage the stream for.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        The newly staged ``Stream``.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    parent_version = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )
    current_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)

    staged_locator = StreamLocator.of(
        parent_version.locator, uuid.uuid4().__str__(), STORAGE_TYPE
    )
    staged_stream = Stream.of(
        staged_locator,
        current_stream.partition_keys,
        CommitState.STAGED,
        current_stream.locator.canonical_string(),
    )

    row = (
        staged_locator.canonical_string(),
        parent_version.locator.canonical_string(),
        json.dumps(staged_stream),
    )
    cursor.execute("INSERT INTO streams VALUES (?, ?, ?)", row)
    connection.commit()

    return staged_stream
|
684
|
+
|
685
|
+
|
686
|
+
def commit_stream(stream: Stream, *args, **kwargs) -> Stream:
    """Commit ``stream`` and record its id on the owning table version.

    A copy of the stream in state ``CommitState.COMMITTED`` is written to the
    ``streams`` table, and the table version's ``STREAM_ID_PROPERTY`` property
    is set to the stream's id.

    Args:
        stream: The stream to commit.
        **kwargs: Must carry the SQLite connection details understood by
            ``_get_sqlite3_cursor_con``.

    Returns:
        The committed ``Stream``.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    parent_version = get_table_version(
        stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
    )
    committed = Stream.of(
        stream.locator,
        stream.partition_keys,
        CommitState.COMMITTED,
        stream.previous_stream_digest,
    )

    parent_version.properties[STREAM_ID_PROPERTY] = committed.locator.stream_id

    version_row = (
        json.dumps(parent_version),
        parent_version.locator.canonical_string(),
    )
    cursor.execute("UPDATE table_versions SET value = ? WHERE locator = ?", version_row)

    stream_row = (json.dumps(committed), committed.locator.canonical_string())
    cursor.execute("UPDATE streams SET value = ? WHERE locator = ?", stream_row)
    connection.commit()

    return committed
|
713
|
+
|
714
|
+
|
715
|
+
def delete_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Delete the streams of a table version, and the table version row too.

    All rows in ``streams`` whose ``table_version_locator`` matches the given
    table version are looked up and deleted by their ``locator``. The matching
    row in ``table_versions`` is then deleted as well, and the transaction is
    committed.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version of the table whose streams are removed.
        **kwargs: Used to obtain the sqlite3 cursor/connection.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    tv_locator_key = TableVersionLocator.of(
        TableLocator.of(NamespaceLocator.of(namespace), table_name), table_version
    ).canonical_string()

    # fetchall() yields one-element tuples, which is exactly the parameter
    # shape executemany() expects for a single "?" placeholder.
    stream_rows = cur.execute(
        "SELECT locator FROM streams WHERE table_version_locator = ?",
        (tv_locator_key,),
    ).fetchall()
    cur.executemany("DELETE FROM streams WHERE locator = ?", stream_rows)

    cur.execute(
        "DELETE FROM table_versions WHERE locator = ?",
        (tv_locator_key,),
    )

    con.commit()
|
740
|
+
|
741
|
+
|
742
|
+
def stage_partition(
    stream: Stream, partition_values: Optional[List[Any]] = None, *args, **kwargs
) -> Partition:
    """Create and persist a new partition in the ``STAGED`` state.

    The partition gets a fresh UUID as its partition id, takes its schema and
    content types from the stream's table version, and uses the current time
    in milliseconds as its stream position. If ``get_partition`` finds an
    existing partition for the same values, its stream position and partition
    id are recorded as the "previous" ones.

    Args:
        stream: Stream that will own the partition.
        partition_values: Values identifying the partition within the stream.
        *args: Forwarded to ``get_table_version`` and ``get_partition``.
        **kwargs: Forwarded likewise; also used to obtain the sqlite3
            cursor/connection.

    Returns:
        The staged ``Partition`` that was inserted into ``partitions``.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    new_locator = PartitionLocator.of(
        stream.locator,
        partition_values=partition_values,
        partition_id=str(uuid.uuid4()),
    )

    table_version = get_table_version(
        stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
    )
    predecessor = get_partition(
        stream.locator, partition_values=partition_values, *args, **kwargs
    )

    staged_at = current_time_ms()

    if predecessor:
        prev_position = predecessor.stream_position
        prev_partition_id = predecessor.partition_id
    else:
        prev_position = None
        prev_partition_id = None

    staged = Partition.of(
        new_locator,
        schema=table_version.schema,
        content_types=table_version.content_types,
        state=CommitState.STAGED,
        previous_stream_position=prev_position,
        previous_partition_id=prev_partition_id,
        stream_position=staged_at,
    )

    cur.execute(
        "INSERT INTO partitions VALUES (?, ?, ?)",
        (
            staged.locator.canonical_string(),
            staged.stream_locator.canonical_string(),
            json.dumps(staged),
        ),
    )
    con.commit()

    return staged
|
779
|
+
|
780
|
+
|
781
|
+
def commit_partition(partition: Partition, *args, **kwargs) -> Partition:
    """Commit ``partition``, deprecating the partition it replaces.

    If ``get_partition`` returns an existing partition for the same stream
    and partition values, that partition is marked ``DEPRECATED`` and
    rewritten. ``partition`` is then mutated in place: its state becomes
    ``COMMITTED``, its stream position becomes the highest stream position
    among its deltas (or stays unchanged when it has none), and its previous
    stream position is taken from the deprecated partition, if any.

    Args:
        partition: The partition to commit; modified in place.
        *args: Forwarded to ``get_partition`` and ``list_partition_deltas``.
        **kwargs: Forwarded likewise; also used to obtain the sqlite3
            cursor/connection.

    Returns:
        The same ``partition`` object after it has been updated and persisted.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    update_sql = "UPDATE partitions SET value = ? WHERE locator = ?"

    superseded = get_partition(
        partition.stream_locator,
        partition_values=partition.partition_values,
        *args,
        **kwargs,
    )

    # Deprecate the old partition before committing the new one.
    if superseded:
        superseded.state = CommitState.DEPRECATED
        cur.execute(
            update_sql,
            (json.dumps(superseded), superseded.locator.canonical_string()),
        )

    deltas = list_partition_deltas(partition, *args, **kwargs).all_items()
    deltas.sort(key=lambda delta: delta.stream_position, reverse=True)

    # The newest delta (if any) defines the partition's stream position.
    latest_position = (
        deltas[0].stream_position if deltas else partition.stream_position
    )

    partition.state = CommitState.COMMITTED
    partition.stream_position = latest_position
    if superseded:
        partition.previous_stream_position = superseded.stream_position
    else:
        partition.previous_stream_position = None

    cur.execute(
        update_sql,
        (json.dumps(partition), partition.locator.canonical_string()),
    )
    con.commit()

    return partition
|
813
|
+
|
814
|
+
|
815
|
+
def delete_partition(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    partition_values: Optional[List[Any]] = None,
    *args,
    **kwargs,
) -> None:
    """Soft-delete a partition by marking it ``DEPRECATED``.

    The partition is resolved through ``get_stream`` and ``get_partition``;
    its row in ``partitions`` is rewritten with the deprecated state rather
    than removed.

    NOTE(review): both lookups can return ``None``, in which case the
    attribute access below raises ``AttributeError``.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version of the table.
        partition_values: Values identifying the partition.
        *args: Forwarded to ``get_stream`` and ``get_partition``.
        **kwargs: Forwarded likewise; also used to obtain the sqlite3
            cursor/connection.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    owning_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    target = get_partition(owning_stream.locator, partition_values, *args, **kwargs)

    target.state = CommitState.DEPRECATED

    cur.execute(
        "UPDATE partitions SET value = ? WHERE locator = ?",
        (json.dumps(target), target.locator.canonical_string()),
    )
    con.commit()
|
832
|
+
|
833
|
+
|
834
|
+
def get_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[List[Any]] = None,
    *args,
    **kwargs,
) -> Optional[Partition]:
    """Return the committed partition of a stream with the given values.

    All rows of ``partitions`` belonging to ``stream_locator`` are loaded and
    the first one that is in the ``COMMITTED`` state and whose partition
    values equal ``partition_values`` is returned.

    Partition values are compared element-wise as lists. A comma-joined
    string comparison would raise ``TypeError`` for non-string values (the
    parameter is ``List[Any]``) and would treat ``["a,b"]`` and ``["a", "b"]``
    as the same partition.

    Args:
        stream_locator: Locator of the stream to search.
        partition_values: Values identifying the partition; ``None`` is
            treated as an empty list (unpartitioned).
        **kwargs: Used to obtain the sqlite3 cursor/connection.

    Returns:
        The matching committed ``Partition``, or ``None`` if there is none.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    rows = cur.execute(
        "SELECT * FROM partitions WHERE stream_locator = ?",
        (stream_locator.canonical_string(),),
    ).fetchall()

    if not rows:
        return None

    # Normalise to a list so tuples passed by callers still compare equal to
    # the JSON-decoded (list) values stored in the database.
    wanted_values = list(partition_values) if partition_values else []

    for row in rows:
        candidate = Partition(json.loads(row[2]))
        candidate_values = list(candidate.partition_values or [])

        if (
            candidate_values == wanted_values
            and candidate.state == CommitState.COMMITTED
        ):
            return candidate

    return None
|
865
|
+
|
866
|
+
|
867
|
+
def stage_delta(
    data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
    partition: Partition,
    delta_type: DeltaType = DeltaType.UPSERT,
    max_records_per_entry: Optional[int] = None,
    author: Optional[ManifestAuthor] = None,
    properties: Optional[Dict[str, str]] = None,
    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
    content_type: ContentType = ContentType.PARQUET,
    *args,
    **kwargs,
) -> Delta:
    """Serialize ``data`` and stage it as a single-entry delta.

    ``data`` is written to an in-memory buffer as Parquet or unescaped TSV
    (tab-delimited, header included, no quoting) and stored in the ``data``
    table under a freshly generated manifest entry URI. A ``Delta`` holding a
    one-entry manifest is then inserted into ``deltas`` with the literal
    ``"staged_delta"`` in the second column. The delta's stream position is
    the current time in milliseconds.

    ``max_records_per_entry`` and ``s3_table_writer_kwargs`` are accepted but
    not used by this implementation. ``data`` must support ``len()`` and
    expose ``nbytes``.

    Args:
        data: The table to stage.
        partition: Partition that the delta belongs to.
        delta_type: Type recorded on the delta.
        author: Author recorded on the manifest.
        properties: Properties recorded on the delta.
        content_type: ``ContentType.PARQUET`` or ``ContentType.UNESCAPED_TSV``.
        **kwargs: Used to obtain the sqlite3 cursor/connection.

    Returns:
        The staged ``Delta``.

    Raises:
        ValueError: If ``content_type`` is not one of the supported types.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    entry_id = str(uuid.uuid4())
    uri = _get_manifest_entry_uri(entry_id)

    if content_type == ContentType.PARQUET:
        sink = io.BytesIO()
        pa.parquet.write_table(data, sink)
    elif content_type == ContentType.UNESCAPED_TSV:
        sink = io.BytesIO()
        tsv_options = pa.csv.WriteOptions(
            include_header=True, delimiter="\t", quoting_style="none"
        )
        pa.csv.write_csv(data, sink, write_options=tsv_options)
    else:
        raise ValueError(f"Unsupported content type: {content_type}")
    payload = sink.getvalue()

    delta_locator = DeltaLocator.of(
        partition.locator, stream_position=current_time_ms()
    )

    meta = ManifestMeta.of(
        len(data),
        len(payload),
        content_type=content_type,
        content_encoding=ContentEncoding.IDENTITY,
        source_content_length=data.nbytes,
    )

    # The manifest and its only entry share the same UUID.
    entry = ManifestEntry.of(
        uri=uri, url=uri, meta=meta, mandatory=True, uuid=entry_id
    )
    manifest = Manifest.of(
        entries=ManifestEntryList.of([entry]),
        author=author,
        uuid=entry_id,
    )

    delta = Delta.of(
        delta_locator,
        delta_type=delta_type,
        meta=meta,
        properties=properties,
        manifest=manifest,
        previous_stream_position=partition.stream_position,
    )

    cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, payload))
    cur.execute(
        "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)",
        (delta_locator.canonical_string(), "staged_delta", json.dumps(delta)),
    )

    con.commit()
    return delta
|
938
|
+
|
939
|
+
|
940
|
+
def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
    """Persist ``delta`` under its partition locator.

    If the delta has no (truthy) stream position, the current time in
    milliseconds is assigned to its locator first. The row is then inserted
    if absent and, in any case, updated so that its partition locator and
    serialized value reflect ``delta``.

    Args:
        delta: The delta to commit; its locator may be modified in place.
        **kwargs: Used to obtain the sqlite3 cursor/connection.

    Returns:
        The same ``delta`` object.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    if not delta.stream_position:
        delta.locator.stream_position = current_time_ms()

    locator_key = delta.locator.canonical_string()
    partition_key = delta.partition_locator.canonical_string()
    serialized = json.dumps(delta)

    # Insert covers deltas that were never staged; the update below covers
    # rows that already exist (the insert is ignored for those).
    cur.execute(
        "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)",
        (locator_key, partition_key, serialized),
    )
    cur.execute(
        "UPDATE deltas SET partition_locator = ?, value = ? WHERE locator = ?",
        (partition_key, serialized, locator_key),
    )

    con.commit()

    return delta
|
966
|
+
|
967
|
+
|
968
|
+
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Look up a namespace by name.

    Args:
        namespace: Name of the namespace.
        **kwargs: Used to obtain the sqlite3 cursor/connection.

    Returns:
        The ``Namespace`` decoded from the JSON stored in the second column
        of the matching ``namespaces`` row, or ``None`` if no row matches.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    locator_key = NamespaceLocator.of(namespace).canonical_string()

    row = cur.execute(
        "SELECT * FROM namespaces WHERE locator = ?", (locator_key,)
    ).fetchone()

    return None if row is None else Namespace(json.loads(row[1]))
|
981
|
+
|
982
|
+
|
983
|
+
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Return ``True`` if ``get_namespace`` finds the given namespace."""
    return get_namespace(namespace, *args, **kwargs) is not None
|
987
|
+
|
988
|
+
|
989
|
+
def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
    """Look up a table by namespace and name.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        **kwargs: Used to obtain the sqlite3 cursor/connection.

    Returns:
        The ``Table`` decoded from the JSON stored in the third column of the
        matching ``tables`` row, or ``None`` if no row matches.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    locator_key = TableLocator.of(
        NamespaceLocator.of(namespace), table_name
    ).canonical_string()

    row = cur.execute(
        "SELECT * FROM tables WHERE locator = ?", (locator_key,)
    ).fetchone()

    return None if row is None else Table(json.loads(row[2]))
|
1002
|
+
|
1003
|
+
|
1004
|
+
def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
    """Return ``True`` if ``get_table`` finds the given table."""
    return get_table(namespace, table_name, *args, **kwargs) is not None
|
1008
|
+
|
1009
|
+
|
1010
|
+
def get_table_version(
    namespace: str, table_name: str, table_version: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Look up a specific version of a table.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version identifier of the table.
        **kwargs: Used to obtain the sqlite3 cursor/connection.

    Returns:
        The ``TableVersion`` decoded from the JSON stored in the third column
        of the matching ``table_versions`` row, or ``None`` if no row matches.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    locator_key = TableVersionLocator.of(
        TableLocator.of(NamespaceLocator.of(namespace), table_name), table_version
    ).canonical_string()

    row = cur.execute(
        "SELECT * FROM table_versions WHERE locator = ?", (locator_key,)
    ).fetchone()

    return None if row is None else TableVersion(json.loads(row[2]))
|
1027
|
+
|
1028
|
+
|
1029
|
+
def get_latest_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Return the table version with the numerically highest version.

    Version identifiers are compared after conversion with ``int()``, so a
    non-numeric ``table_version`` raises ``ValueError``.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        *args: Forwarded to ``list_table_versions``.
        **kwargs: Forwarded to ``list_table_versions``.

    Returns:
        The latest ``TableVersion``, or ``None`` when the table has none.
    """
    candidates = list_table_versions(
        namespace, table_name, *args, **kwargs
    ).all_items()

    if candidates:
        return max(candidates, key=lambda version: int(version.table_version))
    return None
|
1040
|
+
|
1041
|
+
|
1042
|
+
def get_latest_active_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Return the latest table version.

    This module does not support table version lifecycle state, so the call
    simply delegates to ``get_latest_table_version``.
    """
    latest = get_latest_table_version(namespace, table_name, *args, **kwargs)
    return latest
|
1048
|
+
|
1049
|
+
|
1050
|
+
def get_table_version_schema(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Union[pa.Schema, str, bytes]]:
    """Return the schema of a table version.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version identifier of the table.
        *args: Forwarded to ``get_table_version``.
        **kwargs: Forwarded to ``get_table_version``.

    Returns:
        The ``schema`` attribute of the table version, or ``None`` when the
        table version does not exist.
    """
    table_version_obj = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )

    # get_table_version returns None for an unknown version; honour the
    # Optional return type instead of failing with AttributeError.
    if table_version_obj is None:
        return None

    return table_version_obj.schema
|
1060
|
+
|
1061
|
+
|
1062
|
+
def table_version_exists(
    namespace: str, table_name: str, table_version: str, *args, **kwargs
) -> bool:
    """Return ``True`` if ``get_table_version`` finds the given version."""
    found = get_table_version(namespace, table_name, table_version, *args, **kwargs)
    return found is not None
|
1068
|
+
|
1069
|
+
|
1070
|
+
def get_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Stream]:
    """Return the stream currently registered on a table version.

    The stream id is read from the table version's properties under
    ``STREAM_ID_PROPERTY`` and combined with ``STORAGE_TYPE`` to build the
    stream locator used for the lookup.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version identifier of the table; must not be an int.
        *args: Forwarded to ``get_table_version``.
        **kwargs: Forwarded to ``get_table_version``; also used to obtain the
            sqlite3 cursor/connection.

    Returns:
        The ``Stream`` decoded from the third column of the matching
        ``streams`` row. ``None`` if the table version does not exist, has no
        stream id property, or no stream row matches.
    """
    assert not isinstance(table_version, int), f"Passed an integer as the table version"

    owner = get_table_version(namespace, table_name, table_version, *args, **kwargs)
    if owner is None:
        return None

    stream_id = owner.properties.get(STREAM_ID_PROPERTY)
    if stream_id is None:
        return None

    cur, con = _get_sqlite3_cursor_con(kwargs)
    locator_key = StreamLocator.of(
        owner.locator, stream_id=stream_id, storage_type=STORAGE_TYPE
    ).canonical_string()

    row = cur.execute(
        "SELECT * FROM streams WHERE locator = ?", (locator_key,)
    ).fetchone()

    return None if row is None else Stream(json.loads(row[2]))
|
1100
|
+
|
1101
|
+
|
1102
|
+
def get_table_version_column_names(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[List[str]]:
    """Not supported by this storage implementation.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Fetching column names is not supported")
|