deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/utils/url.py
ADDED
@@ -0,0 +1,1284 @@
import functools
import json
from typing import Callable, List, Tuple, Any, Union, Optional
from urllib.parse import urlparse, urlunparse, parse_qs

import ray
import daft

import pandas as pd
import numpy as np
import pyarrow as pa
import polars as pl
import deltacat as dc

import pyarrow.csv as pacsv
import pyarrow.json as pajson

from deltacat.catalog import CatalogProperties
from deltacat.constants import DEFAULT_NAMESPACE
from deltacat.types.media import (
    DatasetType,
    DatastoreType,
)
from deltacat.utils import pyarrow as pa_utils

from deltacat.storage import (
    metastore,
    Dataset,
    Delta,
    DeltaLocator,
    ListResult,
    Metafile,
    Namespace,
    NamespaceLocator,
    Partition,
    Stream,
    StreamFormat,
    StreamLocator,
    PartitionLocator,
    Table,
    TableLocator,
    TableVersion,
    TableVersionLocator,
)

RAY_DATASTORE_TYPE_TO_READER = {
    DatastoreType.AUDIO: lambda url: functools.partial(
        ray.data.read_audio,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.AVRO: lambda url: functools.partial(
        ray.data.read_avro,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.BIGQUERY: lambda url: functools.partial(
        ray.data.read_bigquery,
        project_id=url.parsed.netloc,
        dataset=url.path_elements[0] if url.path_elements else None,
        **url.query_params,
    ),
    DatastoreType.BINARY: lambda url: functools.partial(
        ray.data.read_binary_files,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.CSV: lambda url: functools.partial(
        ray.data.read_csv,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.CLICKHOUSE: lambda url: functools.partial(
        ray.data.read_clickhouse,
        table=url.parsed.query,
        dsn=url.url,
        **url.query_params,
    ),
    DatastoreType.DATABRICKS_TABLES: lambda url: functools.partial(
        ray.data.read_databricks_tables,
        warehouse_id=url.parsed.netloc,
        **url.query_params,
    ),
    DatastoreType.DELTA_SHARING: lambda url: functools.partial(
        ray.data.read_delta_sharing_tables,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.HUDI: lambda url: functools.partial(
        ray.data.read_hudi,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.ICEBERG: lambda url: functools.partial(
        ray.data.read_iceberg,
        table_identifier=url.parsed.netloc,
        **url.query_params,
    ),
    DatastoreType.IMAGES: lambda url: functools.partial(
        ray.data.read_images,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        ray.data.read_json,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.LANCE: lambda url: functools.partial(
        ray.data.read_lance,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.MONGO: lambda url: functools.partial(
        ray.data.read_mongo,
        url.url,
        **url.query_params,
    ),
    DatastoreType.NUMPY: lambda url: functools.partial(
        ray.data.read_numpy,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        ray.data.read_parquet,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        ray.data.read_text,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TFRECORDS: lambda url: functools.partial(
        ray.data.read_tfrecords,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.VIDEOS: lambda url: functools.partial(
        ray.data.read_videos,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.WEBDATASET: lambda url: functools.partial(
        ray.data.read_webdataset,
        url.url_path,
        **url.query_params,
    ),
}

RAY_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.BIGQUERY: lambda url: functools.partial(
        ray.data.Dataset.write_bigquery,
        project_id=url.parsed.netloc,
        dataset=url.path_elements[0] if url.path_elements else None,
        **url.query_params,
    ),
    DatastoreType.CSV: lambda url: functools.partial(
        ray.data.write_csv,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.ICEBERG: lambda url: functools.partial(
        ray.data.Dataset.write_iceberg,
        table_identifier=url.parsed.netloc,
        **url.query_params,
    ),
    DatastoreType.IMAGES: lambda url: functools.partial(
        ray.data.Dataset.write_images,
        path=url.url_path,
        column=url.query_params.pop("column", "image") if url.query_params else "image",
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        ray.data.Dataset.write_json,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.LANCE: lambda url: functools.partial(
        ray.data.Dataset.write_lance,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.MONGO: lambda url: functools.partial(
        ray.data.Dataset.write_mongo,
        url.url,
        **url.query_params,
    ),
    DatastoreType.NUMPY: lambda url: functools.partial(
        ray.data.Dataset.write_numpy,
        path=url.url_path,
        column=url.query_params.pop("column", "data") if url.query_params else "data",
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        ray.data.Dataset.write_parquet,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TFRECORDS: lambda url: functools.partial(
        ray.data.Dataset.write_tfrecords,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.WEBDATASET: lambda url: functools.partial(
        ray.data.Dataset.write_webdataset,
        url.url_path,
        **url.query_params,
    ),
}

DAFT_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: lambda url: functools.partial(
        daft.io.read_csv,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.DELTA_LAKE: lambda url: functools.partial(
        daft.io.read_deltalake,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.HUDI: lambda url: functools.partial(
        daft.io.read_hudi,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.ICEBERG: lambda url: functools.partial(
        daft.io.read_iceberg,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        daft.io.read_json,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        daft.io.read_parquet,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.WARC: lambda url: functools.partial(
        daft.io.read_warc,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        daft.io.read_csv,
        url.url_path,
        infer_schema=False,
        schema={"text": daft.DataType.string()},
        has_headers=False,
        delimiter=chr(25),  # end of medium char
        double_quote=False,
        comment=None,
    ),
}

DAFT_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: lambda url: functools.partial(
        daft.DataFrame.write_csv,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.DELTA_LAKE: lambda url: functools.partial(
        daft.DataFrame.write_deltalake,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.ICEBERG: lambda url: functools.partial(
        daft.DataFrame.write_iceberg,
        **url.query_params,
    ),
    DatastoreType.LANCE: lambda url: functools.partial(
        daft.DataFrame.write_lance,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        daft.DataFrame.write_parquet,
        url.url_path,
        **url.query_params,
    ),
}

PYARROW_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: lambda url: functools.partial(
        pa_utils.read_csv,
        url.url_path,
        read_options=pacsv.ReadOptions(use_threads=False),
        **url.query_params,
    ),
    DatastoreType.FEATHER: lambda url: functools.partial(
        pa_utils.read_feather,
        url.url_path,
        use_threads=False,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        pa_utils.read_json,
        url.url_path,
        pajson.ReadOptions(use_threads=False),
        **url.query_params,
    ),
    DatastoreType.ORC: lambda url: functools.partial(
        pa_utils.read_orc,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        pa_utils.read_parquet,
        url.url_path,
        use_threads=False,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        pa_utils.read_csv,
        url.url_path,
        read_options=pacsv.ReadOptions(
            use_threads=False,
            column_names=["text"],
        ),
        parse_options=pacsv.ParseOptions(
            delimiter=chr(25),  # end of medium char
            quote_char=False,
            double_quote=False,
        ),
        convert_options=pacsv.ConvertOptions(
            check_utf8=False,
            column_types={"text": pa.string()},
        ),
    ),
}

PYARROW_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: lambda url: functools.partial(
        pa_utils.write_csv,
        path=url.url_path,
        **url.query_params,
    ),
    DatastoreType.FEATHER: lambda url: functools.partial(
        pa_utils.write_feather,
        path=url.url_path,
        **url.query_params,
    ),
    DatastoreType.ORC: lambda url: functools.partial(
        pa_utils.write_orc,
        path=url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        pa_utils.write_parquet,
        path=url.url_path,
        **url.query_params,
    ),
}

POLARS_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: lambda url: functools.partial(
        pl.read_csv,
        url.url_path,
        n_threads=1,
        **url.query_params,
    ),
    DatastoreType.DELTA_LAKE: lambda url: functools.partial(
        pl.read_delta,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.ICEBERG: lambda url: functools.partial(
        pl.scan_iceberg,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        pl.read_json,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        pl.read_parquet,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        pl.read_csv,
        url.url_path,
        new_columns=["text"],
        n_threads=1,
        separator=chr(25),  # end of medium char
        has_header=False,
        quote_char=None,
        infer_schema=False,
    ),
}

POLARS_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.AVRO: lambda url: functools.partial(
        pl.DataFrame.write_avro,
        file=url.url_path,
        **url.query_params,
    ),
    DatastoreType.CSV: lambda url: functools.partial(
        pl.DataFrame.write_csv,
        file=url.url_path,
        **url.query_params,
    ),
    DatastoreType.DELTA_LAKE: lambda url: functools.partial(
        pl.DataFrame.write_delta,
        target=url.url_path,
        **url.query_params,
    ),
    DatastoreType.ICEBERG: lambda url: functools.partial(
        pl.DataFrame.write_iceberg,
        target=url.url_path,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        pl.DataFrame.write_ndjson,
        file=url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        pl.DataFrame.write_parquet,
        file=url.url_path,
        **url.query_params,
    ),
}

PANDAS_DATASTORE_TYPE_TO_READER = {
    DatastoreType.CSV: lambda url: functools.partial(
        pd.read_csv,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.FEATHER: lambda url: functools.partial(
        pd.read_feather,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.HDF: lambda url: functools.partial(
        pd.read_hdf,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.HTML: lambda url: functools.partial(
        pd.read_html,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        pd.read_json,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.ORC: lambda url: functools.partial(
        pd.read_orc,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        pd.read_parquet,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.XML: lambda url: functools.partial(
        pd.read_xml,
        url.url_path,
        **url.query_params,
    ),
}

PANDAS_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: lambda url: functools.partial(
        pd.DataFrame.to_csv,
        path_or_buf=url.url_path,
        **url.query_params,
    ),
    DatastoreType.FEATHER: lambda url: functools.partial(
        pd.DataFrame.to_feather,
        path=url.url_path,
        **url.query_params,
    ),
    DatastoreType.HDF: lambda url: functools.partial(
        pd.DataFrame.to_hdf,
        path_or_buf=url.url_path,
        **url.query_params,
    ),
    DatastoreType.HTML: lambda url: functools.partial(
        pd.DataFrame.to_html,
        buf=url.url_path,
        **url.query_params,
    ),
    DatastoreType.JSON: lambda url: functools.partial(
        pd.DataFrame.to_json,
        path_or_buf=url.url_path,
        **url.query_params,
    ),
    DatastoreType.ORC: lambda url: functools.partial(
        pd.DataFrame.to_orc,
        path=url.url_path,
        **url.query_params,
    ),
    DatastoreType.PARQUET: lambda url: functools.partial(
        pd.DataFrame.to_parquet,
        path=url.url_path,
        **url.query_params,
    ),
    DatastoreType.XML: lambda url: functools.partial(
        pd.DataFrame.to_xml,
        path_or_buffer=url.url_path,
        **url.query_params,
    ),
}

NUMPY_DATASTORE_TYPE_TO_READER = {
    DatastoreType.BINARY: lambda url: functools.partial(
        np.fromfile,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.CSV: lambda url: functools.partial(
        np.genfromtxt,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.NUMPY: lambda url: functools.partial(
        np.load,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        np.loadtxt,
        url.url_path,
        **url.query_params,
    ),
}

NUMPY_DATASTORE_TYPE_TO_WRITER = {
    DatastoreType.CSV: lambda url: functools.partial(
        np.savetxt,
        url.url_path,
        delimiter=",",
        **url.query_params,
    ),
    DatastoreType.NUMPY: lambda url: functools.partial(
        np.savez_compressed,
        url.url_path,
        **url.query_params,
    ),
    DatastoreType.TEXT: lambda url: functools.partial(
        np.savetxt,
        url.url_path,
        **url.query_params,
    ),
}

DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER = {
    DatasetType.RAY_DATASET: RAY_DATASTORE_TYPE_TO_READER,
    DatasetType.DAFT: DAFT_DATASTORE_TYPE_TO_READER,
    DatasetType.PANDAS: PANDAS_DATASTORE_TYPE_TO_READER,
    DatasetType.POLARS: POLARS_DATASTORE_TYPE_TO_READER,
    DatasetType.PYARROW: PYARROW_DATASTORE_TYPE_TO_READER,
    DatasetType.NUMPY: NUMPY_DATASTORE_TYPE_TO_READER,
}

DATASET_TYPE_TO_DATASTORE_TYPE_WRITER_RESOLVER = {
    DatasetType.RAY_DATASET: RAY_DATASTORE_TYPE_TO_WRITER,
    DatasetType.DAFT: DAFT_DATASTORE_TYPE_TO_WRITER,
    DatasetType.PANDAS: PANDAS_DATASTORE_TYPE_TO_WRITER,
    DatasetType.POLARS: POLARS_DATASTORE_TYPE_TO_WRITER,
    DatasetType.PYARROW: PYARROW_DATASTORE_TYPE_TO_WRITER,
    DatasetType.NUMPY: NUMPY_DATASTORE_TYPE_TO_WRITER,
}


class DeltaCatUrl:
    """
    Class for parsing DeltaCAT URLs, which are used to unambiguously locate
    any internal object(s) already registered in a DeltaCAT catalog, or external
    object(s) that could be registered in a DeltaCAT catalog.

    Valid DeltaCAT URLs that reference internal catalog objects registered in a
    DeltaCAT catalog include:

    dc://<catalog>/[namespace]/[table]/[tableversion]/[stream]/[partition]/[delta]
    namespace://<namespace>/[table]/[tableversion]/[stream]/[partition]/[delta]
    table://<table>/[tableversion]/[stream]/[partition]/[delta]

    Where <arg> is a required part of the URL and [arg] is an optional part of
    the URL.

    Valid DeltaCAT URLs that reference external objects include most types
    readable into any supported DeltaCAT dataset type (e.g., Ray Data, Daft,
    PyArrow, Pandas, Numpy). External object URLs take the form
    <DatastoreType>+<URL> or, to be more explicit,
    <DatastoreType>+<scheme>://<path> where `DatastoreType` is any value
    from :class:`deltacat.types.media.DatastoreType`

    To reference a file on local disk, replace <scheme>:// with "file" or
    "local". To read an absolute local file path, use "file:///" or
    "local:///". To read a local file path relative to the current working
    directory, use "local://".

    audio+<scheme>://<path>?param1=val1&param2=val2&...
    avro+<scheme>://<path>?param1=val1&param2=val2&...
    binary+<scheme>://<path>?param1=val1&param2=val2&...
    csv+<scheme>://<path>?param1=val1&param2=val2&...
    deltasharing+<scheme>://<path>?param1=val1&param2=val2&...
    hudi+<scheme>://<path>?param1=val1&param2=val2&...
    images+<scheme>://<path>?param1=val1&param2=val2&...
    json+<scheme>://<path>?param1=val1&param2=val2&...
    lance+<scheme>://<path>?param1=val1&param2=val2&...
    numpy+<scheme>://<path>?param1=val1&param2=val2&...
    parquet+<scheme>://<path>?param1=val1&param2=val2&...
    text+<scheme>://<path>?param1=val1&param2=val2&...
    tfrecords+<scheme>://<path>?param1=val1&param2=val2&...
    videos+<scheme>://<path>?param1=val1&param2=val2&...
    webdataset+<scheme>://<path>?param1=val1&param2=val2&...

    Some DeltaCAT URLs reference special types of external objects
    locatable via custom URLs that don't conform to the usual
    <DatastoreType>+<URL> convention shown above, like:

    <mongodb_uri>?database=<db_name>&collection=<collection_name>&...
    bigquery://<project_id>/<dataset>?param1=val1&...
    <clickhouse_dsn>?table=<table_name>?param1=val1&...
    databricks://<warehouse_id>?param1=val1&...
    iceberg://<table_identifier>?param1=val1&...

    Note that, for reads, each of the above URLs typically resolves directly
    to the equivalent :class:`deltacat.types.media.DatasetType` reader. For
    example, if Ray Data is the dataset type then the equivalent
    ray.data.read_{} API is used. For example, a read referencing a URL of the
    form "audio+file:///my/audio.mp4" would resolve to a call to
    ray.data.read_audio("/my/audio.mp4").
    """

    # Auto-resolved DeltaCAT catalog path default identifiers
    DELTACAT_URL_DEFAULT_CATALOG = "default"
    DELTACAT_URL_DEFAULT_NAMESPACE = "default"
    DELTACAT_URL_DEFAULT_TABLE_VERSION = "default"
    DELTACAT_URL_DEFAULT_STREAM = "default"

    def __init__(
        self,
        url: str,
    ):
        # TODO(pdames): Handle wildcard `*` and `**` at end of url.
        self.catalog_name = None
        self.parsed = urlparse(url, allow_fragments=False)  # support '#' in path
        self.url = self.parsed.geturl()
        path = self.parsed.path
        # Remove leading/trailing slashes and split the path into elements
        self.path_elements = [
            element for element in path.strip("/").split("/") if path and element
        ]
        # Split the scheme into the root DeltaCAT scheme and the path scheme
        self.scheme_elements = self.parsed.scheme.split("+")
        self.datastore_type = DatastoreType(self.scheme_elements[0])
        if len(self.scheme_elements) == 2:
            # Remove the source/sink type from the scheme.
            self.parsed = self.parsed._replace(scheme=self.scheme_elements[1])
            # Save the URL path to read/write w/o the source/sink type.
            self.url_path = urlunparse(self.parsed)
        elif len(self.scheme_elements) > 2:
            raise ValueError(f"Invalid DeltaCAT URL: {url}")
        self.query_params = parse_qs(self.parsed.query) if self.parsed.query else {}
        if self.datastore_type == DatastoreType.DELTACAT:
            self.catalog_name = self.parsed.netloc
            self.unresolved_namespace = (
                self.path_elements[0] if self.path_elements else None
            )
            self.table = self.path_elements[1] if len(self.path_elements) > 1 else None
            self.unresolved_table_version = (
                self.path_elements[2] if len(self.path_elements) > 2 else None
            )
            self.unresolved_stream = (
                self.path_elements[3] if len(self.path_elements) > 3 else None
            )
            self.partition = (
                self.path_elements[4] if len(self.path_elements) > 4 else None
            )
            self.delta = self.path_elements[5] if len(self.path_elements) > 5 else None
            self._resolve_deltacat_path_identifiers()
        elif self.datastore_type == DatastoreType.DELTACAT_NAMESPACE:
            self.catalog_name = DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG
            self.unresolved_namespace = self.parsed.netloc
            self.table = self.path_elements[0] if self.path_elements else None
            self.unresolved_table_version = (
                self.path_elements[1] if len(self.path_elements) > 1 else None
            )
            self.unresolved_stream = (
                self.path_elements[2] if len(self.path_elements) > 2 else None
            )
            self.partition = (
                self.path_elements[3] if len(self.path_elements) > 3 else None
            )
            self.delta = self.path_elements[4] if len(self.path_elements) > 4 else None
            self._resolve_deltacat_path_identifiers()
        elif self.datastore_type == DatastoreType.DELTACAT_TABLE:
            self.catalog_name = DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG
            self.unresolved_namespace = DeltaCatUrl.DELTACAT_URL_DEFAULT_NAMESPACE
            self.table = self.parsed.netloc
            self.unresolved_table_version = (
                self.path_elements[0] if self.path_elements else None
            )
            self.unresolved_stream = (
                self.path_elements[1] if len(self.path_elements) > 1 else None
            )
            self.partition = (
                self.path_elements[2] if len(self.path_elements) > 2 else None
            )
            self.delta = self.path_elements[3] if len(self.path_elements) > 3 else None
            self._resolve_deltacat_path_identifiers()

    def is_deltacat_catalog_url(self):
        return bool(self.catalog_name)

    def resolve_catalog(self):
        if self.catalog_name:
            if self.catalog_name.lower() == DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG:
                self.catalog = None
            self.catalog: CatalogProperties = dc.get_catalog(self.catalog_name).inner
            if not isinstance(self.catalog, CatalogProperties):
                raise ValueError(
                    f"Expected catalog `{self.catalog_name}` to be a DeltaCAT "
                    f"catalog but found: {self.catalog}"
                )

    def _resolve_deltacat_path_identifiers(self):
        dc.raise_if_not_initialized()
        self.namespace = self.table_version = self.stream = None
        if self.unresolved_namespace:
            if (
                self.unresolved_namespace.lower()
                == DeltaCatUrl.DELTACAT_URL_DEFAULT_NAMESPACE
            ):
                self.namespace = DEFAULT_NAMESPACE
            else:
                self.namespace = self.unresolved_namespace
        if (
            self.unresolved_table_version
            and self.unresolved_table_version.lower()
            != DeltaCatUrl.DELTACAT_URL_DEFAULT_TABLE_VERSION
        ):
            self.table_version = self.unresolved_table_version
        if self.unresolved_stream:
            if (
                self.unresolved_stream.lower()
                == DeltaCatUrl.DELTACAT_URL_DEFAULT_STREAM
            ):
                self.stream = StreamFormat.DELTACAT
            else:
                self.stream = StreamFormat(self.stream)

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.url


class DeltaCatUrlReader:
    def __init__(
        self,
        url: DeltaCatUrl,
        dataset_type: DatasetType = DatasetType.RAY_DATASET,
    ):
        self._url = url
        if url.is_deltacat_catalog_url():
            url.resolve_catalog()
            self._reader = DeltaCatUrlReader.resolve_dc_reader(url)
            self._listers = DeltaCatUrlReader.resolve_dc_listers(url)
        else:
            self._reader = DeltaCatUrlReader.dataset_and_datastore_type_to_reader(
                dataset_type,
                url.datastore_type,
            )

    @property
    def url(self) -> DeltaCatUrl:
        return self._url

    @property
    def listers(
        self,
    ) -> List[
        Tuple[
            Callable[[Any], ListResult[Metafile]],
            str,
            Callable[[Metafile], Union[Metafile, str]],
        ]
    ]:
        return self._listers

    def read(self, *args, **kwargs) -> Dataset:
        if self._url.is_deltacat_catalog_url():
            return self._reader(*args, **kwargs)
        else:
            return self._reader(self._url)(*args, **kwargs)

    @staticmethod
    def resolve_dc_reader(url: DeltaCatUrl) -> Callable:
        if url.delta:
            return functools.partial(
                metastore.get_delta,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                partition_values=json.loads(url.partition),
                stream_position=url.delta,
                catalog=url.catalog,
            )
        if url.partition:
            return functools.partial(
                metastore.get_partition,
                stream_locator=StreamLocator.at(
                    namespace=url.namespace,
                    table_name=url.table,
                    table_version=url.table_version,
                    stream_id=None,
                    stream_format=url.stream,
                ),
                partition_values=json.loads(url.partition),
                catalog=url.catalog,
            )
        if url.unresolved_stream:
            return functools.partial(
                metastore.get_stream,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_format=url.stream,
                catalog=url.catalog,
            )
        if url.unresolved_table_version:
            return functools.partial(
                metastore.get_table_version,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                catalog=url.catalog,
            )
        if url.table:
            return functools.partial(
                metastore.get_table,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
        if url.unresolved_namespace:
            return functools.partial(
                metastore.get_namespace,
                namespace=url.namespace,
                catalog=url.catalog,
            )
        if url.catalog_name:
            return functools.partial(
                dc.get_catalog,
                name=url.catalog_name,
            )
        raise ValueError("No DeltaCAT object to read.")

    @staticmethod
    def resolve_dc_listers(
        url: DeltaCatUrl,
    ) -> List[
        Tuple[
            Callable[[Any], ListResult[Metafile]],
            Optional[str],
            Optional[Callable[[Metafile], Union[Metafile, str]]],
        ]
    ]:
        if url.partition:
            partition_locator = PartitionLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
                partition_values=json.loads(url.partition),
                partition_id=None,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                partition_like=partition_locator,
                catalog=url.catalog,
            )
            return [(delta_lister, None, None)]
        if url.unresolved_stream:
            stream_locator = StreamLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
            )
            stream = Stream.of(
                locator=stream_locator,
                partition_scheme=None,
            )
            partition_lister = functools.partial(
                metastore.list_stream_partitions,
                stream=stream,
                catalog=url.catalog,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                catalog=url.catalog,
            )
            return [
                (partition_lister, None, None),
                (delta_lister, "partition_like", lambda x: x),
            ]
        if url.unresolved_table_version:
            stream_lister = functools.partial(
                metastore.list_streams,
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                catalog=url.catalog,
            )
            partition_lister = functools.partial(
                metastore.list_stream_partitions,
                catalog=url.catalog,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                catalog=url.catalog,
            )
            return [
                (stream_lister, None, None),
                (partition_lister, "stream", lambda x: x),
                (delta_lister, "partition_like", lambda x: x),
            ]
        if url.table:
            table_version_lister = functools.partial(
                metastore.list_table_versions,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
            stream_lister = functools.partial(
                metastore.list_streams,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
            partition_lister = functools.partial(
                metastore.list_stream_partitions,
                catalog=url.catalog,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                catalog=url.catalog,
            )
            return [
                (table_version_lister, None, None),
                (stream_lister, "table_version", lambda x: x.table_version),
                (partition_lister, "stream", lambda x: x),
                (delta_lister, "partition_like", lambda x: x),
            ]
        if url.unresolved_namespace:
            table_lister = functools.partial(
                metastore.list_tables,
                namespace=url.namespace,
                catalog=url.catalog,
            )
            table_version_lister = functools.partial(
                metastore.list_table_versions,
                namespace=url.namespace,
                catalog=url.catalog,
            )
            stream_lister = functools.partial(
                metastore.list_streams,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
            partition_lister = functools.partial(
                metastore.list_stream_partitions,
                catalog=url.catalog,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                catalog=url.catalog,
            )
            return [
                (table_lister, None, None),
                (table_version_lister, "table_name", lambda x: x.table_name),
                (stream_lister, "table_version", lambda x: x.table_version),
                (partition_lister, "stream", lambda x: x),
                (delta_lister, "partition_like", lambda x: x),
            ]
        if url.catalog_name:
            namespace_lister = functools.partial(
                metastore.list_namespaces,
                catalog=url.catalog,
            )
            table_lister = functools.partial(
                metastore.list_tables,
                catalog=url.catalog,
            )
            table_version_lister = functools.partial(
                metastore.list_table_versions,
                namespace=url.namespace,
                catalog=url.catalog,
            )
            stream_lister = functools.partial(
                metastore.list_streams,
                namespace=url.namespace,
                table_name=url.table,
                catalog=url.catalog,
            )
            partition_lister = functools.partial(
                metastore.list_stream_partitions,
                catalog=url.catalog,
            )
            delta_lister = functools.partial(
                metastore.list_partition_deltas,
                catalog=url.catalog,
            )
            return [
                (namespace_lister, None, None),
                (table_lister, "namespace", lambda x: x.namespace),
                (table_version_lister, "table_name", lambda x: x.table_name),
                (stream_lister, "table_version", lambda x: x.table_version),
                (partition_lister, "stream", lambda x: x),
                (delta_lister, "partition_like", lambda x: x),
            ]
        raise ValueError("No DeltaCAT objects to list.")

    @staticmethod
    def dataset_and_datastore_type_to_reader(
        dataset_type: DatasetType,
        datastore_type: DatastoreType,
    ):
        reader_resolver = DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER.get(
            dataset_type
        )
        if reader_resolver is None:
            raise ValueError(
                f"Unsupported dataset type: {dataset_type}. "
                f"Supported dataset types: {[dt.name for dt in DatasetType]}"
            )
        reader = reader_resolver.get(datastore_type)
        if reader is None:
            raise ValueError(
                f"Dataset type `{dataset_type} has no reader for "
                f"datastore type: `{datastore_type}`."
                f"Supported datastore types: {[k.name for k in reader_resolver.keys()]}"
            )
        return reader


def _stage_and_commit_stream(
    stream: Stream,
    *args,
    **kwargs,
) -> Stream:
    """
    Helper method to stage and commit a stream (e.g., as part of a copy
    operation from another catalog). The committed stream will be assigned a
    different unique ID than the input stream.
    """
    stream = metastore.stage_stream(
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        stream_format=StreamFormat(stream.stream_format),
        *args,
        **kwargs,
    )
    return metastore.commit_stream(
        stream=stream,
        *args,
        **kwargs,
    )


def _stage_and_commit_partition(
    partition: Partition,
    *args,
    **kwargs,
) -> Partition:
    """
    Helper method to stage and commit a partition (e.g., as part of a copy
    operation from another catalog). The committed partition will be assigned a
    different unique ID than the input partition.
    """
    stream = metastore.get_stream(
        namespace=partition.namespace,
        table_name=partition.table_name,
        table_version=partition.table_version,
        stream_format=StreamFormat(partition.stream_format),
    )
    partition = metastore.stage_partition(
        stream=stream,
        partition_values=partition.partition_values,
        partition_scheme_id=partition.partition_scheme_id,
        *args,
        **kwargs,
    )
    return metastore.commit_partition(
        partition=partition,
        *args,
        **kwargs,
    )


class DeltaCatUrlWriter:
    def __init__(
        self,
        url: DeltaCatUrl,
        dataset_type: DatasetType = DatasetType.RAY_DATASET,
        metafile: Optional[Metafile] = None,
    ):
        self._url = url
        self._metafile = metafile
        if url.is_deltacat_catalog_url():
            if url.path_elements:
                url.resolve_catalog()
            self._writer = DeltaCatUrlWriter.resolve_dc_writer(url, metafile or {})
        else:
            self._writer = DeltaCatUrlWriter.dataset_and_datastore_type_to_writer(
                dataset_type,
                url.datastore_type,
            )

    @property
    def url(self) -> DeltaCatUrl:
        return self._url

    @property
    def metafile(self) -> Metafile:
        return self._metafile

    def write(self, suffix: str = "", *args, **kwargs) -> Union[Metafile, str]:
        if self._url.is_deltacat_catalog_url():
            return self._writer(*args, **kwargs)
        else:
            dest_url = DeltaCatUrl(f"{self._url.url}{suffix}")
            self._writer(dest_url)(*args, **kwargs)
            return dest_url.url_path

    @staticmethod
    def resolve_dc_writer(
        url: DeltaCatUrl,
        metafile: Metafile,
    ) -> Callable:
        if url.delta:
            delta: Delta = Delta(
                Metafile.based_on(
                    other=metafile,
                    new_id=url.delta,
                )
            )
            delta.locator = DeltaLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
                partition_values=json.loads(url.partition),
                partition_id=None,
                stream_position=int(url.delta),
            )
            # TODO(pdames): Honor deep vs. shallow copies. Deep copies require
            #  first ensuring that all files in the source delta manifest are
            #  staged to the target catalog before commit. For deltas whose
            #  manifests reference local files, shallow delta copies will be
            #  invalid in the target catalog, and should be blocked or
            #  converted to a deep copy automatically.
            return functools.partial(
                metastore.commit_delta,
                delta=delta,
                catalog=url.catalog,
            )
        if url.partition:
            partition: Partition = Partition(metafile)
            partition.locator = PartitionLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
                partition_values=json.loads(url.partition),
            )
            return functools.partial(
                _stage_and_commit_partition,
                partition=partition,
                catalog=url.catalog,
            )
        if url.unresolved_stream:
            stream: Stream = Stream(metafile)
            stream.locator = StreamLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
                stream_id=None,
                stream_format=url.stream,
            )
            return functools.partial(
                _stage_and_commit_stream,
                stream=stream,
                catalog=url.catalog,
            )
        if url.unresolved_table_version:
            table_version: TableVersion = TableVersion(metafile)
            table_version.locator = TableVersionLocator.at(
                namespace=url.namespace,
                table_name=url.table,
                table_version=url.table_version,
            )
            return functools.partial(
                metastore.create_table_version,
                namespace=table_version.namespace,
                table_name=table_version.table_name,
                table_version=table_version.table_version,
                schema=table_version.schema,
                partition_scheme=table_version.partition_scheme,
                sort_keys=table_version.sort_scheme,
                table_version_description=table_version.description,
                table_version_properties=table_version.properties,
                table_description=table_version.description,
                table_properties=table_version.properties,
                supported_content_types=table_version.content_types,
                catalog=url.catalog,
            )
        if url.table:
            table: Table = Table(metafile)
            table.locator = TableLocator.at(
                namespace=url.namespace,
                table_name=url.table,
            )
            return functools.partial(
                metastore.create_table_version,
                namespace=table.namespace,
                table_name=table.table_name,
                table_description=table.description,
                table_properties=table.properties,
                catalog=url.catalog,
            )
        if url.unresolved_namespace:
            namespace: Namespace = Namespace(metafile)
            namespace.locator = NamespaceLocator.of(
                namespace=url.namespace,
            )
            return functools.partial(
                metastore.create_namespace,
                namespace=url.namespace,
                properties=namespace.properties,
                catalog=url.catalog,
            )
        if url.catalog_name:
            return functools.partial(
                dc.put_catalog,
                name=url.catalog_name,
            )
        raise ValueError("No DeltaCAT object to write.")

    @staticmethod
    def dataset_and_datastore_type_to_writer(
        dataset_type: DatasetType,
        datastore_type: DatastoreType,
    ):
        writer_resolver = DATASET_TYPE_TO_DATASTORE_TYPE_WRITER_RESOLVER.get(
            dataset_type
        )
        if writer_resolver is None:
            raise ValueError(
                f"Unsupported dataset type: {dataset_type}. "
                f"Supported dataset types: {[dt.name for dt in DatasetType]}"
            )
        writer = writer_resolver.get(datastore_type)
        if writer is None:
            raise ValueError(
                f"Dataset type `{dataset_type} has no writer for "
                f"datastore type: `{datastore_type}`."
                f"Supported datastore types: {[k.name for k in writer_resolver.keys()]}"
            )
        return writer