deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/utils/url.py ADDED
@@ -0,0 +1,1284 @@
+ import functools
+ import json
+ from typing import Callable, List, Tuple, Any, Union, Optional
+ from urllib.parse import urlparse, urlunparse, parse_qs
+
+ import ray
+ import daft
+
+ import pandas as pd
+ import numpy as np
+ import pyarrow as pa
+ import polars as pl
+ import deltacat as dc
+
+ import pyarrow.csv as pacsv
+ import pyarrow.json as pajson
+
+ from deltacat.catalog import CatalogProperties
+ from deltacat.constants import DEFAULT_NAMESPACE
+ from deltacat.types.media import (
+     DatasetType,
+     DatastoreType,
+ )
+ from deltacat.utils import pyarrow as pa_utils
+
+ from deltacat.storage import (
+     metastore,
+     Dataset,
+     Delta,
+     DeltaLocator,
+     ListResult,
+     Metafile,
+     Namespace,
+     NamespaceLocator,
+     Partition,
+     Stream,
+     StreamFormat,
+     StreamLocator,
+     PartitionLocator,
+     Table,
+     TableLocator,
+     TableVersion,
+     TableVersionLocator,
+ )
+
+ RAY_DATASTORE_TYPE_TO_READER = {
+     DatastoreType.AUDIO: lambda url: functools.partial(
+         ray.data.read_audio,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.AVRO: lambda url: functools.partial(
+         ray.data.read_avro,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.BIGQUERY: lambda url: functools.partial(
+         ray.data.read_bigquery,
+         project_id=url.parsed.netloc,
+         dataset=url.path_elements[0] if url.path_elements else None,
+         **url.query_params,
+     ),
+     DatastoreType.BINARY: lambda url: functools.partial(
+         ray.data.read_binary_files,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.CSV: lambda url: functools.partial(
+         ray.data.read_csv,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.CLICKHOUSE: lambda url: functools.partial(
+         ray.data.read_clickhouse,
+         table=url.parsed.query,
+         dsn=url.url,
+         **url.query_params,
+     ),
+     DatastoreType.DATABRICKS_TABLES: lambda url: functools.partial(
+         ray.data.read_databricks_tables,
+         warehouse_id=url.parsed.netloc,
+         **url.query_params,
+     ),
+     DatastoreType.DELTA_SHARING: lambda url: functools.partial(
+         ray.data.read_delta_sharing_tables,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.HUDI: lambda url: functools.partial(
+         ray.data.read_hudi,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ICEBERG: lambda url: functools.partial(
+         ray.data.read_iceberg,
+         table_identifier=url.parsed.netloc,
+         **url.query_params,
+     ),
+     DatastoreType.IMAGES: lambda url: functools.partial(
+         ray.data.read_images,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         ray.data.read_json,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.LANCE: lambda url: functools.partial(
+         ray.data.read_lance,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.MONGO: lambda url: functools.partial(
+         ray.data.read_mongo,
+         url.url,
+         **url.query_params,
+     ),
+     DatastoreType.NUMPY: lambda url: functools.partial(
+         ray.data.read_numpy,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         ray.data.read_parquet,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TEXT: lambda url: functools.partial(
+         ray.data.read_text,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TFRECORDS: lambda url: functools.partial(
+         ray.data.read_tfrecords,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.VIDEOS: lambda url: functools.partial(
+         ray.data.read_videos,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.WEBDATASET: lambda url: functools.partial(
+         ray.data.read_webdataset,
+         url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ RAY_DATASTORE_TYPE_TO_WRITER = {
+     DatastoreType.BIGQUERY: lambda url: functools.partial(
+         ray.data.Dataset.write_bigquery,
+         project_id=url.parsed.netloc,
+         dataset=url.path_elements[0] if url.path_elements else None,
+         **url.query_params,
+     ),
+     DatastoreType.CSV: lambda url: functools.partial(
+         ray.data.Dataset.write_csv,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ICEBERG: lambda url: functools.partial(
+         ray.data.Dataset.write_iceberg,
+         table_identifier=url.parsed.netloc,
+         **url.query_params,
+     ),
+     DatastoreType.IMAGES: lambda url: functools.partial(
+         ray.data.Dataset.write_images,
+         path=url.url_path,
+         column=url.query_params.pop("column", "image") if url.query_params else "image",
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         ray.data.Dataset.write_json,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.LANCE: lambda url: functools.partial(
+         ray.data.Dataset.write_lance,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.MONGO: lambda url: functools.partial(
+         ray.data.Dataset.write_mongo,
+         url.url,
+         **url.query_params,
+     ),
+     DatastoreType.NUMPY: lambda url: functools.partial(
+         ray.data.Dataset.write_numpy,
+         path=url.url_path,
+         column=url.query_params.pop("column", "data") if url.query_params else "data",
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         ray.data.Dataset.write_parquet,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TFRECORDS: lambda url: functools.partial(
+         ray.data.Dataset.write_tfrecords,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.WEBDATASET: lambda url: functools.partial(
+         ray.data.Dataset.write_webdataset,
+         url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ DAFT_DATASTORE_TYPE_TO_READER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         daft.io.read_csv,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.DELTA_LAKE: lambda url: functools.partial(
+         daft.io.read_deltalake,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.HUDI: lambda url: functools.partial(
+         daft.io.read_hudi,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ICEBERG: lambda url: functools.partial(
+         daft.io.read_iceberg,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         daft.io.read_json,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         daft.io.read_parquet,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.WARC: lambda url: functools.partial(
+         daft.io.read_warc,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TEXT: lambda url: functools.partial(
+         daft.io.read_csv,
+         url.url_path,
+         infer_schema=False,
+         schema={"text": daft.DataType.string()},
+         has_headers=False,
+         delimiter=chr(25),  # end of medium char
+         double_quote=False,
+         comment=None,
+     ),
+ }
+
+ DAFT_DATASTORE_TYPE_TO_WRITER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         daft.DataFrame.write_csv,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.DELTA_LAKE: lambda url: functools.partial(
+         daft.DataFrame.write_deltalake,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ICEBERG: lambda url: functools.partial(
+         daft.DataFrame.write_iceberg,
+         **url.query_params,
+     ),
+     DatastoreType.LANCE: lambda url: functools.partial(
+         daft.DataFrame.write_lance,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         daft.DataFrame.write_parquet,
+         url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ PYARROW_DATASTORE_TYPE_TO_READER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         pa_utils.read_csv,
+         url.url_path,
+         read_options=pacsv.ReadOptions(use_threads=False),
+         **url.query_params,
+     ),
+     DatastoreType.FEATHER: lambda url: functools.partial(
+         pa_utils.read_feather,
+         url.url_path,
+         use_threads=False,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         pa_utils.read_json,
+         url.url_path,
+         pajson.ReadOptions(use_threads=False),
+         **url.query_params,
+     ),
+     DatastoreType.ORC: lambda url: functools.partial(
+         pa_utils.read_orc,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         pa_utils.read_parquet,
+         url.url_path,
+         use_threads=False,
+         **url.query_params,
+     ),
+     DatastoreType.TEXT: lambda url: functools.partial(
+         pa_utils.read_csv,
+         url.url_path,
+         read_options=pacsv.ReadOptions(
+             use_threads=False,
+             column_names=["text"],
+         ),
+         parse_options=pacsv.ParseOptions(
+             delimiter=chr(25),  # end of medium char
+             quote_char=False,
+             double_quote=False,
+         ),
+         convert_options=pacsv.ConvertOptions(
+             check_utf8=False,
+             column_types={"text": pa.string()},
+         ),
+     ),
+ }
+
+ PYARROW_DATASTORE_TYPE_TO_WRITER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         pa_utils.write_csv,
+         path=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.FEATHER: lambda url: functools.partial(
+         pa_utils.write_feather,
+         path=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ORC: lambda url: functools.partial(
+         pa_utils.write_orc,
+         path=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         pa_utils.write_parquet,
+         path=url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ POLARS_DATASTORE_TYPE_TO_READER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         pl.read_csv,
+         url.url_path,
+         n_threads=1,
+         **url.query_params,
+     ),
+     DatastoreType.DELTA_LAKE: lambda url: functools.partial(
+         pl.read_delta,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ICEBERG: lambda url: functools.partial(
+         pl.scan_iceberg,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         pl.read_json,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         pl.read_parquet,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TEXT: lambda url: functools.partial(
+         pl.read_csv,
+         url.url_path,
+         new_columns=["text"],
+         n_threads=1,
+         separator=chr(25),  # end of medium char
+         has_header=False,
+         quote_char=None,
+         infer_schema=False,
+     ),
+ }
+
+ POLARS_DATASTORE_TYPE_TO_WRITER = {
+     DatastoreType.AVRO: lambda url: functools.partial(
+         pl.DataFrame.write_avro,
+         file=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.CSV: lambda url: functools.partial(
+         pl.DataFrame.write_csv,
+         file=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.DELTA_LAKE: lambda url: functools.partial(
+         pl.DataFrame.write_delta,
+         target=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ICEBERG: lambda url: functools.partial(
+         pl.DataFrame.write_iceberg,
+         target=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         pl.DataFrame.write_ndjson,
+         file=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         pl.DataFrame.write_parquet,
+         file=url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ PANDAS_DATASTORE_TYPE_TO_READER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         pd.read_csv,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.FEATHER: lambda url: functools.partial(
+         pd.read_feather,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.HDF: lambda url: functools.partial(
+         pd.read_hdf,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.HTML: lambda url: functools.partial(
+         pd.read_html,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         pd.read_json,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ORC: lambda url: functools.partial(
+         pd.read_orc,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         pd.read_parquet,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.XML: lambda url: functools.partial(
+         pd.read_xml,
+         url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ PANDAS_DATASTORE_TYPE_TO_WRITER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         pd.DataFrame.to_csv,
+         path_or_buf=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.FEATHER: lambda url: functools.partial(
+         pd.DataFrame.to_feather,
+         path=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.HDF: lambda url: functools.partial(
+         pd.DataFrame.to_hdf,
+         path_or_buf=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.HTML: lambda url: functools.partial(
+         pd.DataFrame.to_html,
+         buf=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.JSON: lambda url: functools.partial(
+         pd.DataFrame.to_json,
+         path_or_buf=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.ORC: lambda url: functools.partial(
+         pd.DataFrame.to_orc,
+         path=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.PARQUET: lambda url: functools.partial(
+         pd.DataFrame.to_parquet,
+         path=url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.XML: lambda url: functools.partial(
+         pd.DataFrame.to_xml,
+         path_or_buffer=url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ NUMPY_DATASTORE_TYPE_TO_READER = {
+     DatastoreType.BINARY: lambda url: functools.partial(
+         np.fromfile,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.CSV: lambda url: functools.partial(
+         np.genfromtxt,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.NUMPY: lambda url: functools.partial(
+         np.load,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TEXT: lambda url: functools.partial(
+         np.loadtxt,
+         url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ NUMPY_DATASTORE_TYPE_TO_WRITER = {
+     DatastoreType.CSV: lambda url: functools.partial(
+         np.savetxt,
+         url.url_path,
+         delimiter=",",
+         **url.query_params,
+     ),
+     DatastoreType.NUMPY: lambda url: functools.partial(
+         np.savez_compressed,
+         url.url_path,
+         **url.query_params,
+     ),
+     DatastoreType.TEXT: lambda url: functools.partial(
+         np.savetxt,
+         url.url_path,
+         **url.query_params,
+     ),
+ }
+
+ DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER = {
+     DatasetType.RAY_DATASET: RAY_DATASTORE_TYPE_TO_READER,
+     DatasetType.DAFT: DAFT_DATASTORE_TYPE_TO_READER,
+     DatasetType.PANDAS: PANDAS_DATASTORE_TYPE_TO_READER,
+     DatasetType.POLARS: POLARS_DATASTORE_TYPE_TO_READER,
+     DatasetType.PYARROW: PYARROW_DATASTORE_TYPE_TO_READER,
+     DatasetType.NUMPY: NUMPY_DATASTORE_TYPE_TO_READER,
+ }
+
+ DATASET_TYPE_TO_DATASTORE_TYPE_WRITER_RESOLVER = {
+     DatasetType.RAY_DATASET: RAY_DATASTORE_TYPE_TO_WRITER,
+     DatasetType.DAFT: DAFT_DATASTORE_TYPE_TO_WRITER,
+     DatasetType.PANDAS: PANDAS_DATASTORE_TYPE_TO_WRITER,
+     DatasetType.POLARS: POLARS_DATASTORE_TYPE_TO_WRITER,
+     DatasetType.PYARROW: PYARROW_DATASTORE_TYPE_TO_WRITER,
+     DatasetType.NUMPY: NUMPY_DATASTORE_TYPE_TO_WRITER,
+ }
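
The tables above bind each (dataset type, datastore type) pair to a factory that turns a parsed URL into a ready-to-call reader or writer. A minimal resolution sketch (editorial example, not part of the released file; the Parquet path is hypothetical, and it uses the DeltaCatUrl class defined further down):

url = DeltaCatUrl("parquet+file:///tmp/example.parquet")  # hypothetical local file
reader_factory = DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER[DatasetType.PYARROW][
    url.datastore_type
]
read_fn = reader_factory(url)  # functools.partial wrapping pa_utils.read_parquet
table = read_fn()  # pyarrow.Table loaded from the resolved url_path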
+
+
+ class DeltaCatUrl:
+     """
+     Class for parsing DeltaCAT URLs, which are used to unambiguously locate
+     any internal object(s) already registered in a DeltaCAT catalog, or
+     external object(s) that could be registered in a DeltaCAT catalog.
+
+     Valid DeltaCAT URLs that reference internal catalog objects registered
+     in a DeltaCAT catalog include:
+
+     dc://<catalog>/[namespace]/[table]/[tableversion]/[stream]/[partition]/[delta]
+     namespace://<namespace>/[table]/[tableversion]/[stream]/[partition]/[delta]
+     table://<table>/[tableversion]/[stream]/[partition]/[delta]
+
+     Where <arg> is a required part of the URL and [arg] is an optional part
+     of the URL.
+
+     Valid DeltaCAT URLs that reference external objects include most types
+     readable into any supported DeltaCAT dataset type (e.g., Ray Data, Daft,
+     PyArrow, Pandas, Numpy). External object URLs take the form
+     <DatastoreType>+<URL> or, more explicitly,
+     <DatastoreType>+<scheme>://<path>, where `DatastoreType` is any value
+     from :class:`deltacat.types.media.DatastoreType`.
+
+     To reference a file on local disk, use "file" or "local" as the scheme.
+     To read an absolute local file path, use "file:///" or "local:///". To
+     read a local file path relative to the current working directory, use
+     "local://".
+
+     audio+<scheme>://<path>?param1=val1&param2=val2&...
+     avro+<scheme>://<path>?param1=val1&param2=val2&...
+     binary+<scheme>://<path>?param1=val1&param2=val2&...
+     csv+<scheme>://<path>?param1=val1&param2=val2&...
+     deltasharing+<scheme>://<path>?param1=val1&param2=val2&...
+     hudi+<scheme>://<path>?param1=val1&param2=val2&...
+     images+<scheme>://<path>?param1=val1&param2=val2&...
+     json+<scheme>://<path>?param1=val1&param2=val2&...
+     lance+<scheme>://<path>?param1=val1&param2=val2&...
+     numpy+<scheme>://<path>?param1=val1&param2=val2&...
+     parquet+<scheme>://<path>?param1=val1&param2=val2&...
+     text+<scheme>://<path>?param1=val1&param2=val2&...
+     tfrecords+<scheme>://<path>?param1=val1&param2=val2&...
+     videos+<scheme>://<path>?param1=val1&param2=val2&...
+     webdataset+<scheme>://<path>?param1=val1&param2=val2&...
+
+     Some DeltaCAT URLs reference special types of external objects locatable
+     via custom URLs that don't conform to the usual <DatastoreType>+<URL>
+     convention shown above, like:
+
+     <mongodb_uri>?database=<db_name>&collection=<collection_name>&...
+     bigquery://<project_id>/<dataset>?param1=val1&...
+     <clickhouse_dsn>?table=<table_name>&param1=val1&...
+     databricks://<warehouse_id>?param1=val1&...
+     iceberg://<table_identifier>?param1=val1&...
+
+     Note that, for reads, each of the above URLs typically resolves directly
+     to the equivalent :class:`deltacat.types.media.DatasetType` reader. For
+     example, if Ray Data is the dataset type, then the equivalent
+     ray.data.read_{} API is used, so a read referencing a URL of the form
+     "audio+file:///my/audio.mp4" resolves to a call to
+     ray.data.read_audio("/my/audio.mp4").
+     """
+
+     # Auto-resolved DeltaCAT catalog path default identifiers
+     DELTACAT_URL_DEFAULT_CATALOG = "default"
+     DELTACAT_URL_DEFAULT_NAMESPACE = "default"
+     DELTACAT_URL_DEFAULT_TABLE_VERSION = "default"
+     DELTACAT_URL_DEFAULT_STREAM = "default"
+
+     def __init__(
+         self,
+         url: str,
+     ):
+         # TODO(pdames): Handle wildcard `*` and `**` at end of url.
+         self.catalog_name = None
+         self.parsed = urlparse(url, allow_fragments=False)  # support '#' in path
+         self.url = self.parsed.geturl()
+         path = self.parsed.path
+         # Remove leading/trailing slashes and split the path into elements
+         self.path_elements = [
+             element for element in path.strip("/").split("/") if path and element
+         ]
+         # Split the scheme into the root DeltaCAT scheme and the path scheme
+         self.scheme_elements = self.parsed.scheme.split("+")
+         self.datastore_type = DatastoreType(self.scheme_elements[0])
+         if len(self.scheme_elements) == 2:
+             # Remove the source/sink type from the scheme.
+             self.parsed = self.parsed._replace(scheme=self.scheme_elements[1])
+             # Save the URL path to read/write w/o the source/sink type.
+             self.url_path = urlunparse(self.parsed)
+         elif len(self.scheme_elements) > 2:
+             raise ValueError(f"Invalid DeltaCAT URL: {url}")
+         self.query_params = parse_qs(self.parsed.query) if self.parsed.query else {}
+         if self.datastore_type == DatastoreType.DELTACAT:
+             self.catalog_name = self.parsed.netloc
+             self.unresolved_namespace = (
+                 self.path_elements[0] if self.path_elements else None
+             )
+             self.table = self.path_elements[1] if len(self.path_elements) > 1 else None
+             self.unresolved_table_version = (
+                 self.path_elements[2] if len(self.path_elements) > 2 else None
+             )
+             self.unresolved_stream = (
+                 self.path_elements[3] if len(self.path_elements) > 3 else None
+             )
+             self.partition = (
+                 self.path_elements[4] if len(self.path_elements) > 4 else None
+             )
+             self.delta = self.path_elements[5] if len(self.path_elements) > 5 else None
+             self._resolve_deltacat_path_identifiers()
+         elif self.datastore_type == DatastoreType.DELTACAT_NAMESPACE:
+             self.catalog_name = DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG
+             self.unresolved_namespace = self.parsed.netloc
+             self.table = self.path_elements[0] if self.path_elements else None
+             self.unresolved_table_version = (
+                 self.path_elements[1] if len(self.path_elements) > 1 else None
+             )
+             self.unresolved_stream = (
+                 self.path_elements[2] if len(self.path_elements) > 2 else None
+             )
+             self.partition = (
+                 self.path_elements[3] if len(self.path_elements) > 3 else None
+             )
+             self.delta = self.path_elements[4] if len(self.path_elements) > 4 else None
+             self._resolve_deltacat_path_identifiers()
+         elif self.datastore_type == DatastoreType.DELTACAT_TABLE:
+             self.catalog_name = DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG
+             self.unresolved_namespace = DeltaCatUrl.DELTACAT_URL_DEFAULT_NAMESPACE
+             self.table = self.parsed.netloc
+             self.unresolved_table_version = (
+                 self.path_elements[0] if self.path_elements else None
+             )
+             self.unresolved_stream = (
+                 self.path_elements[1] if len(self.path_elements) > 1 else None
+             )
+             self.partition = (
+                 self.path_elements[2] if len(self.path_elements) > 2 else None
+             )
+             self.delta = self.path_elements[3] if len(self.path_elements) > 3 else None
+             self._resolve_deltacat_path_identifiers()
+
+     def is_deltacat_catalog_url(self):
+         return bool(self.catalog_name)
+
+     def resolve_catalog(self):
+         if self.catalog_name:
+             if self.catalog_name.lower() == DeltaCatUrl.DELTACAT_URL_DEFAULT_CATALOG:
+                 self.catalog = None
+             self.catalog: CatalogProperties = dc.get_catalog(self.catalog_name).inner
+             if not isinstance(self.catalog, CatalogProperties):
+                 raise ValueError(
+                     f"Expected catalog `{self.catalog_name}` to be a DeltaCAT "
+                     f"catalog but found: {self.catalog}"
+                 )
+
+     def _resolve_deltacat_path_identifiers(self):
+         dc.raise_if_not_initialized()
+         self.namespace = self.table_version = self.stream = None
+         if self.unresolved_namespace:
+             if (
+                 self.unresolved_namespace.lower()
+                 == DeltaCatUrl.DELTACAT_URL_DEFAULT_NAMESPACE
+             ):
+                 self.namespace = DEFAULT_NAMESPACE
+             else:
+                 self.namespace = self.unresolved_namespace
+         if (
+             self.unresolved_table_version
+             and self.unresolved_table_version.lower()
+             != DeltaCatUrl.DELTACAT_URL_DEFAULT_TABLE_VERSION
+         ):
+             self.table_version = self.unresolved_table_version
+         if self.unresolved_stream:
+             if (
+                 self.unresolved_stream.lower()
+                 == DeltaCatUrl.DELTACAT_URL_DEFAULT_STREAM
+             ):
+                 self.stream = StreamFormat.DELTACAT
+             else:
+                 self.stream = StreamFormat(self.unresolved_stream)
+
+     def __str__(self):
+         return self.url
+
+     def __repr__(self):
+         return self.url
+
+
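A quick parsing sketch for the URL grammar documented above (editorial example, not part of the released file; the catalog, namespace, table, and bucket names are hypothetical, and DeltaCAT URLs assume the catalog has already been initialized, e.g. via dc.init()):

catalog_url = DeltaCatUrl("dc://my_catalog/my_namespace/my_table")
assert catalog_url.is_deltacat_catalog_url()
assert catalog_url.catalog_name == "my_catalog"
assert catalog_url.table == "my_table"

external_url = DeltaCatUrl("csv+s3://my-bucket/data.csv")  # hypothetical bucket
assert not external_url.is_deltacat_catalog_url()
assert external_url.datastore_type == DatastoreType.CSV
assert external_url.url_path == "s3://my-bucket/data.csv"  # "csv+" prefix stripped
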
+ class DeltaCatUrlReader:
+     def __init__(
+         self,
+         url: DeltaCatUrl,
+         dataset_type: DatasetType = DatasetType.RAY_DATASET,
+     ):
+         self._url = url
+         if url.is_deltacat_catalog_url():
+             url.resolve_catalog()
+             self._reader = DeltaCatUrlReader.resolve_dc_reader(url)
+             self._listers = DeltaCatUrlReader.resolve_dc_listers(url)
+         else:
+             self._reader = DeltaCatUrlReader.dataset_and_datastore_type_to_reader(
+                 dataset_type,
+                 url.datastore_type,
+             )
+
+     @property
+     def url(self) -> DeltaCatUrl:
+         return self._url
+
+     @property
+     def listers(
+         self,
+     ) -> List[
+         Tuple[
+             Callable[[Any], ListResult[Metafile]],
+             str,
+             Callable[[Metafile], Union[Metafile, str]],
+         ]
+     ]:
+         return self._listers
+
+     def read(self, *args, **kwargs) -> Dataset:
+         if self._url.is_deltacat_catalog_url():
+             return self._reader(*args, **kwargs)
+         else:
+             return self._reader(self._url)(*args, **kwargs)
+
+     @staticmethod
+     def resolve_dc_reader(url: DeltaCatUrl) -> Callable:
+         if url.delta:
+             return functools.partial(
+                 metastore.get_delta,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 partition_values=json.loads(url.partition),
+                 stream_position=url.delta,
+                 catalog=url.catalog,
+             )
+         if url.partition:
+             return functools.partial(
+                 metastore.get_partition,
+                 stream_locator=StreamLocator.at(
+                     namespace=url.namespace,
+                     table_name=url.table,
+                     table_version=url.table_version,
+                     stream_id=None,
+                     stream_format=url.stream,
+                 ),
+                 partition_values=json.loads(url.partition),
+                 catalog=url.catalog,
+             )
+         if url.unresolved_stream:
+             return functools.partial(
+                 metastore.get_stream,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 stream_format=url.stream,
+                 catalog=url.catalog,
+             )
+         if url.unresolved_table_version:
+             return functools.partial(
+                 metastore.get_table_version,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 catalog=url.catalog,
+             )
+         if url.table:
+             return functools.partial(
+                 metastore.get_table,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 catalog=url.catalog,
+             )
+         if url.unresolved_namespace:
+             return functools.partial(
+                 metastore.get_namespace,
+                 namespace=url.namespace,
+                 catalog=url.catalog,
+             )
+         if url.catalog_name:
+             return functools.partial(
+                 dc.get_catalog,
+                 name=url.catalog_name,
+             )
+         raise ValueError("No DeltaCAT object to read.")
+
+     @staticmethod
+     def resolve_dc_listers(
+         url: DeltaCatUrl,
+     ) -> List[
+         Tuple[
+             Callable[[Any], ListResult[Metafile]],
+             Optional[str],
+             Optional[Callable[[Metafile], Union[Metafile, str]]],
+         ]
+     ]:
+         if url.partition:
+             partition_locator = PartitionLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 stream_id=None,
+                 stream_format=url.stream,
+                 partition_values=json.loads(url.partition),
+                 partition_id=None,
+             )
+             delta_lister = functools.partial(
+                 metastore.list_partition_deltas,
+                 partition_like=partition_locator,
+                 catalog=url.catalog,
+             )
+             return [(delta_lister, None, None)]
+         if url.unresolved_stream:
+             stream_locator = StreamLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 stream_id=None,
+                 stream_format=url.stream,
+             )
+             stream = Stream.of(
+                 locator=stream_locator,
+                 partition_scheme=None,
+             )
+             partition_lister = functools.partial(
+                 metastore.list_stream_partitions,
+                 stream=stream,
+                 catalog=url.catalog,
+             )
+             delta_lister = functools.partial(
+                 metastore.list_partition_deltas,
+                 catalog=url.catalog,
+             )
+             return [
+                 (partition_lister, None, None),
+                 (delta_lister, "partition_like", lambda x: x),
+             ]
+         if url.unresolved_table_version:
+             stream_lister = functools.partial(
+                 metastore.list_streams,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 catalog=url.catalog,
+             )
+             partition_lister = functools.partial(
+                 metastore.list_stream_partitions,
+                 catalog=url.catalog,
+             )
+             delta_lister = functools.partial(
+                 metastore.list_partition_deltas,
+                 catalog=url.catalog,
+             )
+             return [
+                 (stream_lister, None, None),
+                 (partition_lister, "stream", lambda x: x),
+                 (delta_lister, "partition_like", lambda x: x),
+             ]
+         if url.table:
+             table_version_lister = functools.partial(
+                 metastore.list_table_versions,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 catalog=url.catalog,
+             )
+             stream_lister = functools.partial(
+                 metastore.list_streams,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 catalog=url.catalog,
+             )
+             partition_lister = functools.partial(
+                 metastore.list_stream_partitions,
+                 catalog=url.catalog,
+             )
+             delta_lister = functools.partial(
+                 metastore.list_partition_deltas,
+                 catalog=url.catalog,
+             )
+             return [
+                 (table_version_lister, None, None),
+                 (stream_lister, "table_version", lambda x: x.table_version),
+                 (partition_lister, "stream", lambda x: x),
+                 (delta_lister, "partition_like", lambda x: x),
+             ]
+         if url.unresolved_namespace:
+             table_lister = functools.partial(
+                 metastore.list_tables,
+                 namespace=url.namespace,
+                 catalog=url.catalog,
+             )
+             table_version_lister = functools.partial(
+                 metastore.list_table_versions,
+                 namespace=url.namespace,
+                 catalog=url.catalog,
+             )
+             stream_lister = functools.partial(
+                 metastore.list_streams,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 catalog=url.catalog,
+             )
+             partition_lister = functools.partial(
+                 metastore.list_stream_partitions,
+                 catalog=url.catalog,
+             )
+             delta_lister = functools.partial(
+                 metastore.list_partition_deltas,
+                 catalog=url.catalog,
+             )
+             return [
+                 (table_lister, None, None),
+                 (table_version_lister, "table_name", lambda x: x.table_name),
+                 (stream_lister, "table_version", lambda x: x.table_version),
+                 (partition_lister, "stream", lambda x: x),
+                 (delta_lister, "partition_like", lambda x: x),
+             ]
+         if url.catalog_name:
+             namespace_lister = functools.partial(
+                 metastore.list_namespaces,
+                 catalog=url.catalog,
+             )
+             table_lister = functools.partial(
+                 metastore.list_tables,
+                 catalog=url.catalog,
+             )
+             table_version_lister = functools.partial(
+                 metastore.list_table_versions,
+                 namespace=url.namespace,
+                 catalog=url.catalog,
+             )
+             stream_lister = functools.partial(
+                 metastore.list_streams,
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 catalog=url.catalog,
+             )
+             partition_lister = functools.partial(
+                 metastore.list_stream_partitions,
+                 catalog=url.catalog,
+             )
+             delta_lister = functools.partial(
+                 metastore.list_partition_deltas,
+                 catalog=url.catalog,
+             )
+             return [
+                 (namespace_lister, None, None),
+                 (table_lister, "namespace", lambda x: x.namespace),
+                 (table_version_lister, "table_name", lambda x: x.table_name),
+                 (stream_lister, "table_version", lambda x: x.table_version),
+                 (partition_lister, "stream", lambda x: x),
+                 (delta_lister, "partition_like", lambda x: x),
+             ]
+         raise ValueError("No DeltaCAT objects to list.")
+
+     @staticmethod
+     def dataset_and_datastore_type_to_reader(
+         dataset_type: DatasetType,
+         datastore_type: DatastoreType,
+     ):
+         reader_resolver = DATASET_TYPE_TO_DATASTORE_TYPE_READER_RESOLVER.get(
+             dataset_type
+         )
+         if reader_resolver is None:
+             raise ValueError(
+                 f"Unsupported dataset type: {dataset_type}. "
+                 f"Supported dataset types: {[dt.name for dt in DatasetType]}"
+             )
+         reader = reader_resolver.get(datastore_type)
+         if reader is None:
+             raise ValueError(
+                 f"Dataset type `{dataset_type}` has no reader for "
+                 f"datastore type `{datastore_type}`. "
+                 f"Supported datastore types: {[k.name for k in reader_resolver.keys()]}"
+             )
+         return reader
+
+
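A minimal end-to-end read sketch for an external URL (editorial example, not part of the released file; the path is hypothetical, and it assumes the underlying polars reader accepts a file:// URL):

reader = DeltaCatUrlReader(
    DeltaCatUrl("parquet+file:///tmp/example.parquet"),
    dataset_type=DatasetType.POLARS,
)
df = reader.read()  # resolves to pl.read_parquet("file:///tmp/example.parquet")
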
+ def _stage_and_commit_stream(
+     stream: Stream,
+     *args,
+     **kwargs,
+ ) -> Stream:
+     """
+     Helper method to stage and commit a stream (e.g., as part of a copy
+     operation from another catalog). The committed stream will be assigned a
+     different unique ID than the input stream.
+     """
+     stream = metastore.stage_stream(
+         namespace=stream.namespace,
+         table_name=stream.table_name,
+         table_version=stream.table_version,
+         stream_format=StreamFormat(stream.stream_format),
+         *args,
+         **kwargs,
+     )
+     return metastore.commit_stream(
+         stream=stream,
+         *args,
+         **kwargs,
+     )
+
+
+ def _stage_and_commit_partition(
+     partition: Partition,
+     *args,
+     **kwargs,
+ ) -> Partition:
+     """
+     Helper method to stage and commit a partition (e.g., as part of a copy
+     operation from another catalog). The committed partition will be assigned
+     a different unique ID than the input partition.
+     """
+     stream = metastore.get_stream(
+         namespace=partition.namespace,
+         table_name=partition.table_name,
+         table_version=partition.table_version,
+         stream_format=StreamFormat(partition.stream_format),
+     )
+     partition = metastore.stage_partition(
+         stream=stream,
+         partition_values=partition.partition_values,
+         partition_scheme_id=partition.partition_scheme_id,
+         *args,
+         **kwargs,
+     )
+     return metastore.commit_partition(
+         partition=partition,
+         *args,
+         **kwargs,
+     )
+
+
+ class DeltaCatUrlWriter:
+     def __init__(
+         self,
+         url: DeltaCatUrl,
+         dataset_type: DatasetType = DatasetType.RAY_DATASET,
+         metafile: Optional[Metafile] = None,
+     ):
+         self._url = url
+         self._metafile = metafile
+         if url.is_deltacat_catalog_url():
+             if url.path_elements:
+                 url.resolve_catalog()
+             self._writer = DeltaCatUrlWriter.resolve_dc_writer(url, metafile or {})
+         else:
+             self._writer = DeltaCatUrlWriter.dataset_and_datastore_type_to_writer(
+                 dataset_type,
+                 url.datastore_type,
+             )
+
+     @property
+     def url(self) -> DeltaCatUrl:
+         return self._url
+
+     @property
+     def metafile(self) -> Metafile:
+         return self._metafile
+
+     def write(self, suffix: str = "", *args, **kwargs) -> Union[Metafile, str]:
+         if self._url.is_deltacat_catalog_url():
+             return self._writer(*args, **kwargs)
+         else:
+             dest_url = DeltaCatUrl(f"{self._url.url}{suffix}")
+             self._writer(dest_url)(*args, **kwargs)
+             return dest_url.url_path
+
+     @staticmethod
+     def resolve_dc_writer(
+         url: DeltaCatUrl,
+         metafile: Metafile,
+     ) -> Callable:
+         if url.delta:
+             delta: Delta = Delta(
+                 Metafile.based_on(
+                     other=metafile,
+                     new_id=url.delta,
+                 )
+             )
+             delta.locator = DeltaLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 stream_id=None,
+                 stream_format=url.stream,
+                 partition_values=json.loads(url.partition),
+                 partition_id=None,
+                 stream_position=int(url.delta),
+             )
+             # TODO(pdames): Honor deep vs. shallow copies. Deep copies require
+             #  first ensuring that all files in the source delta manifest are
+             #  staged to the target catalog before commit. For deltas whose
+             #  manifests reference local files, shallow delta copies will be
+             #  invalid in the target catalog, and should be blocked or
+             #  converted to a deep copy automatically.
+             return functools.partial(
+                 metastore.commit_delta,
+                 delta=delta,
+                 catalog=url.catalog,
+             )
+         if url.partition:
+             partition: Partition = Partition(metafile)
+             partition.locator = PartitionLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 stream_id=None,
+                 stream_format=url.stream,
+                 partition_values=json.loads(url.partition),
+             )
+             return functools.partial(
+                 _stage_and_commit_partition,
+                 partition=partition,
+                 catalog=url.catalog,
+             )
+         if url.unresolved_stream:
+             stream: Stream = Stream(metafile)
+             stream.locator = StreamLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+                 stream_id=None,
+                 stream_format=url.stream,
+             )
+             return functools.partial(
+                 _stage_and_commit_stream,
+                 stream=stream,
+                 catalog=url.catalog,
+             )
+         if url.unresolved_table_version:
+             table_version: TableVersion = TableVersion(metafile)
+             table_version.locator = TableVersionLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+                 table_version=url.table_version,
+             )
+             return functools.partial(
+                 metastore.create_table_version,
+                 namespace=table_version.namespace,
+                 table_name=table_version.table_name,
+                 table_version=table_version.table_version,
+                 schema=table_version.schema,
+                 partition_scheme=table_version.partition_scheme,
+                 sort_keys=table_version.sort_scheme,
+                 table_version_description=table_version.description,
+                 table_version_properties=table_version.properties,
+                 table_description=table_version.description,
+                 table_properties=table_version.properties,
+                 supported_content_types=table_version.content_types,
+                 catalog=url.catalog,
+             )
+         if url.table:
+             table: Table = Table(metafile)
+             table.locator = TableLocator.at(
+                 namespace=url.namespace,
+                 table_name=url.table,
+             )
+             return functools.partial(
+                 metastore.create_table_version,
+                 namespace=table.namespace,
+                 table_name=table.table_name,
+                 table_description=table.description,
+                 table_properties=table.properties,
+                 catalog=url.catalog,
+             )
+         if url.unresolved_namespace:
+             namespace: Namespace = Namespace(metafile)
+             namespace.locator = NamespaceLocator.of(
+                 namespace=url.namespace,
+             )
+             return functools.partial(
+                 metastore.create_namespace,
+                 namespace=url.namespace,
+                 properties=namespace.properties,
+                 catalog=url.catalog,
+             )
+         if url.catalog_name:
+             return functools.partial(
+                 dc.put_catalog,
+                 name=url.catalog_name,
+             )
+         raise ValueError("No DeltaCAT object to write.")
+
+     @staticmethod
+     def dataset_and_datastore_type_to_writer(
+         dataset_type: DatasetType,
+         datastore_type: DatastoreType,
+     ):
+         writer_resolver = DATASET_TYPE_TO_DATASTORE_TYPE_WRITER_RESOLVER.get(
+             dataset_type
+         )
+         if writer_resolver is None:
+             raise ValueError(
+                 f"Unsupported dataset type: {dataset_type}. "
+                 f"Supported dataset types: {[dt.name for dt in DatasetType]}"
+             )
+         writer = writer_resolver.get(datastore_type)
+         if writer is None:
+             raise ValueError(
+                 f"Dataset type `{dataset_type}` has no writer for "
+                 f"datastore type `{datastore_type}`. "
+                 f"Supported datastore types: {[k.name for k in writer_resolver.keys()]}"
+             )
+         return writer
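
And the matching write sketch (editorial example, not part of the released file; the path is hypothetical and df is an existing pl.DataFrame). For external URLs, write() appends the given suffix to the destination, invokes the bound writer with the dataset as its first positional argument, and returns the final path:

writer = DeltaCatUrlWriter(
    DeltaCatUrl("parquet+file:///tmp/out"),
    dataset_type=DatasetType.POLARS,
)
out_path = writer.write(".parquet", df)  # pl.DataFrame.write_parquet(df, file="file:///tmp/out.parquet")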