deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,580 @@
+ import copy
+ import functools
+ import logging
+
+ from collections import defaultdict
+ from enum import Enum
+ from typing import Union, List, Callable, Optional, Dict, Any, Tuple, Iterable
+
+ import numpy as np
+
+ import pyarrow as pa
+ import pyarrow.fs
+ from pyarrow.fs import S3FileSystem
+
+ from ray.data import (
+     Datasource,
+     ReadTask,
+ )
+ from ray.data.block import BlockMetadata, Block, BlockAccessor
+ from ray.data.datasource import (
+     FastFileMetadataProvider,
+     ParquetMetadataProvider,
+ )
+
+ from deltacat.constants import METAFILE_FORMAT_MSGPACK
+ from deltacat.aws.s3u import (
+     S3Url,
+     parse_s3_url,
+ )
+ from deltacat.types.media import (
+     ContentType,
+ )
+ from deltacat.storage import (
+     Manifest,
+     ManifestEntryList,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlReader
+ from deltacat.storage import (
+     Metafile,
+     ListResult,
+ )
+ from deltacat import logs
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+ METAFILE_DATA_COLUMN_NAME = "deltacat_metafile_data"
+ METAFILE_TYPE_COLUMN_NAME = "deltacat_metafile_type"
+
+
+ class DeltacatReadType(str, Enum):
+     METADATA = "metadata"  # get only a single metafile
+     METADATA_LIST = "metadata_list"  # list top-level metafiles
+     METADATA_LIST_RECURSIVE = "metadata_recursive"  # list all metafiles
+     DATA = "data"  # read all data files
+
+
+ class CachedFileMetadataProvider(
+     FastFileMetadataProvider,
+     ParquetMetadataProvider,
+ ):
+     def __init__(self, meta_cache: Dict[str, BlockMetadata]):
+         self._meta_cache = meta_cache
+
+     def get_meta_cache(self) -> Dict[str, BlockMetadata]:
+         return self._meta_cache
+
+     def _get_block_metadata(
+         self,
+         paths: List[str],
+         schema: Optional[Union[type, pa.Schema]],
+         **kwargs,
+     ) -> BlockMetadata:
+         agg_block_metadata = BlockMetadata(
+             num_rows=0,
+             size_bytes=0,
+             schema=schema,
+             input_files=[],
+             exec_stats=None,
+         )
+         for path in paths:
+             block_metadata = self._meta_cache.get(path)
+             if block_metadata is None:
+                 raise ValueError(f"Block metadata not found for path: {path}")
+             if block_metadata.num_rows is None:
+                 agg_block_metadata.num_rows = None
+             elif agg_block_metadata.num_rows is not None:
+                 agg_block_metadata.num_rows += block_metadata.num_rows
+             if block_metadata.size_bytes is None:
+                 agg_block_metadata.size_bytes = None
+             elif agg_block_metadata.size_bytes is not None:
+                 agg_block_metadata.size_bytes += block_metadata.size_bytes
+             agg_block_metadata.input_files.append(path)
+         return agg_block_metadata
+
+
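Editor's note: a minimal sketch (not part of the package) of the aggregation behavior above, using hypothetical file paths and counts, and calling the private _get_block_metadata hook directly for illustration:

    from ray.data.block import BlockMetadata

    from deltacat.io.datasource.deltacat_datasource import CachedFileMetadataProvider

    # Hypothetical cached per-file metadata keyed by normalized path.
    meta_cache = {
        "my-bucket/part-0.parquet": BlockMetadata(
            num_rows=100, size_bytes=2048, schema=None, input_files=[], exec_stats=None
        ),
        "my-bucket/part-1.parquet": BlockMetadata(
            num_rows=50, size_bytes=1024, schema=None, input_files=[], exec_stats=None
        ),
    }
    provider = CachedFileMetadataProvider(meta_cache)

    # Ray calls this hook internally; invoked directly here to show the aggregation.
    agg = provider._get_block_metadata(
        paths=["my-bucket/part-0.parquet", "my-bucket/part-1.parquet"],
        schema=None,
    )
    assert agg.num_rows == 150 and agg.size_bytes == 3072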
+ class PathType(str, Enum):
+     MANIFEST = "manifest"
+     FILES_AND_FOLDERS = "files_and_folders"
+
+
+ class DelimitedTextReaderConfig:
+     def __init__(
+         self,
+         csv: bool = False,
+         header: bool = False,
+         delimiter: Optional[str] = None,
+         bzip2: bool = False,
+         gzip: bool = False,
+         zstd: bool = False,
+         add_quotes: Optional[bool] = None,
+         null_as: str = "",
+         escape: bool = False,
+         fixed_width: bool = False,
+     ):
+         self.header = header
+         self.delimiter = delimiter if delimiter else "," if csv else "|"
+         self.bzip2 = bzip2
+         self.gzip = gzip
+         self.zstd = zstd
+         self.add_quotes = add_quotes if add_quotes else True if csv else False
+         self.null_as = null_as
+         self.escape = escape
+         self.fixed_width = fixed_width
+
+     def _get_arrow_compression_codec_name(self) -> Optional[str]:
+         arrow_compression_codec_name = None
+         codecs_enabled = {
+             "bz2": self.bzip2,
+             "gzip": self.gzip,
+             "zstd": self.zstd,
+         }
+         for encoding, flag in codecs_enabled.items():
+             if arrow_compression_codec_name and flag:
+                 raise ValueError(
+                     f"Multiple delimited text compression types specified "
+                     f"({codecs_enabled}). Please ensure that only one "
+                     f"compression type is set and try again."
+                 )
+             if flag:
+                 arrow_compression_codec_name = encoding
+         return arrow_compression_codec_name
+
+     def to_arrow_reader_kwargs(
+         self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
+     ) -> Dict[str, Any]:
+         from pyarrow import csv
+
+         if self.fixed_width:
+             raise NotImplementedError(
+                 "Delimited text files configured with FIXEDWIDTH are not "
+                 "currently supported."
+             )
+         open_stream_args = {}
+         arrow_compression_codec_name = self._get_arrow_compression_codec_name()
+         if arrow_compression_codec_name:
+             open_stream_args["compression"] = arrow_compression_codec_name
+         column_names = None
+         if schema:
+             column_names = schema.names
+         autogen_column_names = False if self.header or column_names else True
+         read_options = csv.ReadOptions(
+             use_threads=False,
+             column_names=column_names,
+             autogenerate_column_names=autogen_column_names,
+         )
+         parse_options = csv.ParseOptions(
+             delimiter=self.delimiter,
+             quote_char='"' if self.add_quotes else False,
+             escape_char="\\" if self.escape else False,
+             double_quote=False if self.escape else True,
+         )
+         convert_options = csv.ConvertOptions(
+             column_types=schema,
+             null_values=[self.null_as] if self.null_as is not None else [],
+             true_values=["t"],
+             false_values=["f"],
+             strings_can_be_null=True if self.null_as is not None else False,
+             quoted_strings_can_be_null=True if self.null_as else False,
+             include_columns=include_columns,
+         )
+         return {
+             "open_stream_args": open_stream_args,
+             "read_options": read_options,
+             "parse_options": parse_options,
+             "convert_options": convert_options,
+         }
+
+
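Editor's note: the kwargs built above are shaped for Ray's Arrow-based delimited text reading path, but the mapping onto PyArrow is easy to see directly. A minimal sketch, assuming a hypothetical local file `events.csv` with a header row:

    from pyarrow import csv

    from deltacat.io.datasource.deltacat_datasource import DelimitedTextReaderConfig

    config = DelimitedTextReaderConfig(csv=True, header=True)
    reader_kwargs = config.to_arrow_reader_kwargs(include_columns=None, schema=None)

    # "events.csv" is a hypothetical path. read_options/parse_options/convert_options
    # are the pyarrow.csv option objects built by to_arrow_reader_kwargs() above;
    # open_stream_args only carries a compression hint for the file-open call.
    table = csv.read_csv(
        "events.csv",
        read_options=reader_kwargs["read_options"],
        parse_options=reader_kwargs["parse_options"],
        convert_options=reader_kwargs["convert_options"],
    )
    print(table.schema)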
+ def normalize_s3_paths_for_filesystem(
+     paths: Union[str, List[str]],
+     filesystem: pyarrow.fs.FileSystem,
+ ) -> Tuple[List[str], List[S3Url]]:
+     urls = []
+     if isinstance(paths, str):
+         paths = [paths]
+     if isinstance(filesystem, S3FileSystem):
+         urls = [parse_s3_url(url) for url in paths]
+         # pyarrow.fs.FileSystem paths should not start with "s3://"
+         # pyarrow.fs.FileSystem paths should not end with "/"
+         paths = [f"{u.bucket}/{u.key}".rstrip("/") for u in urls]
+     return paths, urls
+
+
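Editor's note: a short illustration of the normalization above, with a hypothetical bucket and key:

    import pyarrow.fs

    from deltacat.io.datasource.deltacat_datasource import normalize_s3_paths_for_filesystem

    # Hypothetical S3 location; S3FileSystem() may need credentials/region in practice.
    paths, urls = normalize_s3_paths_for_filesystem(
        "s3://my-bucket/my-table/partition-0/",
        pyarrow.fs.S3FileSystem(),
    )
    # paths == ["my-bucket/my-table/partition-0"]  (scheme and trailing "/" removed)
    # urls[0] is the parsed S3Url for the original path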
+ def _infer_content_types_from_paths(
+     paths: List[str],
+     content_type_provider: Callable[[str], ContentType],
+ ) -> Dict[ContentType, List[str]]:
+     content_type_to_paths = defaultdict(list)
+     for path in paths:
+         if not path.endswith("/"):
+             content_type_to_paths[content_type_provider(path)].append(path)
+     return content_type_to_paths
+
+
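Editor's note: for example, a simple extension-based provider could be passed as `content_type_provider`; the mapping below is an illustrative assumption, not part of the package, and the function above is a private helper:

    from deltacat.io.datasource.deltacat_datasource import _infer_content_types_from_paths
    from deltacat.types.media import ContentType

    def content_type_from_extension(path: str) -> ContentType:
        # Hypothetical mapping from file extension to ContentType.
        if path.endswith(".parquet"):
            return ContentType.PARQUET
        if path.endswith(".csv"):
            return ContentType.CSV
        raise ValueError(f"Cannot infer content type for path: {path}")

    # Directory-like paths (ending in "/") are skipped by _infer_content_types_from_paths.
    grouped = _infer_content_types_from_paths(
        ["data/part-0.parquet", "data/part-1.csv", "data/subdir/"],
        content_type_from_extension,
    )
    # grouped == {ContentType.PARQUET: ["data/part-0.parquet"],
    #             ContentType.CSV: ["data/part-1.csv"]}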
+ def _expand_manifest_paths_by_content_type(
+     manifest: Manifest,
+ ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
+     content_type_to_paths = {}
+     meta_provider = CachedFileMetadataProvider({})
+     if not manifest.entries:
+         logger.warning(f"No entries to read in DeltaCAT Manifest: {manifest}")
+     else:
+         content_type_to_paths, meta_provider = _read_manifest_entry_paths(
+             manifest.entries,
+             manifest.meta.content_type if manifest.meta else None,
+         )
+     # TODO(pdames): infer the schema from a manifest if available?
+     # if not schema and ContentType.PARQUET not in content_type_to_paths:
+     #     schema = _infer_schema_from_manifest(manifest)
+     return content_type_to_paths, meta_provider
+
+
+ def _read_manifest_entry_paths(
+     entries: ManifestEntryList,
+     manifest_content_type: Optional[str],
+ ) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
+     # support manifests with heterogeneous content types
+     content_type_to_paths = defaultdict(list)
+     meta_cache: Dict[str, BlockMetadata] = {}
+     for e in entries:
+         url = e.url if e.url else e.uri
+         # get manifest entry content type or fall back to manifest content type
+         content_type = e.meta.content_type or manifest_content_type
+         if content_type:
+             content_type_to_paths[ContentType(content_type)].append(url)
+         else:
+             # TODO(pdames): fall back to content type inference by file extension
+             raise ValueError(
+                 f"Manifest entry missing content type: {e}. "
+                 f"Please specify a content type for each manifest entry."
+             )
+         meta_cache[url] = BlockMetadata(
+             num_rows=e.meta.record_count,
+             size_bytes=e.meta.content_length,
+             schema=None,
+             input_files=[],
+             exec_stats=None,
+         )
+     return content_type_to_paths, CachedFileMetadataProvider(meta_cache)
+
+
+ def _get_metafile_read_task(
+     metafile: Metafile,
+ ) -> Iterable[Block]:
+     pyarrow_table_dict = {
+         METAFILE_DATA_COLUMN_NAME: [metafile.serialize(METAFILE_FORMAT_MSGPACK)],
+         METAFILE_TYPE_COLUMN_NAME: [Metafile.get_type_name(metafile)],
+     }
+     yield BlockAccessor.batch_to_arrow_block(pyarrow_table_dict)
+
+
+ def _get_metafile_lister_read_task(
+     lister: Callable[[Any], ListResult[Metafile]],
+     all_lister_kwargs: List[Dict[str, Any]],
+ ) -> Iterable[Block]:
+     metafiles = []
+     for lister_kwargs in all_lister_kwargs:
+         metafile_list_result = lister(**lister_kwargs)
+         # TODO(pdames): switch to paginated read
+         metafiles.append(metafile_list_result.all_items())
+     pyarrow_table_dict = {
+         METAFILE_DATA_COLUMN_NAME: [
+             meta.serialize(METAFILE_FORMAT_MSGPACK)
+             for metasublist in metafiles
+             for meta in metasublist
+         ],
+         METAFILE_TYPE_COLUMN_NAME: [
+             Metafile.get_class(meta).__name__
+             for metasublist in metafiles
+             for meta in metasublist
+         ],
+     }
+     yield BlockAccessor.batch_to_arrow_block(pyarrow_table_dict)
+
+
+ class DeltaCatDatasource(Datasource):
+     """Datasource for reading registered DeltaCAT catalog objects."""
+
+     def __init__(
+         self,
+         url: DeltaCatUrl,
+         deltacat_read_type: DeltacatReadType = DeltacatReadType.DATA,
+         timestamp_as_of: Optional[int] = None,
+         merge_on_read: Optional[bool] = False,
+         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ):
+         self._url = url
+         self._reader = DeltaCatUrlReader(url)
+         self._deltacat_read_type = deltacat_read_type
+         self._timestamp_as_of = timestamp_as_of
+         self._merge_on_read = merge_on_read
+         self._filesystem = url.catalog.filesystem
+         self._read_kwargs_provider = read_kwargs_provider
+
+     def estimate_inmemory_data_size(self) -> Optional[int]:
+         """Return an estimate of the in-memory data size, or None if unknown.
+
+         Note that the in-memory data size may be larger than the on-disk data size.
+         """
+         return None
+
+     def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
+         """Execute the read and return read tasks.
+
+         Args:
+             parallelism: The requested read parallelism. The number of read
+                 tasks should be equal to this value if possible.
+
+         Returns:
+             A list of read tasks that can be executed to read blocks from the
+             datasource in parallel.
+         """
+         kwargs = self._read_kwargs_provider(self._url.datastore_type, {})
+         if self._deltacat_read_type == DeltacatReadType.METADATA:
+             # do a shallow read of the top-level DeltaCAT metadata
+             empty_block_metadata = BlockMetadata(
+                 num_rows=None,
+                 size_bytes=None,
+                 schema=None,
+                 input_files=None,
+                 exec_stats=None,
+             )
+             metafile = self._reader.read(**kwargs)
+             read_tasks = [
+                 ReadTask(
+                     lambda: _get_metafile_read_task(metafile),
+                     empty_block_metadata,
+                 )
+             ]
+         elif self._deltacat_read_type == DeltacatReadType.METADATA_LIST:
+             # do a shallow listing of the top-level DeltaCAT metadata
+             logger.debug(f"listers: {self._reader.listers}")
+             listers = copy.deepcopy(self._reader.listers)
+             listers = [listers[0]]
+             read_tasks = self._list_all_metafiles_read_tasks(
+                 parallelism=parallelism,
+                 listers=listers,
+                 **kwargs,
+             )
+         elif self._deltacat_read_type == DeltacatReadType.METADATA_LIST_RECURSIVE:
+             read_tasks = self._list_all_metafiles_read_tasks(
+                 parallelism=parallelism,
+                 listers=copy.deepcopy(self._reader.listers),
+                 **kwargs,
+             )
+
+         elif self._deltacat_read_type == DeltacatReadType.DATA:
+             # do a deep read across all in-scope Delta manifest file paths
+             # recursive is implicitly true for deep data reads
+             # TODO(pdames): For data reads targeting DeltaCAT catalogs, run a
+             #  recursive distributed metadata read first, then a data read
+             #  second.
+             raise NotImplementedError()
+             """
+             list_results = self._list_all_metafiles(**kwargs)
+             deltas: List[Delta] = list_results[len(list_results) - 1]
+             read_tasks = []
+             for delta in deltas:
+                 read_tasks.append(
+                     self._get_delta_manifest_read_tasks(
+                         delta.manifest,
+                         parallelism,
+                     ),
+                 )
+             """
+         else:
+             raise NotImplementedError(
+                 f"Unsupported DeltaCAT read type: {self._deltacat_read_type}"
+             )
+
+         return read_tasks
+
+     def _list_all_metafiles_read_tasks(
+         self,
+         parallelism: int,
+         listers: List[Callable[[Any], ListResult[Metafile]]],
+         **kwargs,
+     ) -> List[ReadTask]:
+         list_results: List[ListResult[Metafile]] = []
+         # the first lister doesn't have any missing keyword args
+         (
+             first_lister,
+             first_kwarg_name,
+             first_kwarg_val_resolver_fn,
+         ) = listers.pop(0)
+         if listers:
+             metafile_list_result = first_lister(**kwargs)
+             list_results.append(metafile_list_result)
+             (
+                 last_lister,
+                 last_kwarg_name,
+                 last_kwarg_val_resolver_fn,
+             ) = listers.pop()
+         else:
+             metafile_list_result = None
+             (
+                 last_lister,
+                 last_kwarg_name,
+                 last_kwarg_val_resolver_fn,
+             ) = (first_lister, first_kwarg_name, first_kwarg_val_resolver_fn)
+         for lister, kwarg_name, kwarg_val_resolver_fn in listers:
+             # each subsequent lister needs to inject missing keyword args from the parent metafile
+             for metafile in metafile_list_result.all_items():
+                 kwargs_update = (
+                     {kwarg_name: kwarg_val_resolver_fn(metafile)}
+                     if kwarg_name and kwarg_val_resolver_fn
+                     else {}
+                 )
+                 lister_kwargs = {
+                     **kwargs,
+                     **kwargs_update,
+                 }
+                 metafile_list_result = lister(**lister_kwargs)
+                 list_results.append(metafile_list_result)
+         empty_block_metadata = BlockMetadata(
+             num_rows=None,
+             size_bytes=None,
+             schema=None,
+             input_files=None,
+             exec_stats=None,
+         )
+         if metafile_list_result:
+             # use a single read task to materialize all prior metafiles read
+             # as an arrow table block
+             # (very lightweight, so not counted against target parallelism)
+             read_tasks = [
+                 ReadTask(
+                     read_fn=functools.partial(
+                         _get_metafile_lister_read_task,
+                         lister=lambda all_list_results: ListResult.of(
+                             [
+                                 item
+                                 for list_result in all_list_results
+                                 for item in list_result.all_items()
+                             ]
+                         ),
+                         all_lister_kwargs=[{"all_list_results": list_results}],
+                     ),
+                     metadata=empty_block_metadata,
+                 )
+             ]
+             # parallelize the listing of all metafile leaf nodes
+             split_metafiles = np.array_split(
+                 metafile_list_result.all_items(),
+                 parallelism,
+             )
+             for metafiles in split_metafiles:
+                 all_lister_kwargs = []
+                 for metafile in metafiles:
+                     kwargs_update = (
+                         {last_kwarg_name: last_kwarg_val_resolver_fn(metafile)}
+                         if last_kwarg_name and last_kwarg_val_resolver_fn
+                         else {}
+                     )
+                     lister_kwargs = {
+                         **kwargs,
+                         **kwargs_update,
+                     }
+                     all_lister_kwargs.append(lister_kwargs)
+                 read_tasks.append(
+                     ReadTask(
+                         read_fn=functools.partial(
+                             _get_metafile_lister_read_task,
+                             lister=last_lister,
+                             all_lister_kwargs=all_lister_kwargs,
+                         ),
+                         metadata=empty_block_metadata,
+                     )
+                 )
+         else:
+             # first lister is also the last lister (i.e., shallow listing)
+             read_tasks = [
+                 ReadTask(
+                     read_fn=functools.partial(
+                         _get_metafile_lister_read_task,
+                         lister=last_lister,
+                         all_lister_kwargs=[kwargs],
+                     ),
+                     metadata=empty_block_metadata,
+                 )
+             ]
+         return read_tasks
+
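Editor's note: the `listers` argument is consumed as a list of `(lister, kwarg_name, kwarg_val_resolver_fn)` triples, where each lister after the first has one keyword argument resolved from each metafile returned by its parent. The real triples come from `DeltaCatUrlReader.listers`, which is not shown in this diff; the stand-in functions and the `m.id` resolver below are assumptions for illustration only:

    from deltacat.storage import ListResult, Metafile

    # Hypothetical stand-ins for catalog listing functions.
    def list_namespaces() -> ListResult[Metafile]:
        return ListResult.of([])

    def list_tables(namespace: str) -> ListResult[Metafile]:
        return ListResult.of([])

    listers = [
        # (lister, kwarg_name, kwarg_val_resolver_fn)
        (list_namespaces, None, None),               # root lister: no injected kwarg
        (list_tables, "namespace", lambda m: m.id),  # child: one kwarg resolved per parent item
    ]
    # _list_all_metafiles_read_tasks() runs the root lister eagerly, walks the
    # intermediate listers to enumerate parents, and fans the final lister out
    # across `parallelism` read tasks via np.array_split().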
+     """
+
+     def _get_delta_manifest_read_tasks(
+         self,
+         delta_manifest: Manifest,
+         parallelism: int,
+     ) -> List[ReadTask]:
+         # find all files in the Delta manifest
+         content_type_to_paths, meta_provider = _expand_manifest_paths_by_content_type(
+             delta_manifest,
+             self._filesystem,
+         )
+         num_content_types = len(content_type_to_paths)
+         if num_content_types > 1 and not schema:
+             # infer schema from a single parquet file
+             # TODO (pdames): read verbose manifest schema if available, and infer
+             #  schema from a sample parquet dataset if not
+             path = content_type_to_paths[ContentType.PARQUET][0]
+             with resolved_fs.open_input_file(path, **open_stream_args) as f:
+                 schema = pq.read_schema(f)
+         content_type_to_reader = {
+             ContentType.PARQUET: ParquetDatasource(),
+             ContentType.CSV: CSVDatasource(),
+         }
+         all_read_tasks = []
+         for content_type, paths in content_type_to_paths.items():
+             reader = content_type_to_reader.get(content_type)
+             assert reader, f"No datasource found for: {content_type}"
+             prepare_read_kwargs = {
+                 "parallelism": parallelism,
+                 "paths": paths,
+                 "filesystem": self._filesystem,
+                 "schema": schema,
+                 "meta_provider": meta_provider,
+             }
+             if content_type == ContentType.PARQUET:
+                 if columns:
+                     prepare_read_kwargs["columns"] = columns
+             elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
+                 prepare_read_kwargs.update(
+                     csv_reader_config.to_arrow_reader_kwargs(columns, schema)
+                 )
+             else:
+                 raise NotImplementedError(f"Unsupported content type: {content_type}")
+             # merge any provided reader kwargs for this content type with those
+             # inferred from CSV Reader Config
+             if read_kwargs_provider:
+                 prepare_read_kwargs = read_kwargs_provider(
+                     content_type,
+                     prepare_read_kwargs,
+                 )
+             # explicitly specified `open_stream_args` override those inferred
+             # from CSV Reader Config
+             if open_stream_args:
+                 prepare_read_kwargs["open_stream_args"] = open_stream_args
+             read_tasks = reader.prepare_read(**prepare_read_kwargs)
+             all_read_tasks.extend(read_tasks)
+         return all_read_tasks
+
+     def prepare_read(
+         self,
+         parallelism: int,
+         paths: Union[str, List[str]],
+         content_type_provider: Callable[[str], ContentType],
+         path_type: PathType = PathType.MANIFEST,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         columns: Optional[List[str]] = None,
+         schema: Optional[pa.Schema] = None,
+         csv_reader_config: DelimitedTextReaderConfig = DelimitedTextReaderConfig(),
+         partitioning: HivePartitionParser = None,
+         open_stream_args: Optional[Dict[str, Any]] = None,
+         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+         **s3_client_kwargs,
+     ) -> List[ReadTask]:
+         pass
+     """
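Editor's note: for context, a minimal sketch of how this datasource might be consumed from Ray Data. The catalog URL string, its format, and the pass-through kwargs provider below are assumptions for illustration; `get_read_tasks()` calls the provider as `provider(datastore_type, {})`, so some provider must be supplied, and the URL must resolve to a catalog with a registered filesystem:

    import ray

    from deltacat.io.datasource.deltacat_datasource import (
        DeltaCatDatasource,
        DeltacatReadType,
    )
    from deltacat.utils.url import DeltaCatUrl

    # Hypothetical catalog URL; the exact DeltaCatUrl constructor/format is not
    # shown in this diff.
    url = DeltaCatUrl("dc://my_catalog/my_namespace/my_table")

    datasource = DeltaCatDatasource(
        url=url,
        deltacat_read_type=DeltacatReadType.METADATA_LIST_RECURSIVE,
        # Pass-through provider: get_read_tasks() invokes it with (datastore_type, {}).
        read_kwargs_provider=lambda datastore_type, kwargs: kwargs,
    )

    # Each resulting row holds a msgpack-serialized metafile and its type name.
    ds = ray.data.read_datasource(datasource, parallelism=4)
    print(ds.schema())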