deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/io/datasource/deltacat_datasource.py (new file)
@@ -0,0 +1,580 @@
import copy
import functools
import logging

from collections import defaultdict
from enum import Enum
from typing import Union, List, Callable, Optional, Dict, Any, Tuple, Iterable

import numpy as np

import pyarrow as pa
import pyarrow.fs
from pyarrow.fs import S3FileSystem

from ray.data import (
    Datasource,
    ReadTask,
)
from ray.data.block import BlockMetadata, Block, BlockAccessor
from ray.data.datasource import (
    FastFileMetadataProvider,
    ParquetMetadataProvider,
)

from deltacat.constants import METAFILE_FORMAT_MSGPACK
from deltacat.aws.s3u import (
    S3Url,
    parse_s3_url,
)
from deltacat.types.media import (
    ContentType,
)
from deltacat.storage import (
    Manifest,
    ManifestEntryList,
)
from deltacat.utils.common import ReadKwargsProvider
from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlReader
from deltacat.storage import (
    Metafile,
    ListResult,
)
from deltacat import logs

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

METAFILE_DATA_COLUMN_NAME = "deltacat_metafile_data"
METAFILE_TYPE_COLUMN_NAME = "deltacat_metafile_type"


class DeltacatReadType(str, Enum):
    METADATA = "metadata"  # get only a single metafile
    METADATA_LIST = "metadata_list"  # list top-level metafiles
    METADATA_LIST_RECURSIVE = "metadata_recursive"  # list all metafiles
    DATA = "data"  # read all data files


class CachedFileMetadataProvider(
    FastFileMetadataProvider,
    ParquetMetadataProvider,
):
    def __init__(self, meta_cache: Dict[str, BlockMetadata]):
        self._meta_cache = meta_cache

    def get_meta_cache(self) -> Dict[str, BlockMetadata]:
        return self._meta_cache

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, pa.Schema]],
        **kwargs,
    ) -> BlockMetadata:
        agg_block_metadata = BlockMetadata(
            num_rows=0,
            size_bytes=0,
            schema=schema,
            input_files=[],
            exec_stats=None,
        )
        for path in paths:
            block_metadata = self._meta_cache.get(path)
            if block_metadata is None:
                raise ValueError(f"Block metadata not found for path: {path}")
            if block_metadata.num_rows is None:
                agg_block_metadata.num_rows = None
            elif agg_block_metadata.num_rows is not None:
                agg_block_metadata.num_rows += block_metadata.num_rows
            if block_metadata.size_bytes is None:
                agg_block_metadata.size_bytes = None
            elif agg_block_metadata.size_bytes is not None:
                agg_block_metadata.size_bytes += block_metadata.size_bytes
            agg_block_metadata.input_files.append(path)
        return agg_block_metadata


class PathType(str, Enum):
    MANIFEST = "manifest"
    FILES_AND_FOLDERS = "files_and_folders"


class DelimitedTextReaderConfig:
    def __init__(
        self,
        csv: bool = False,
        header: bool = False,
        delimiter: Optional[str] = None,
        bzip2: bool = False,
        gzip: bool = False,
        zstd: bool = False,
        add_quotes: Optional[bool] = None,
        null_as: str = "",
        escape: bool = False,
        fixed_width: bool = False,
    ):
        self.header = header
        self.delimiter = delimiter if delimiter else "," if csv else "|"
        self.bzip2 = bzip2
        self.gzip = gzip
        self.zstd = zstd
        self.add_quotes = add_quotes if add_quotes else True if csv else False
        self.null_as = null_as
        self.escape = escape
        self.fixed_width = fixed_width

    def _get_arrow_compression_codec_name(self) -> str:
        arrow_compression_codec_name = None
        codecs_enabled = {
            "bz2": self.bzip2,
            "gzip": self.gzip,
            "zstd": self.zstd,
        }
        for encoding, flag in codecs_enabled.items():
            if arrow_compression_codec_name and flag:
                raise ValueError(
                    f"Multiple delimited text compression types specified "
                    f"({codecs_enabled}). Please ensure that only one "
                    f"compression type is set and try again."
                )
            if flag:
                arrow_compression_codec_name = encoding
        return arrow_compression_codec_name

    def to_arrow_reader_kwargs(
        self, include_columns: Optional[List[str]], schema: Optional[pa.Schema]
    ) -> Dict[str, Any]:
        from pyarrow import csv

        if self.fixed_width:
            raise NotImplementedError(
                "Delimited text files configured with FIXEDWIDTH are not "
                "currently supported."
            )
        open_stream_args = {}
        arrow_compression_codec_name = self._get_arrow_compression_codec_name()
        if arrow_compression_codec_name:
            open_stream_args["compression"] = arrow_compression_codec_name
        column_names = None
        if schema:
            column_names = schema.names
        autogen_column_names = False if self.header or column_names else True
        read_options = csv.ReadOptions(
            use_threads=False,
            column_names=column_names,
            autogenerate_column_names=autogen_column_names,
        )
        parse_options = csv.ParseOptions(
            delimiter=self.delimiter,
            quote_char='"' if self.add_quotes else False,
            escape_char="\\" if self.escape else False,
            double_quote=False if self.escape else True,
        )
        convert_options = csv.ConvertOptions(
            column_types=schema,
            null_values=[self.null_as] if self.null_as is not None else [],
            true_values=["t"],
            false_values=["f"],
            strings_can_be_null=True if self.null_as is not None else False,
            quoted_strings_can_be_null=True if self.null_as else False,
            include_columns=include_columns,
        )
        return {
            "open_stream_args": open_stream_args,
            "read_options": read_options,
            "parse_options": parse_options,
            "convert_options": convert_options,
        }


def normalize_s3_paths_for_filesystem(
    paths: Union[str, List[str]],
    filesystem: pyarrow.fs.FileSystem,
) -> Tuple[List[str], List[S3Url]]:
    urls = []
    if isinstance(paths, str):
        paths = [paths]
    if isinstance(filesystem, S3FileSystem):
        urls = [parse_s3_url(url) for url in paths]
        # pyarrow.fs.FileSystem paths should not start with "s3://"
        # pyarrow.fs.FileSystem paths should not end with "/"
        paths = [f"{u.bucket}/{u.key}".rstrip("/") for u in urls]
    return paths, urls


def _infer_content_types_from_paths(
    paths: List[str],
    content_type_provider: Callable[[str], ContentType],
) -> Dict[ContentType, List[str]]:
    content_type_to_paths = defaultdict(list)
    for path in paths:
        if not path.endswith("/"):
            content_type_to_paths[content_type_provider(path)].append(path)
    return content_type_to_paths


def _expand_manifest_paths_by_content_type(
    manifest: Manifest,
) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
    content_type_to_paths = {}
    meta_provider = CachedFileMetadataProvider({})
    if not manifest.entries:
        logger.warning(f"No entries to read in DeltaCAT Manifest: {manifest}")
    else:
        content_type_to_paths, meta_provider = _read_manifest_entry_paths(
            manifest.entries,
            manifest.meta.content_type if manifest.meta else None,
        )
    # TODO(pdames): infer the schema from a manifest if available?
    # if not schema and ContentType.PARQUET not in content_type_to_paths:
    #     schema = _infer_schema_from_manifest(manifest)
    return content_type_to_paths, meta_provider


def _read_manifest_entry_paths(
    entries: ManifestEntryList,
    manifest_content_type: Optional[str],
) -> Tuple[Dict[ContentType, List[str]], CachedFileMetadataProvider]:
    # support manifests with heterogenous content types
    content_type_to_paths = defaultdict(list)
    meta_cache: Dict[str, BlockMetadata] = {}
    for e in entries:
        url = e.url if e.url else e.uri
        # get manifest entry content type or fall back to manifest content type
        content_type = e.meta.content_type or manifest_content_type
        if content_type:
            content_type_to_paths[ContentType(content_type)] = url
        else:
            # TODO(pdames): fall back to content type inference by file extension
            raise ValueError(
                f"Manifest entry missing content type: {e}. "
                f"Please specify a content type for each manifest entry."
            )
        meta_cache[url] = BlockMetadata(
            num_rows=e.meta.record_count,
            size_bytes=e.meta.content_length,
            schema=None,
            input_files=[],
            exec_stats=None,
        )
    return content_type_to_paths, CachedFileMetadataProvider(meta_cache)


def _get_metafile_read_task(
    metafile: Metafile,
) -> Iterable[Block]:
    pyarrow_table_dict = {
        METAFILE_DATA_COLUMN_NAME: [metafile.serialize(METAFILE_FORMAT_MSGPACK)],
        METAFILE_TYPE_COLUMN_NAME: [Metafile.get_type_name(metafile)],
    }
    yield BlockAccessor.batch_to_arrow_block(pyarrow_table_dict)


def _get_metafile_lister_read_task(
    lister: Callable[[Any], ListResult[Metafile]],
    all_lister_kwargs: List[Dict[str, Any]],
) -> Iterable[Block]:
    metafiles = []
    for lister_kwargs in all_lister_kwargs:
        metafile_list_result = lister(**lister_kwargs)
        # TODO(pdames): switch to paginated read
        metafiles.append(metafile_list_result.all_items())
    pyarrow_table_dict = {
        METAFILE_DATA_COLUMN_NAME: [
            meta.serialize(METAFILE_FORMAT_MSGPACK)
            for metasublist in metafiles
            for meta in metasublist
        ],
        METAFILE_TYPE_COLUMN_NAME: [
            Metafile.get_class(meta).__name__
            for metasublist in metafiles
            for meta in metasublist
        ],
    }
    yield BlockAccessor.batch_to_arrow_block(pyarrow_table_dict)


class DeltaCatDatasource(Datasource):
    """Datasource for reading registered DeltaCAT catalog objects."""

    def __init__(
        self,
        url: DeltaCatUrl,
        deltacat_read_type: DeltacatReadType = DeltacatReadType.DATA,
        timestamp_as_of: Optional[int] = None,
        merge_on_read: Optional[bool] = False,
        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ):
        self._url = url
        self._reader = DeltaCatUrlReader(url)
        self._deltacat_read_type = deltacat_read_type
        self._timestamp_as_of = timestamp_as_of
        self._merge_on_read = merge_on_read
        self._filesystem = url.catalog.filesystem
        self._read_kwargs_provider = read_kwargs_provider

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Return an estimate of the in-memory data size, or None if unknown.

        Note that the in-memory data size may be larger than the on-disk data size.
        """
        return None

    def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
        """Execute the read and return read tasks.

        Args:
            parallelism: The requested read parallelism. The number of read
                tasks should equal to this value if possible.

        Returns:
            A list of read tasks that can be executed to read blocks from the
            datasource in parallel.
        """
        kwargs = self._read_kwargs_provider(self._url.datastore_type, {})
        if self._deltacat_read_type == DeltacatReadType.METADATA:
            # do a shallow read of the top-level DeltaCAT metadata
            empty_block_metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=None,
                input_files=None,
                exec_stats=None,
            )
            metafile = self._reader.read(**kwargs)
            read_tasks = [
                ReadTask(
                    lambda: _get_metafile_read_task(metafile),
                    empty_block_metadata,
                )
            ]
        elif self._deltacat_read_type == DeltacatReadType.METADATA_LIST:
            # do a shallow read of the top-level DeltaCAT metadata
            print(f"listers: {self._reader.listers}")
            listers = copy.deepcopy(self._reader.listers)
            listers = [listers[0]]
            read_tasks = self._list_all_metafiles_read_tasks(
                parallelism=parallelism,
                listers=listers,
                **kwargs,
            )
        elif self._deltacat_read_type == DeltacatReadType.METADATA_LIST_RECURSIVE:
            read_tasks = self._list_all_metafiles_read_tasks(
                parallelism=parallelism,
                listers=copy.deepcopy(self._reader.listers),
                **kwargs,
            )

        elif self._deltacat_read_type == DeltacatReadType.DATA:
            # do a deep read across all in-scope Delta manifest file paths
            # recursive is implicitly true for deep data reads
            # TODO(pdames): For data reads targeting DeltaCAT catalogs, run a
            #  recursive distributed metadata read first, then a data read
            #  second.
            raise NotImplementedError()
            """
            list_results = self._list_all_metafiles(**kwargs)
            deltas: List[Delta] = list_results[len(list_results) - 1]
            read_tasks = []
            for delta in deltas:
                read_tasks.append(
                    self._get_delta_manifest_read_tasks(
                        delta.manifest,
                        parallelism,
                    ),
                )
            """
        else:
            raise NotImplementedError(
                f"Unsupported DeltaCAT read type: {self._deltacat_read_type}"
            )

        return read_tasks

    def _list_all_metafiles_read_tasks(
        self,
        parallelism: int,
        listers: List[Callable[[Any], ListResult[Metafile]]],
        **kwargs,
    ) -> List[ReadTask]:
        list_results: List[ListResult[Metafile]] = []
        # the first lister doesn't have any missing keyword args
        (
            first_lister,
            first_kwarg_name,
            first_kwarg_val_resolver_fn,
        ) = listers.pop(0)
        if listers:
            metafile_list_result = first_lister(**kwargs)
            list_results.append(metafile_list_result)
            (
                last_lister,
                last_kwarg_name,
                last_kwarg_val_resolver_fn,
            ) = listers.pop()
        else:
            metafile_list_result = None
            (
                last_lister,
                last_kwarg_name,
                last_kwarg_val_resolver_fn,
            ) = (first_lister, first_kwarg_name, first_kwarg_val_resolver_fn)
        for lister, kwarg_name, kwarg_val_resolver_fn in listers:
            # each subsequent lister needs to inject missing keyword args from the parent metafile
            for metafile in metafile_list_result.all_items():
                kwargs_update = (
                    {kwarg_name: kwarg_val_resolver_fn(metafile)}
                    if kwarg_name and kwarg_val_resolver_fn
                    else {}
                )
                lister_kwargs = {
                    **kwargs,
                    **kwargs_update,
                }
                metafile_list_result = lister(**lister_kwargs)
                list_results.append(metafile_list_result)
        empty_block_metadata = BlockMetadata(
            num_rows=None,
            size_bytes=None,
            schema=None,
            input_files=None,
            exec_stats=None,
        )
        if metafile_list_result:
            # use a single read task to materialize all prior metafiles read
            # as an arrow table block
            # (very lightweight, so not counted against target parallelism)
            read_tasks = [
                ReadTask(
                    read_fn=functools.partial(
                        _get_metafile_lister_read_task,
                        lister=lambda all_list_results: ListResult.of(
                            [
                                item
                                for list_result in all_list_results
                                for item in list_result.all_items()
                            ]
                        ),
                        all_lister_kwargs=[{"all_list_results": list_results}],
                    ),
                    metadata=empty_block_metadata,
                )
            ]
            # parallelize the listing of all metafile leaf nodes
            split_metafiles = np.array_split(
                metafile_list_result.all_items(),
                parallelism,
            )
            for metafiles in split_metafiles:
                all_lister_kwargs = []
                for metafile in metafiles:
                    kwargs_update = (
                        {last_kwarg_name: last_kwarg_val_resolver_fn(metafile)}
                        if last_kwarg_name and last_kwarg_val_resolver_fn
                        else {}
                    )
                    lister_kwargs = {
                        **kwargs,
                        **kwargs_update,
                    }
                    all_lister_kwargs.append(lister_kwargs)
                read_tasks.append(
                    ReadTask(
                        read_fn=functools.partial(
                            _get_metafile_lister_read_task,
                            lister=last_lister,
                            all_lister_kwargs=all_lister_kwargs,
                        ),
                        metadata=empty_block_metadata,
                    )
                )
        else:
            # first lister is also the last lister (i.e., shallow listing)
            read_tasks = [
                ReadTask(
                    read_fn=functools.partial(
                        _get_metafile_lister_read_task,
                        lister=last_lister,
                        all_lister_kwargs=[kwargs],
                    ),
                    metadata=empty_block_metadata,
                )
            ]
        return read_tasks

    """

    def _get_delta_manifest_read_tasks(
        self,
        delta_manifest: Manifest,
        parallelism: int,
    ) -> List[ReadTask]:
        # find all files in the Delta manifest
        content_type_to_paths, meta_provider = _expand_manifest_paths_by_content_type(
            delta_manifest,
            self._filesystem,
        )
        num_content_types = len(content_type_to_paths)
        if num_content_types > 1 and not schema:
            # infer schema from a single parquet file
            # TODO (pdames): read verbose manifest schema if available, and infer
            #  schema from a sample parquet dataset if not
            path = content_type_to_paths[ContentType.PARQUET][0]
            with resolved_fs.open_input_file(path, **open_stream_args) as f:
                schema = pq.read_schema(f)
        content_type_to_reader = {
            ContentType.PARQUET: ParquetDatasource(),
            ContentType.CSV: CSVDatasource(),
        }
        all_read_tasks = []
        for content_type, paths in content_type_to_paths.items():
            reader = content_type_to_reader.get(content_type)
            assert reader, f"No datasource found for: {content_type}"
            prepare_read_kwargs = {
                "parallelism": parallelism,
                "paths": paths,
                "filesystem": self._filesystem,
                "schema": schema,
                "meta_provider": meta_provider,
            }
            if content_type == ContentType.PARQUET:
                if columns:
                    prepare_read_kwargs["columns"] = columns
            elif content_type in DELIMITED_TEXT_CONTENT_TYPES:
                prepare_read_kwargs.update(
                    csv_reader_config.to_arrow_reader_kwargs(columns, schema)
                )
            else:
                raise NotImplementedError(f"Unsupported content type: {content_type}")
            # merge any provided reader kwargs for this content type with those
            # inferred from CSV Reader Config
            if read_kwargs_provider:
                prepare_read_kwargs = read_kwargs_provider(
                    content_type,
                    prepare_read_kwargs,
                )
            # explicitly specified `open_stream_args` override those inferred
            # from CSV Reader Config
            if open_stream_args:
                prepare_read_kwargs["open_stream_args"] = open_stream_args
            read_tasks = reader.prepare_read(**prepare_read_kwargs)
            all_read_tasks.extend(read_tasks)
        return all_read_tasks

    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        content_type_provider: Callable[[str], ContentType],
        path_type: PathType = PathType.MANIFEST,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
        columns: Optional[List[str]] = None,
        schema: Optional[pa.Schema] = None,
        csv_reader_config: DelimitedTextReaderConfig = DelimitedTextReaderConfig(),
        partitioning: HivePartitionParser = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
        **s3_client_kwargs,
    ) -> List[ReadTask]:
        pass
    """
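For context, a minimal usage sketch of the new datasource follows. It is not part of the diff: the "dc://..." URL string and the inline kwargs provider are illustrative assumptions, while DeltaCatDatasource, DeltacatReadType, and ray.data.read_datasource come from the code above and Ray's public API.

import ray

from deltacat.io.datasource.deltacat_datasource import (
    DeltaCatDatasource,
    DeltacatReadType,
)
from deltacat.utils.url import DeltaCatUrl

# Hypothetical catalog URL; the real DeltaCatUrl string format may differ.
url = DeltaCatUrl("dc://my-catalog/my-namespace/my-table")
datasource = DeltaCatDatasource(
    url,
    deltacat_read_type=DeltacatReadType.METADATA_LIST_RECURSIVE,
    # get_read_tasks() calls the provider unconditionally, so supply a
    # pass-through provider that returns the given reader kwargs unchanged.
    read_kwargs_provider=lambda datastore_type, kwargs: kwargs,
)
# Each row of the resulting Ray Dataset carries a msgpack-serialized metafile
# ("deltacat_metafile_data") and its type name ("deltacat_metafile_type").
ds = ray.data.read_datasource(datasource)
print(ds.take(5))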