deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/api.py
CHANGED
@@ -1,56 +1,193 @@
- [2 removed lines not shown]
+import time
+from dataclasses import dataclass
+from typing import Any, Union, List, Optional, Dict, Callable, Tuple

+import ray
 import deltacat as dc
- [1 removed line not shown]
+import pyarrow.fs as pafs
+
+from pyarrow.fs import FileType
+from ray.exceptions import OutOfMemoryError
+
+from deltacat.constants import BYTES_PER_GIBIBYTE
+from deltacat.io import (
+    read_deltacat,
+    DeltacatReadType,
+)
+from deltacat.storage import (
+    Dataset,
+    DistributedDataset,
+    ListResult,
+    LocalTable,
+    Metafile,
+)
+from deltacat.types.media import DatasetType
+from deltacat.utils.url import (
+    DeltaCatUrl,
+    DeltaCatUrlReader,
+    DeltaCatUrlWriter,
+)
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat.types.tables import (
+    get_table_size,
+    get_table_length,
+)
+from deltacat.utils.filesystem import (
+    resolve_path_and_filesystem,
+    get_file_info,
+)
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.ray_utils.runtime import (
+    current_node_resources,
+    live_cpu_waiter,
+    live_node_resource_keys,
+    other_live_node_resource_keys,
+    find_max_single_node_resource_type,
+)
+
+"""
+# CLI Example of Copying from Source to Dest without file conversion
+# (i.e., register only - shallow copy):
+$ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
+$ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
+
+# CLI Example of Copying from Source to Dest without file conversion
+# (i.e., register only - deep copy):
+$ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table -r
+# The above command will make a deep copy of all JSON files found in the source
+# to the catalog data file directory in the destination.
+
+# CLI Example of Copying from Source to Dest with file conversion
+# (i.e., deep copy with file content type transformation):
+$ dcat convert json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/ --type FEATHER
+# The above command will read JSON files found in the source, transform them to
+# Arrow Feather files, and register them in the destination.

+# Python Example of Copying from Source to Dest with file conversion
+# (i.e., deep copy with file content type transformation):
+>>> ds = dc.get("json+s3://my_bucket/log_manager/")
+>>> dc.put("dc://my_deltacat_catalog/log_manager/", dataset=ds, type=ContentType.FEATHER)
+# Or, equivalently, we can do the write directly from the dataset:
+>>> ds.write_deltacat("dc://my_deltacat_catalog/log_manager/", type=ContentType.FEATHER)
+"""

- [2 removed lines not shown]
+
+def _copy_dc(
+    source: DeltaCatUrl,
+    destination: DeltaCatUrl,
+    recursive: bool = False,
+) -> Metafile:
+    if recursive:
+        src_obj = list(source, recursive=True)
+    else:
+        src_obj = get(source) if not source.url.endswith("/*") else list(source)
+    """
+    dc_dest_url = DeltacatUrl(destination)
+    # TODO(pdames): Add writer with support for Ray Dataset DeltaCAT Sink &
+    #  Recursive DeltaCAT source object copies. Ideally, the Ray Dataset read
+    #  is lazy, and only indexes metadata about the objects at source instead
+    #  of eagerly converting them to PyArrow-based Blocks.
+    dc_dest_url.writer(src_obj, recursive=recursive)
+    """
+
+    src_parts = source.url.split("/")
     src_parts = [part for part in src_parts if part]
-    dst_parts = destination.split("/")
+    dst_parts = destination.url.split("/")
     dst_parts = [part for part in dst_parts if part]
- [2 removed lines not shown]
-    if len(src_parts) != len(dst_parts) and len(src_parts) != len(dst_parts) + 1:
+    dc.raise_if_not_initialized()
+    if len(src_parts) != len(dst_parts):
         # TODO(pdames): Better error message.
         raise ValueError(
             f"Cannot copy {source} to {destination}. "
             f"Source and destination must share the same type."
         )
- [15 removed lines not shown]
+    return put(destination, metafile=src_obj)
+
+
+def copy(
+    src: DeltaCatUrl,
+    dst: DeltaCatUrl,
+    *,
+    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+    extension_to_memory_multiplier: Dict[str, float] = {
+        "pq": 5,
+        "parquet": 5,
+        "feather": 1.5,
+        "arrow": 1.5,
+        "csv": 1.5,
+        "tsv": 1.5,
+        "psv": 1.5,
+        "txt": 1.5,
+        "json": 1.5,
+        "jsonl": 1.5,
+        "gz": 35,
+        "bz2": 35,
+        "zip": 35,
+        "7z": 35,
+        "*": 2.5,
+    },
+    minimum_worker_cpus: int = 0,
+    reader_args: Dict[str, Any] = {},
+    writer_args: Dict[str, Any] = {},
+    filesystem: Optional[pafs.FileSystem] = None,
+) -> Union[Metafile, str]:
+    """
+    Copies data from the source datastore to the destination datastore. By
+    default, this method launches one parallel Ray process to read/transform
+    each input file found in the source followed by one parallel Ray process
+    to write each output file to the destination. Files written to the
+    destination are split or combined to contain uniform record counts. To
+    ensure that adequate resources are available to complete the operation,
+    you may optionally specify minimum cluster and/or worker CPUs to wait for
+    before starting parallel processing.
+
+    Args:
+        src: DeltaCAT URL of the source datastore to read.
+        dst: DeltaCAT URL of the destination datastore to write.
+        transforms: List of transforms to apply to the source dataset prior
+            to write it to the destination datastore. Transforms take the in-memory
+            dataset type read (e.g., Polars DataFrame) and source DeltaCAT URL as
+            input and return the same dataset type as output. Transforms are
+            applied to the dataset in the order given.
+        extension_to_memory_multiplier: Dictionary of file extensions to
+            in-memory inflation estimates for that extension (i.e., the amount
+            of memory required to read a source file, apply transforms, and write
+            it back to a destination file).
+        minimum_worker_cpus: The minimum number of Ray worker CPUs
+            to wait for before starting distributed execution. Useful for cases
+            where the operation is known to suffer from resource starvation (e.g.,
+            out-of-memory errors) if started before the cluster has launched a
+            minimum number of required worker nodes.
+        reader_args: Additional keyword arguments to forward to the reader
+            associated with the in-memory dataset and datastore type to read
+            (e.g., polars.read_csv(**kwargs)).
+        writer_args: Additional keyword arguments to forward to the writer
+            associated with the in-memory dataset type read and datastore type to
+            write (e.g., polars.DataFrame.write_parquet(**kwargs)).
+        filesystem: Optional PyArrow filesystem to use for file IO. Will be
+            automatically resolved from the input path if not specified, and
+            will attempt to automatically resolve storage read/write
+            credentials for the associated source/dest file cloud provider(s).
+            Try providing your own filesystem with credentials, retry strategy,
+            etc. pre-configured if you encounter latency issues or errors
+            reading/writing files.
+
+    Returns:
+        None
+    """
+    if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
+        return _copy_dc(src, dst, recursive=src.url.endswith("/**"))
+    else:
+        return _copy_external_ray(
+            src,
+            dst,
+            transforms=transforms,
+            extension_to_memory_multiplier=extension_to_memory_multiplier,
+            minimum_worker_cpus=minimum_worker_cpus,
+            reader_args=reader_args,
+            writer_args=writer_args,
+            filesystem=filesystem,
         )
-        return new_namespace
-    elif len(src_parts) == 3:
-        # copy the given table
-        raise NotImplementedError
-    elif len(src_parts) == 4:
-        # copy the given table version
-        raise NotImplementedError
-    elif len(src_parts) == 5:
-        # copy the given stream
-        raise NotImplementedError
-    elif len(src_parts) == 6:
-        # copy the given partition
-        raise NotImplementedError
-    elif len(src_parts) == 7:
-        # copy the given partition delta
-        raise NotImplementedError
-    raise ValueError(f"Invalid path: {src_parts}")


 def concat(source, destination):
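For orientation, here is a minimal usage sketch of the new copy() entry point introduced in this hunk. It is an illustration under a few assumptions, not part of the diff: Ray is already initialized, copy is importable from deltacat.api, DeltaCatUrl accepts a URL string, and the bucket paths and URL schemes (modeled on the json+s3://... examples in the module docstring) are placeholders.

import ray
from deltacat.api import copy
from deltacat.utils.url import DeltaCatUrl

ray.init(ignore_reinit_error=True)

# Read one JSON file from S3 (hypothetical path), convert it in memory via the
# Polars-based copy path, and write it back out as Parquet at the destination.
copy(
    DeltaCatUrl("json+s3://my_bucket/log_manager/events.json"),
    DeltaCatUrl("parquet+s3://my_bucket/converted/events.parquet"),
    minimum_worker_cpus=0,  # don't block waiting for extra worker nodes
    writer_args={"compression": "zstd"},  # forwarded to the destination writer
)

The extension_to_memory_multiplier default shown above is what sizes the Ray copy task's memory request (multiplier times the source file size), so overriding it only matters when files inflate unusually once decompressed or decoded.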
@@ -65,98 +202,125 @@ def move(source, destination):
     raise NotImplementedError


-def
- [1 removed line not shown]
+def _list_all_metafiles(
+    url: DeltaCatUrl,
+    recursive: bool = False,
+    **kwargs,
+) -> List[Metafile]:
+    reader = DeltaCatUrlReader(url)
+    list_results: List[ListResult[Metafile]] = []
+    lister = reader.listers.pop(0)[0]
+    # the top-level lister doesn't have any missing keyword args
+    metafiles: ListResult[Metafile] = lister(**kwargs)
+    list_results.append(metafiles)
+    if recursive:
+        for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
+            # each subsequent lister needs to inject missing keyword args from the parent metafile
+            for metafile in metafiles.all_items():
+                kwargs_update = (
+                    {kwarg_name: kwarg_val_resolver_fn(metafile)}
+                    if kwarg_name and kwarg_val_resolver_fn
+                    else {}
+                )
+                lister_kwargs = {
+                    **kwargs,
+                    **kwargs_update,
+                }
+                metafiles = lister(**lister_kwargs)
+                list_results.append(metafiles)
+    return [
+        metafile for list_result in list_results for metafile in list_result.all_items()
+    ]


- [51 removed lines not shown]
-        #
- [4 removed lines not shown]
-        # last session.
-        raise ValueError("Catalog not initialized.")
-        new_namespace = dc.create_namespace(
-            namespace=namespace_name,
-            catalog=catalog_name,
-            *args,
+class CustomReadKwargsProvider(ReadKwargsProvider):
+    def __init__(
+        self,
+        datasource_type: str,
+        kwargs: Dict[str, Any],
+    ):
+        self._datasource_type = datasource_type
+        self._kwargs = kwargs
+
+    def _get_kwargs(
+        self,
+        datasource_type: str,
+        kwargs: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        if datasource_type == self._datasource_type:
+            kwargs.update(self._kwargs)
+        return kwargs
+
+
+def list(
+    url: DeltaCatUrl,
+    *,
+    recursive: bool = False,
+    dataset_type: Optional[DatasetType] = None,
+    **kwargs,
+) -> Union[List[Metafile], LocalTable, DistributedDataset]:
+    if not url.is_deltacat_catalog_url():
+        raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")
+    if dataset_type in DatasetType.distributed():
+        if dataset_type == DatasetType.RAY_DATASET:
+            read_type = (
+                DeltacatReadType.METADATA_LIST
+                if not recursive
+                else DeltacatReadType.METADATA_LIST_RECURSIVE
+            )
+            return read_deltacat(
+                [url],
+                deltacat_read_type=read_type,
+                timestamp_as_of=None,
+                merge_on_read=False,
+                read_kwargs_provider=CustomReadKwargsProvider(
+                    datasource_type=url.datastore_type,
+                    kwargs=kwargs,
+                ),
+            )
+        else:
+            raise NotImplementedError(
+                f"Unsupported dataset type: {dataset_type.name}. "
+                f"Supported Dataset Types: {DatasetType.RAY_DATASET.name}",
+            )
+    else:
+        # return a local list of metafiles
+        # TODO(pdames): Cast the list to the appropriate local dataset type.
+        return _list_all_metafiles(
+            url=url,
+            recursive=recursive,
             **kwargs,
         )
- [17 removed lines not shown]
+
+
+def get(
+    url,
+    *args,
+    **kwargs,
+) -> Union[Metafile, Dataset]:
+    reader = DeltaCatUrlReader(url)
+    return reader.read(*args, **kwargs)
+
+
+def put(
+    url: DeltaCatUrl,
+    metafile: Optional[Metafile] = None,
+    *args,
+    **kwargs,
+) -> Union[Metafile, str]:
+    writer = DeltaCatUrlWriter(url, metafile)
+    return writer.write(*args, **kwargs)
+
+
+def touch(path):
+    raise NotImplementedError


 def exists(path):
     raise NotImplementedError


-def query(
+def query(expression):
     raise NotImplementedError

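A short sketch of how the new list() and get() functions above might be called, assuming a DeltaCAT catalog has already been initialized and that the dc://... URL resolves against it; the catalog, namespace, and table names are placeholders, and list is imported under an alias to avoid shadowing the builtin.

from deltacat.api import get, list as dc_list
from deltacat.types.media import DatasetType
from deltacat.utils.url import DeltaCatUrl

url = DeltaCatUrl("dc://my_deltacat_catalog/log_manager/json_table")

# Local mode: returns a flat Python list of Metafile objects.
metafiles = dc_list(url, recursive=True)

# Distributed mode: returns a Ray Dataset of metadata
# (DeltacatReadType.METADATA_LIST_RECURSIVE under the hood).
ray_ds = dc_list(url, recursive=True, dataset_type=DatasetType.RAY_DATASET)

# Fetch a single catalog object by URL.
table = get(url)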
@@ -166,3 +330,194 @@ def tail(path):

 def head(path):
     raise NotImplementedError
+
+
+def _copy_external_ray(
+    src: DeltaCatUrl,
+    dst: DeltaCatUrl,
+    *,
+    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+    extension_to_memory_multiplier: Dict[str, float] = {
+        "pq": 5,
+        "parquet": 5,
+        "feather": 1.5,
+        "arrow": 1.5,
+        "csv": 1.5,
+        "tsv": 1.5,
+        "psv": 1.5,
+        "txt": 1.5,
+        "json": 1.5,
+        "jsonl": 1.5,
+        "gz": 35,
+        "bz2": 35,
+        "zip": 35,
+        "7z": 35,
+        "*": 2.5,
+    },
+    minimum_worker_cpus: int = 0,
+    reader_args: Dict[str, Any] = {},
+    writer_args: Dict[str, Any] = {},
+    filesystem: pafs.FileSystem = None,
+) -> str:
+    print(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
+
+    if not isinstance(src, DeltaCatUrl):
+        raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")
+
+    # wait for required resources
+    head_cpu_count = int(current_node_resources()["CPU"])
+    if minimum_worker_cpus > 0:
+        print(f"Waiting for {minimum_worker_cpus} worker CPUs...")
+        live_cpu_waiter(
+            min_live_cpus=minimum_worker_cpus + head_cpu_count,
+        )
+        print(f"{minimum_worker_cpus} worker CPUs found!")
+    # start job execution
+    cluster_resources = ray.cluster_resources()
+    print(f"Cluster Resources: {cluster_resources}")
+    print(f"Available Cluster Resources: {ray.available_resources()}")
+    cluster_cpus = int(cluster_resources["CPU"])
+    print(f"Cluster CPUs: {cluster_cpus}")
+    all_node_resource_keys = live_node_resource_keys()
+    print(f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}")
+    worker_node_resource_keys = other_live_node_resource_keys()
+    print(
+        f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
+    )
+    worker_cpu_count = cluster_cpus - head_cpu_count
+    print(f"Total worker CPUs: {worker_cpu_count}")
+
+    # estimate memory requirements based on file extension
+    estimated_memory_bytes = 0
+    if extension_to_memory_multiplier:
+        print(f"Resolving stats collection filesystem for: {src.url_path}.")
+        path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
+        if isinstance(filesystem, pafs.GcsFileSystem):
+            from datetime import timedelta
+
+            # Configure a retry time limit for GcsFileSystem so that it
+            # doesn't hang forever trying to get file info (e.g., when
+            # trying to get a public file w/o anonymous=True).
+            filesystem = pafs.GcsFileSystem(
+                anonymous=True,
+                retry_time_limit=timedelta(seconds=10),
+            )
+        print(f"Using filesystem {type(filesystem)} to get file size of: {path}")
+        file_info = get_file_info(path, filesystem)
+        if file_info.type != FileType.File:
+            raise ValueError(
+                f"Expected `src` to be a file but got `{file_info.type}` at "
+                f"`{src.url_path}`."
+            )
+        inflation_multiplier = extension_to_memory_multiplier.get(file_info.extension)
+        if inflation_multiplier is None:
+            inflation_multiplier = extension_to_memory_multiplier.get("*")
+        estimated_memory_bytes = inflation_multiplier * file_info.size
+        print(
+            f"Estimated Memory Required for Copy: "
+            f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
+        )
+    print(f"Starting DeltaCAT Copy at: {time.time_ns()}")
+
+    index_result = None
+    num_cpus = 1
+    # TODO(pdames): remove hard-coding - issues encountered when going greater
+    #  than 2 include verifying that the scope of schedulable nodes doesn't
+    #  result in all large files lining up for the one large node in the cluster
+    #  that can actually handle them (which is worse if it's also the head node)
+    max_allowed_cpus = 2
+    while not index_result:
+        copy_task_pending, latency = timed_invocation(
+            copy_task.options(num_cpus=num_cpus, memory=estimated_memory_bytes).remote,
+            src=src,
+            dest=dst,
+            dataset_type=DatasetType.POLARS,
+            transforms=transforms,
+            reader_args=reader_args,
+            writer_args=writer_args,
+        )
+        print(f"Time to Launch Copy Task: {latency} seconds")
+        try:
+            index_result, latency = timed_invocation(
+                ray.get,
+                copy_task_pending,
+            )
+        except OutOfMemoryError as e:
+            print(f"Copy Task Ran Out of Memory: {e}")
+            max_single_node_cpus = min(
+                max_allowed_cpus, find_max_single_node_resource_type("CPU")
+            )
+            num_cpus += 1
+            if num_cpus > max_single_node_cpus:
+                raise e
+            print(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")

+    print(f"Time to Launch Copy Task: {latency} seconds")
+    print(f"Time to Complete Copy Task: {latency} seconds")
+
+    total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE
+
+    print(f"Records Copied: {index_result.table_length}")
+    print(f"Bytes Copied: {total_gib_indexed} GiB")
+    print(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
+    print(f"Finished Copy at: {time.time_ns()}")
+
+    return dst.url
+
+
+@ray.remote(scheduling_strategy="SPREAD")
+def copy_task(
+    src: DeltaCatUrl,
+    dest: DeltaCatUrl,
+    dataset_type: DatasetType,
+    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+    reader_args: Dict[str, Any] = {},
+    writer_args: Dict[str, Any] = {},
+) -> Tuple[Optional[int], int]:
+    """
+    Indexes a DeltaCAT source URL into a DeltaCAT destination URL.
+    """
+    table, latency = timed_invocation(
+        read_table,
+        src=src,
+        dataset_type=dataset_type,
+        transforms=transforms,
+        reader_args=reader_args,
+    )
+    print(f"Time to read {src.url_path}: {latency} seconds")
+
+    table_size = get_table_size(table)
+    print(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
+
+    table_length = get_table_length(table)
+    print(f"Table Records: {table_length}")
+
+    writer = DeltaCatUrlWriter(dest, dataset_type)
+    written_file_path, latency = timed_invocation(
+        writer.write,
+        "",
+        table,
+        **writer_args,
+    )
+    print(f"Time to write {written_file_path}: {latency}")
+
+    return CopyResult(table_size, table_length)
+
+
+def read_table(
+    src: DeltaCatUrl,
+    dataset_type: DatasetType,
+    transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+    reader_args: Dict[str, Any] = {},
+) -> LocalTable:
+    reader = DeltaCatUrlReader(src, dataset_type)
+    table: LocalTable = reader.read(**reader_args)
+    for transform in transforms:
+        table = transform(table, src)
+    return table
+
+
+@dataclass(frozen=True)
+class CopyResult:
+    table_size: int
+    table_length: int
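Because _copy_external_ray reads each source file as a Polars DataFrame (DatasetType.POLARS) before writing it, a transforms entry passed to copy() is just a callable from (dataset, source URL) to a dataset of the same type. A minimal sketch, assuming Polars is installed; the added column name is made up for illustration.

import polars as pl

from deltacat.utils.url import DeltaCatUrl


def tag_source_path(df: pl.DataFrame, src: DeltaCatUrl) -> pl.DataFrame:
    # Record which source file each row came from (illustrative column name).
    return df.with_columns(pl.lit(src.url_path).alias("_source_path"))


# Usage: copy(src, dst, transforms=[tag_source_path])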
deltacat/aws/s3u.py
CHANGED
@@ -48,7 +48,7 @@ from deltacat.types.media import (
 )
 from deltacat.types.tables import (
     TABLE_CLASS_TO_SIZE_FUNC,
- [1 removed line not shown]
+    TABLE_TYPE_TO_S3_READER_FUNC,
     TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
     DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
     get_table_length,
@@ -261,7 +261,7 @@ def read_file(
     **s3_client_kwargs,
 ) -> LocalTable:

-    reader =
+    reader = TABLE_TYPE_TO_S3_READER_FUNC[table_type.value]
     try:
         table = reader(
             s3_url,
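The read_file change above swaps a hard-coded reader for a lookup in a table-type dispatch dictionary. A generic sketch of that pattern follows; the type names and reader functions are illustrative stand-ins, not DeltaCAT's actual TABLE_TYPE_TO_S3_READER_FUNC contents.

from typing import Callable, Dict

import pyarrow as pa
import pyarrow.csv as pacsv
import pyarrow.parquet as papq

# Map a table/content type name to the function that loads it into memory.
READER_DISPATCH: Dict[str, Callable[[str], pa.Table]] = {
    "parquet": papq.read_table,
    "csv": pacsv.read_csv,
}


def read_local_file(path: str, table_type: str) -> pa.Table:
    # Mirrors reader = TABLE_TYPE_TO_S3_READER_FUNC[table_type.value] in s3u.read_file.
    reader = READER_DISPATCH[table_type]
    return reader(path)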
deltacat/benchmarking/conftest.py
CHANGED
@@ -61,7 +61,7 @@ def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
         "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
     )

-    tbl = daft.
+    tbl = daft.read_parquet(path, columns=columns)
     return tbl.to_arrow()
