deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/api.py CHANGED
@@ -1,56 +1,193 @@
- from typing import Any
-
+ import time
+ from dataclasses import dataclass
+ from typing import Any, Union, List, Optional, Dict, Callable, Tuple
 
+ import ray
  import deltacat as dc
- from deltacat.catalog import Catalog
+ import pyarrow.fs as pafs
+
+ from pyarrow.fs import FileType
+ from ray.exceptions import OutOfMemoryError
+
+ from deltacat.constants import BYTES_PER_GIBIBYTE
+ from deltacat.io import (
+ read_deltacat,
+ DeltacatReadType,
+ )
+ from deltacat.storage import (
+ Dataset,
+ DistributedDataset,
+ ListResult,
+ LocalTable,
+ Metafile,
+ )
+ from deltacat.types.media import DatasetType
+ from deltacat.utils.url import (
+ DeltaCatUrl,
+ DeltaCatUrlReader,
+ DeltaCatUrlWriter,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.types.tables import (
+ get_table_size,
+ get_table_length,
+ )
+ from deltacat.utils.filesystem import (
+ resolve_path_and_filesystem,
+ get_file_info,
+ )
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.ray_utils.runtime import (
+ current_node_resources,
+ live_cpu_waiter,
+ live_node_resource_keys,
+ other_live_node_resource_keys,
+ find_max_single_node_resource_type,
+ )
+
+ """
+ # CLI Example of Copying from Source to Dest without file conversion
+ # (i.e., register only - shallow copy):
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table
+
+ # CLI Example of Copying from Source to Dest without file conversion
+ # (i.e., register only - deep copy):
+ $ dcat cp json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/json_table -r
+ # The above command will make a deep copy of all JSON files found in the source
+ # to the catalog data file directory in the destination.
+
+ # CLI Example of Copying from Source to Dest with file conversion
+ # (i.e., deep copy with file content type transformation):
+ $ dcat convert json+s3://my_bucket/log_manager/ dc://my_deltacat_catalog/log_manager/ --type FEATHER
+ # The above command will read JSON files found in the source, transform them to
+ # Arrow Feather files, and register them in the destination.
 
+ # Python Example of Copying from Source to Dest with file conversion
+ # (i.e., deep copy with file content type transformation):
+ >>> ds = dc.get("json+s3://my_bucket/log_manager/")
+ >>> dc.put("dc://my_deltacat_catalog/log_manager/", dataset=ds, type=ContentType.FEATHER)
+ # Or, equivalently, we can do the write directly from the dataset:
+ >>> ds.write_deltacat("dc://my_deltacat_catalog/log_manager/", type=ContentType.FEATHER)
+ """
 
- def copy(source, destination):
- src_parts = source.split("/")
+
+ def _copy_dc(
+ source: DeltaCatUrl,
+ destination: DeltaCatUrl,
+ recursive: bool = False,
+ ) -> Metafile:
+ if recursive:
+ src_obj = list(source, recursive=True)
+ else:
+ src_obj = get(source) if not source.url.endswith("/*") else list(source)
+ """
+ dc_dest_url = DeltacatUrl(destination)
+ # TODO(pdames): Add writer with support for Ray Dataset DeltaCAT Sink &
+ # Recursive DeltaCAT source object copies. Ideally, the Ray Dataset read
+ # is lazy, and only indexes metadata about the objects at source instead
+ # of eagerly converting them to PyArrow-based Blocks.
+ dc_dest_url.writer(src_obj, recursive=recursive)
+ """
+
+ src_parts = source.url.split("/")
  src_parts = [part for part in src_parts if part]
- dst_parts = destination.split("/")
+ dst_parts = destination.url.split("/")
  dst_parts = [part for part in dst_parts if part]
- if not dc.is_initialized():
- raise ValueError("Catalog not initialized.")
- if len(src_parts) != len(dst_parts) and len(src_parts) != len(dst_parts) + 1:
+ dc.raise_if_not_initialized()
+ if len(src_parts) != len(dst_parts):
  # TODO(pdames): Better error message.
  raise ValueError(
  f"Cannot copy {source} to {destination}. "
  f"Source and destination must share the same type."
  )
- src_obj = get(source)
- if len(src_parts) == 1:
- # copy the given catalog
- raise NotImplementedError
- elif len(src_parts) == 2:
- # TODO(pdames): Make catalog specification optional if there is only
- # one catalog (e.g., auto-retrieve src_parts[0]/dst_parts[0])
- # copy the given namespace
- src_namespace_name = src_parts[1]
- dst_catalog_name = dst_parts[0]
- dst_namespace_name = dst_parts[1] if len(dst_parts) >= 2 else src_namespace_name
- new_namespace = dc.create_namespace(
- namespace=dst_namespace_name,
- properties=src_obj.properties,
- catalog=dst_catalog_name,
+ return put(destination, metafile=src_obj)
+
+
+ def copy(
+ src: DeltaCatUrl,
+ dst: DeltaCatUrl,
+ *,
+ transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+ extension_to_memory_multiplier: Dict[str, float] = {
+ "pq": 5,
+ "parquet": 5,
+ "feather": 1.5,
+ "arrow": 1.5,
+ "csv": 1.5,
+ "tsv": 1.5,
+ "psv": 1.5,
+ "txt": 1.5,
+ "json": 1.5,
+ "jsonl": 1.5,
+ "gz": 35,
+ "bz2": 35,
+ "zip": 35,
+ "7z": 35,
+ "*": 2.5,
+ },
+ minimum_worker_cpus: int = 0,
+ reader_args: Dict[str, Any] = {},
+ writer_args: Dict[str, Any] = {},
+ filesystem: Optional[pafs.FileSystem] = None,
+ ) -> Union[Metafile, str]:
+ """
+ Copies data from the source datastore to the destination datastore. By
+ default, this method launches one parallel Ray process to read/transform
+ each input file found in the source followed by one parallel Ray process
+ to write each output file to the destination. Files written to the
+ destination are split or combined to contain uniform record counts. To
+ ensure that adequate resources are available to complete the operation,
+ you may optionally specify minimum cluster and/or worker CPUs to wait for
+ before starting parallel processing.
+
+ Args:
+ src: DeltaCAT URL of the source datastore to read.
+ dst: DeltaCAT URL of the destination datastore to write.
+ transforms: List of transforms to apply to the source dataset prior
+ to write it to the destination datastore. Transforms take the in-memory
+ dataset type read (e.g., Polars DataFrame) and source DeltaCAT URL as
+ input and return the same dataset type as output. Transforms are
+ applied to the dataset in the order given.
+ extension_to_memory_multiplier: Dictionary of file extensions to
+ in-memory inflation estimates for that extension (i.e., the amount
+ of memory required to read a source file, apply transforms, and write
+ it back to a destination file).
+ minimum_worker_cpus: The minimum number of Ray worker CPUs
+ to wait for before starting distributed execution. Useful for cases
+ where the operation is known to suffer from resource starvation (e.g.,
+ out-of-memory errors) if started before the cluster has launched a
+ minimum number of required worker nodes.
+ reader_args: Additional keyword arguments to forward to the reader
+ associated with the in-memory dataset and datastore type to read
+ (e.g., polars.read_csv(**kwargs)).
+ writer_args: Additional keyword arguments to forward to the writer
+ associated with the in-memory dataset type read and datastore type to
+ write (e.g., polars.DataFrame.write_parquet(**kwargs)).
+ filesystem: Optional PyArrow filesystem to use for file IO. Will be
+ automatically resolved from the input path if not specified, and
+ will attempt to automatically resolve storage read/write
+ credentials for the associated source/dest file cloud provider(s).
+ Try providing your own filesystem with credentials, retry strategy,
+ etc. pre-configured if you encounter latency issues or errors
+ reading/writing files.
+
+ Returns:
+ None
+ """
+ if src.is_deltacat_catalog_url() or dst.is_deltacat_catalog_url():
+ return _copy_dc(src, dst, recursive=src.url.endswith("/**"))
+ else:
+ return _copy_external_ray(
+ src,
+ dst,
+ transforms=transforms,
+ extension_to_memory_multiplier=extension_to_memory_multiplier,
+ minimum_worker_cpus=minimum_worker_cpus,
+ reader_args=reader_args,
+ writer_args=writer_args,
+ filesystem=filesystem,
  )
- return new_namespace
- elif len(src_parts) == 3:
- # copy the given table
- raise NotImplementedError
- elif len(src_parts) == 4:
- # copy the given table version
- raise NotImplementedError
- elif len(src_parts) == 5:
- # copy the given stream
- raise NotImplementedError
- elif len(src_parts) == 6:
- # copy the given partition
- raise NotImplementedError
- elif len(src_parts) == 7:
- # copy the given partition delta
- raise NotImplementedError
- raise ValueError(f"Invalid path: {src_parts}")
 
 
  def concat(source, destination):
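
The copy() docstring in the hunk above describes the new behavior: one Ray task per input file, a per-extension memory inflation estimate used to size that task, and an optional wait for worker CPUs before scheduling. A minimal call sketch follows, assuming copy() is reachable through the deltacat package the same way the module docstring uses dc.get and dc.put; the bucket names, the destination URL scheme, and the multiplier overrides are hypothetical, not taken from the diff.

    import deltacat as dc
    from deltacat.utils.url import DeltaCatUrl

    # Hypothetical single-file source and destination; only the "json+s3://..."
    # URL form appears in the module docstring above.
    src = DeltaCatUrl("json+s3://my_bucket/log_manager/2024/01/01/events.json")
    dst = DeltaCatUrl("parquet+s3://my_bucket/converted/events.parquet")  # hypothetical scheme

    dc.copy(
        src,
        dst,
        # Per the hunk above, task memory is estimated as multiplier * file size,
        # e.g. a 2 GiB ".gz" input would be budgeted 2 * 35 = 70 GiB.
        extension_to_memory_multiplier={"json": 1.5, "gz": 35, "*": 2.5},
        minimum_worker_cpus=8,  # wait for 8 worker CPUs before launching tasks
        reader_args={},         # forwarded to the underlying reader (e.g., polars.read_csv(**kwargs))
        writer_args={},         # forwarded to the underlying writer (e.g., polars.DataFrame.write_parquet(**kwargs))
    )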
@@ -65,98 +202,125 @@ def move(source, destination):
  raise NotImplementedError
 
 
- def list(path):
- raise NotImplementedError
+ def _list_all_metafiles(
+ url: DeltaCatUrl,
+ recursive: bool = False,
+ **kwargs,
+ ) -> List[Metafile]:
+ reader = DeltaCatUrlReader(url)
+ list_results: List[ListResult[Metafile]] = []
+ lister = reader.listers.pop(0)[0]
+ # the top-level lister doesn't have any missing keyword args
+ metafiles: ListResult[Metafile] = lister(**kwargs)
+ list_results.append(metafiles)
+ if recursive:
+ for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
+ # each subsequent lister needs to inject missing keyword args from the parent metafile
+ for metafile in metafiles.all_items():
+ kwargs_update = (
+ {kwarg_name: kwarg_val_resolver_fn(metafile)}
+ if kwarg_name and kwarg_val_resolver_fn
+ else {}
+ )
+ lister_kwargs = {
+ **kwargs,
+ **kwargs_update,
+ }
+ metafiles = lister(**lister_kwargs)
+ list_results.append(metafiles)
+ return [
+ metafile for list_result in list_results for metafile in list_result.all_items()
+ ]
 
 
- def get(path) -> Any:
- parts = path.split("/")
- parts = [part for part in parts if part]
- if not dc.is_initialized():
- # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
- # last session.
- raise ValueError("Catalog not initialized.")
- if len(parts) == 1:
- # TODO(pdames): Save all catalogs registered from the last session on
- # disk so that users don't need to re-initialize them every time.
- # get the given catalog
- catalog_name = parts[0]
- return dc.get_catalog(catalog_name)
- elif len(parts) == 2:
- # get the given namespace
- catalog_name = parts[0]
- namespace_name = parts[1]
- return dc.get_namespace(
- namespace=namespace_name,
- catalog=catalog_name,
- )
- elif len(parts) == 3:
- # get the given table
- raise NotImplementedError
- elif len(parts) == 4:
- # get the given table version
- raise NotImplementedError
- elif len(parts) == 5:
- # get the given stream
- raise NotImplementedError
- elif len(parts) == 6:
- # get the given partition
- raise NotImplementedError
- elif len(parts) == 7:
- # get the given partition delta
- raise NotImplementedError
- raise ValueError(f"Invalid path: {path}")
-
-
- def put(path, *args, **kwargs) -> Any:
- parts = path.split("/")
- parts = [part for part in parts if part]
- if len(parts) == 1:
- # TODO(pdames): Save all catalogs registered from the last session on
- # disk so that users don't need to re-initialize them every time.
- # register the given catalog
- catalog_name = parts[0]
- # Initialize default catalog using kwargs
- catalog = Catalog(**kwargs)
- return dc.put_catalog(catalog_name, catalog)
- elif len(parts) == 2:
- # register the given namespace
- catalog_name = parts[0]
- namespace_name = parts[1]
- if not dc.is_initialized():
- # TODO(pdames): Re-initialize DeltaCAT with all catalogs from the
- # last session.
- raise ValueError("Catalog not initialized.")
- new_namespace = dc.create_namespace(
- namespace=namespace_name,
- catalog=catalog_name,
- *args,
+ class CustomReadKwargsProvider(ReadKwargsProvider):
+ def __init__(
+ self,
+ datasource_type: str,
+ kwargs: Dict[str, Any],
+ ):
+ self._datasource_type = datasource_type
+ self._kwargs = kwargs
+
+ def _get_kwargs(
+ self,
+ datasource_type: str,
+ kwargs: Dict[str, Any],
+ ) -> Dict[str, Any]:
+ if datasource_type == self._datasource_type:
+ kwargs.update(self._kwargs)
+ return kwargs
+
+
+ def list(
+ url: DeltaCatUrl,
+ *,
+ recursive: bool = False,
+ dataset_type: Optional[DatasetType] = None,
+ **kwargs,
+ ) -> Union[List[Metafile], LocalTable, DistributedDataset]:
+ if not url.is_deltacat_catalog_url():
+ raise NotImplementedError("List only supports DeltaCAT Catalog URLs.")
+ if dataset_type in DatasetType.distributed():
+ if dataset_type == DatasetType.RAY_DATASET:
+ read_type = (
+ DeltacatReadType.METADATA_LIST
+ if not recursive
+ else DeltacatReadType.METADATA_LIST_RECURSIVE
+ )
+ return read_deltacat(
+ [url],
+ deltacat_read_type=read_type,
+ timestamp_as_of=None,
+ merge_on_read=False,
+ read_kwargs_provider=CustomReadKwargsProvider(
+ datasource_type=url.datastore_type,
+ kwargs=kwargs,
+ ),
+ )
+ else:
+ raise NotImplementedError(
+ f"Unsupported dataset type: {dataset_type.name}. "
+ f"Supported Dataset Types: {DatasetType.RAY_DATASET.name}",
+ )
+ else:
+ # return a local list of metafiles
+ # TODO(pdames): Cast the list to the appropriate local dataset type.
+ return _list_all_metafiles(
+ url=url,
+ recursive=recursive,
  **kwargs,
  )
- return new_namespace
- elif len(parts) == 3:
- # register the given table
- raise NotImplementedError
- elif len(parts) == 4:
- # register the given table version
- raise NotImplementedError
- elif len(parts) == 5:
- # register the given stream
- raise NotImplementedError
- elif len(parts) == 6:
- # register the given partition
- raise NotImplementedError
- elif len(parts) == 7:
- # register the given partition delta
- raise NotImplementedError
- raise ValueError(f"Invalid path: {path}")
+
+
+ def get(
+ url,
+ *args,
+ **kwargs,
+ ) -> Union[Metafile, Dataset]:
+ reader = DeltaCatUrlReader(url)
+ return reader.read(*args, **kwargs)
+
+
+ def put(
+ url: DeltaCatUrl,
+ metafile: Optional[Metafile] = None,
+ *args,
+ **kwargs,
+ ) -> Union[Metafile, str]:
+ writer = DeltaCatUrlWriter(url, metafile)
+ return writer.write(*args, **kwargs)
+
+
+ def touch(path):
+ raise NotImplementedError
 
 
  def exists(path):
  raise NotImplementedError
 
 
- def query(path, expression):
+ def query(expression):
  raise NotImplementedError
 
 
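
The hunk above replaces the old slash-delimited path parsing in get() and put() with DeltaCatUrlReader/DeltaCatUrlWriter, and adds a list() entry point that returns either a local list of metafiles or a Ray Dataset of catalog metadata. A short usage sketch follows, assuming these functions are exposed at the package level as the module docstring's dc.get/dc.put examples suggest; the catalog, namespace, and table names are placeholders.

    import deltacat as dc
    from deltacat.types.media import DatasetType
    from deltacat.utils.url import DeltaCatUrl

    # Read an external datastore into an in-memory dataset (from the module docstring).
    ds = dc.get("json+s3://my_bucket/log_manager/")

    # Register/write it at a DeltaCAT catalog URL (from the module docstring).
    dc.put("dc://my_deltacat_catalog/log_manager/json_table", dataset=ds)

    # List catalog metadata locally; recursive=True chains each lister over its
    # parent metafiles, as implemented in _list_all_metafiles above.
    metafiles = dc.list(DeltaCatUrl("dc://my_deltacat_catalog"), recursive=True)

    # Or list the same metadata as a distributed Ray Dataset.
    ray_ds = dc.list(
        DeltaCatUrl("dc://my_deltacat_catalog"),
        recursive=True,
        dataset_type=DatasetType.RAY_DATASET,
    )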
@@ -166,3 +330,194 @@ def tail(path):
 
  def head(path):
  raise NotImplementedError
+
+
+ def _copy_external_ray(
+ src: DeltaCatUrl,
+ dst: DeltaCatUrl,
+ *,
+ transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+ extension_to_memory_multiplier: Dict[str, float] = {
+ "pq": 5,
+ "parquet": 5,
+ "feather": 1.5,
+ "arrow": 1.5,
+ "csv": 1.5,
+ "tsv": 1.5,
+ "psv": 1.5,
+ "txt": 1.5,
+ "json": 1.5,
+ "jsonl": 1.5,
+ "gz": 35,
+ "bz2": 35,
+ "zip": 35,
+ "7z": 35,
+ "*": 2.5,
+ },
+ minimum_worker_cpus: int = 0,
+ reader_args: Dict[str, Any] = {},
+ writer_args: Dict[str, Any] = {},
+ filesystem: pafs.FileSystem = None,
+ ) -> str:
+ print(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
+
+ if not isinstance(src, DeltaCatUrl):
+ raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")
+
+ # wait for required resources
+ head_cpu_count = int(current_node_resources()["CPU"])
+ if minimum_worker_cpus > 0:
+ print(f"Waiting for {minimum_worker_cpus} worker CPUs...")
+ live_cpu_waiter(
+ min_live_cpus=minimum_worker_cpus + head_cpu_count,
+ )
+ print(f"{minimum_worker_cpus} worker CPUs found!")
+ # start job execution
+ cluster_resources = ray.cluster_resources()
+ print(f"Cluster Resources: {cluster_resources}")
+ print(f"Available Cluster Resources: {ray.available_resources()}")
+ cluster_cpus = int(cluster_resources["CPU"])
+ print(f"Cluster CPUs: {cluster_cpus}")
+ all_node_resource_keys = live_node_resource_keys()
+ print(f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}")
+ worker_node_resource_keys = other_live_node_resource_keys()
+ print(
+ f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
+ )
+ worker_cpu_count = cluster_cpus - head_cpu_count
+ print(f"Total worker CPUs: {worker_cpu_count}")
+
+ # estimate memory requirements based on file extension
+ estimated_memory_bytes = 0
+ if extension_to_memory_multiplier:
+ print(f"Resolving stats collection filesystem for: {src.url_path}.")
+ path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
+ if isinstance(filesystem, pafs.GcsFileSystem):
+ from datetime import timedelta
+
+ # Configure a retry time limit for GcsFileSystem so that it
+ # doesn't hang forever trying to get file info (e.g., when
+ # trying to get a public file w/o anonymous=True).
+ filesystem = pafs.GcsFileSystem(
+ anonymous=True,
+ retry_time_limit=timedelta(seconds=10),
+ )
+ print(f"Using filesystem {type(filesystem)} to get file size of: {path}")
+ file_info = get_file_info(path, filesystem)
+ if file_info.type != FileType.File:
+ raise ValueError(
+ f"Expected `src` to be a file but got `{file_info.type}` at "
+ f"`{src.url_path}`."
+ )
+ inflation_multiplier = extension_to_memory_multiplier.get(file_info.extension)
+ if inflation_multiplier is None:
+ inflation_multiplier = extension_to_memory_multiplier.get("*")
+ estimated_memory_bytes = inflation_multiplier * file_info.size
+ print(
+ f"Estimated Memory Required for Copy: "
+ f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
+ )
+ print(f"Starting DeltaCAT Copy at: {time.time_ns()}")
+
+ index_result = None
+ num_cpus = 1
+ # TODO(pdames): remove hard-coding - issues encountered when going greater
+ # than 2 include verifying that the scope of schedulable nodes doesn't
+ # result in all large files lining up for the one large node in the cluster
+ # that can actually handle them (which is worse if it's also the head node)
+ max_allowed_cpus = 2
+ while not index_result:
+ copy_task_pending, latency = timed_invocation(
+ copy_task.options(num_cpus=num_cpus, memory=estimated_memory_bytes).remote,
+ src=src,
+ dest=dst,
+ dataset_type=DatasetType.POLARS,
+ transforms=transforms,
+ reader_args=reader_args,
+ writer_args=writer_args,
+ )
+ print(f"Time to Launch Copy Task: {latency} seconds")
+ try:
+ index_result, latency = timed_invocation(
+ ray.get,
+ copy_task_pending,
+ )
+ except OutOfMemoryError as e:
+ print(f"Copy Task Ran Out of Memory: {e}")
+ max_single_node_cpus = min(
+ max_allowed_cpus, find_max_single_node_resource_type("CPU")
+ )
+ num_cpus += 1
+ if num_cpus > max_single_node_cpus:
+ raise e
+ print(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")
+
+ print(f"Time to Launch Copy Task: {latency} seconds")
+ print(f"Time to Complete Copy Task: {latency} seconds")
+
+ total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE
+
+ print(f"Records Copied: {index_result.table_length}")
+ print(f"Bytes Copied: {total_gib_indexed} GiB")
+ print(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
+ print(f"Finished Copy at: {time.time_ns()}")
+
+ return dst.url
+
+
+ @ray.remote(scheduling_strategy="SPREAD")
+ def copy_task(
+ src: DeltaCatUrl,
+ dest: DeltaCatUrl,
+ dataset_type: DatasetType,
+ transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+ reader_args: Dict[str, Any] = {},
+ writer_args: Dict[str, Any] = {},
+ ) -> Tuple[Optional[int], int]:
+ """
+ Indexes a DeltaCAT source URL into a DeltaCAT destination URL.
+ """
+ table, latency = timed_invocation(
+ read_table,
+ src=src,
+ dataset_type=dataset_type,
+ transforms=transforms,
+ reader_args=reader_args,
+ )
+ print(f"Time to read {src.url_path}: {latency} seconds")
+
+ table_size = get_table_size(table)
+ print(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
+
+ table_length = get_table_length(table)
+ print(f"Table Records: {table_length}")
+
+ writer = DeltaCatUrlWriter(dest, dataset_type)
+ written_file_path, latency = timed_invocation(
+ writer.write,
+ "",
+ table,
+ **writer_args,
+ )
+ print(f"Time to write {written_file_path}: {latency}")
+
+ return CopyResult(table_size, table_length)
+
+
+ def read_table(
+ src: DeltaCatUrl,
+ dataset_type: DatasetType,
+ transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]] = [],
+ reader_args: Dict[str, Any] = {},
+ ) -> LocalTable:
+ reader = DeltaCatUrlReader(src, dataset_type)
+ table: LocalTable = reader.read(**reader_args)
+ for transform in transforms:
+ table = transform(table, src)
+ return table
+
+
+ @dataclass(frozen=True)
+ class CopyResult:
+ table_size: int
+ table_length: int
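
In the hunk above, _copy_external_ray reads each source file with the Polars-backed reader (DatasetType.POLARS), applies the given transforms in order inside read_table, and retries the copy task with one additional CPU (capped at two) if Ray raises OutOfMemoryError. A hedged sketch of a transform that could be passed through copy() is shown below; the column name is illustrative only, but the signature matches transforms: List[Callable[[Dataset, DeltaCatUrl], Dataset]].

    import polars as pl
    from deltacat.utils.url import DeltaCatUrl

    def tag_source_url(df: pl.DataFrame, src: DeltaCatUrl) -> pl.DataFrame:
        # Record which source URL each row came from (column name "source_url"
        # is illustrative, not part of the diff).
        return df.with_columns(pl.lit(src.url).alias("source_url"))

    # read_table() applies transforms in order before copy_task writes the output:
    # dc.copy(src, dst, transforms=[tag_source_url])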
deltacat/aws/s3u.py CHANGED
@@ -48,7 +48,7 @@ from deltacat.types.media import (
  )
  from deltacat.types.tables import (
  TABLE_CLASS_TO_SIZE_FUNC,
- TABLE_TYPE_TO_READER_FUNC,
+ TABLE_TYPE_TO_S3_READER_FUNC,
  TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
  DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
  get_table_length,
@@ -261,7 +261,7 @@ def read_file(
  **s3_client_kwargs,
  ) -> LocalTable:
 
- reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
+ reader = TABLE_TYPE_TO_S3_READER_FUNC[table_type.value]
  try:
  table = reader(
  s3_url,
deltacat/benchmarking/conftest.py CHANGED
@@ -61,7 +61,7 @@ def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
  "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
  )
 
- tbl = daft.table.Table.read_parquet(path, columns=columns)
+ tbl = daft.read_parquet(path, columns=columns)
  return tbl.to_arrow()
 