deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/utils/polars.py
ADDED
@@ -0,0 +1,128 @@
+import logging
+from typing import Optional, List, Dict, Callable, Union
+
+import polars as pl
+
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+
+from deltacat import logs
+
+from deltacat.types.media import ContentType
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def write_json(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_ndjson(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ndjson(f, **write_kwargs)
+
+
+def write_csv(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_csv(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_csv(f, **write_kwargs)
+
+
+def write_avro(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_avro(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
+
+
+def write_parquet(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_parquet(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
+
+
+CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+    # TODO (pdames): add support for other delimited text content types as
+    # pyarrow adds support for custom delimiters, escaping, and None value
+    # representations to pyarrow.csv.WriteOptions.
+    ContentType.AVRO.value: write_avro,
+    ContentType.CSV.value: write_csv,
+    ContentType.PARQUET.value: write_parquet,
+    ContentType.JSON.value: write_json,
+}
+
+
+def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+    """
+    Iteratively create 0-copy table slices.
+    """
+    if max_len is None:
+        return [table]
+    tables = []
+    offset = 0
+    records_remaining = len(table)
+    while records_remaining > 0:
+        records_this_entry = min(max_len, records_remaining)
+        tables.append(table.slice(offset, records_this_entry))
+        records_remaining -= records_this_entry
+        offset += records_this_entry
+    return tables
+
+
+def dataframe_size(table: pl.DataFrame) -> int:
+    return table.estimated_size()
+
+
+def dataframe_to_file(
+    table: pl.DataFrame,
+    base_path: str,
+    file_system: Optional[AbstractFileSystem],
+    block_path_provider: Union[Callable, FilenameProvider],
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Writes the given Polars DataFrame to a file.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    if not writer:
+        raise NotImplementedError(
+            f"Polars writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
+        )
+    path = block_path_provider(base_path)
+    logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
+    writer(table, path, filesystem=file_system, **kwargs)
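As a quick orientation to the new module, below is a minimal usage sketch (not part of the diff) that routes a Polars DataFrame through the content-type dispatch shown above. The output directory and the lambda-based block path provider are illustrative assumptions.

import os

import polars as pl

from deltacat.types.media import ContentType
from deltacat.utils.polars import dataframe_to_file, slice_table

# Illustrative local output directory (an assumption, not from the diff).
base = "/tmp/deltacat_polars_example"
os.makedirs(base, exist_ok=True)

df = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Split into zero-copy slices of at most 2 rows, then write each slice to
# its own Parquet file via the content-type dispatch in dataframe_to_file.
for i, chunk in enumerate(slice_table(df, max_len=2)):
    dataframe_to_file(
        chunk,
        base,
        None,  # no fsspec filesystem: fall back to Polars' native path I/O
        lambda base_path, i=i: f"{base_path}/part-{i}.parquet",
        content_type=ContentType.PARQUET.value,
    )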
deltacat/utils/pyarrow.py
CHANGED
@@ -13,11 +13,14 @@ from deltacat.exceptions import ContentTypeValidationError
 import pyarrow as pa
 import numpy as np
 import pyarrow.compute as pc
+import pyarrow.fs as pafs
+
 from fsspec import AbstractFileSystem
 from pyarrow import csv as pacsv
 from pyarrow import feather as paf
 from pyarrow import json as pajson
 from pyarrow import parquet as papq
+from pyarrow import orc as paorc
 from ray.data.datasource import FilenameProvider
 from deltacat.utils.s3fs import create_s3_file_system

@@ -40,8 +43,10 @@ from deltacat.utils.arguments import (
     sanitize_kwargs_to_callable,
     sanitize_kwargs_by_supported_kwargs,
 )
+from deltacat.utils.filesystem import resolve_path_and_filesystem
 from functools import lru_cache

+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
@@ -103,6 +108,82 @@ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
         raise e


+# TODO(pdames): Remove deprecated S3-only readers.
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return pacsv.read_csv(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return pacsv.read_csv(f, **read_kwargs)
+
+
+def read_feather(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return paf.read_feather(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return paf.read_feather(f, **read_kwargs)
+
+
+def read_json(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return pajson.read_json(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return pajson.read_json(f, **read_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return paorc.read_table(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return paorc.read_table(f, **read_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return papq.read_table(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return papq.read_table(f, **read_kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -118,24 +199,78 @@ CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {


 def write_feather(
-    table: pa.Table,
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
 ) -> None:
-
-
-
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            paf.write_feather(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            paf.write_feather(table, f, **write_kwargs)


 def write_csv(
-    table: pa.Table,
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            pacsv.write_csv(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            # TODO (pdames): Add support for client-specified compression types.
+            with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                if write_kwargs.get("write_options") is None:
+                    # column names are kept in table metadata, so omit header
+                    write_kwargs["write_options"] = pacsv.WriteOptions(
+                        include_header=False
+                    )
+                pacsv.write_csv(table, out, **write_kwargs)
+
+
+def write_orc(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
 ) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            paorc.write_table(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            paorc.write_table(table, f, **write_kwargs)
+

-
-
-
-
-
-
+def write_parquet(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            papq.write_table(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            papq.write_table(table, f, **write_kwargs)


 CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
@@ -143,7 +278,8 @@ CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
     # pyarrow adds support for custom delimiters, escaping, and None value
     # representations to pyarrow.csv.WriteOptions.
     ContentType.CSV.value: write_csv,
-    ContentType.
+    ContentType.ORC.value: write_orc,
+    ContentType.PARQUET.value: write_parquet,
     ContentType.FEATHER.value: write_feather,
 }

@@ -180,7 +316,7 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
 ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
     ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
     ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
-    ContentEncoding.IDENTITY.value: lambda
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
 }


@@ -522,7 +658,7 @@ def parquet_file_size(table: papq.ParquetFile) -> int:
 def table_to_file(
     table: pa.Table,
     base_path: str,
-    file_system: AbstractFileSystem,
+    file_system: Optional[AbstractFileSystem],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
     **kwargs,
deltacat/utils/ray_utils/concurrency.py
CHANGED
@@ -88,7 +88,7 @@ def round_robin_options_provider(
     **kwargs,
 ) -> Dict[str, Any]:
     """Returns a resource dictionary that can be included with ray remote
-    options to round
+    options to round-robin indexed tasks or actors across a list of resource
     keys. For example, the following code round-robins 100 tasks across all
     live cluster nodes:
     ```
deltacat/utils/ray_utils/runtime.py
CHANGED
@@ -21,7 +21,7 @@ def node_resource_keys(
     keys = []
     node_dict = ray.nodes()
     if node_dict:
-        for node in
+        for node in node_dict:
             if filter_fn(node):
                 for key in node["Resources"].keys():
                     if key.startswith("node:"):
@@ -37,7 +37,7 @@ def current_node_resource_key() -> str:
     actors on that node via:
     `foo.options(resources={get_current_node_resource_key(): 0.01}).remote()`
     """
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
     assert (
         len(keys) <= 1
@@ -45,6 +45,47 @@ def current_node_resource_key() -> str:
     return keys[0] if len(keys) == 1 else None


+def current_node_resources() -> Dict[str, float]:
+    """Gets Ray's resources for the current node as a dictionary.
+
+    Example Return Value:
+    >>> {
+    >>>     'memory': 17611605607.0,
+    >>>     'node:127.0.0.1': 1.0,
+    >>>     'node:__internal_head__': 1.0,
+    >>>     'object_store_memory': 2147483648.0,
+    >>>     'CPU': 10.0,
+    >>> }
+    """
+    current_node_id = ray.get_runtime_context().get_node_id()
+    node_dict = ray.nodes()
+    if node_dict:
+        for node in node_dict:
+            if node["NodeID"] == current_node_id:
+                return node["Resources"]
+    else:
+        raise ValueError("No node dictionary found on current node.")
+    return {}
+
+
+def find_max_single_node_resource_type(resource_type: str) -> float:
+    """Finds the max resource amount available on any single cluster node
+    for the given resource type. Returns the max resource amount as a float."""
+    node_dict = ray.nodes()
+    max_single_node_resource_amount = 0
+    if node_dict:
+        for node in node_dict:
+            node_resource_amount = node["Resources"].get(resource_type)
+            if node_resource_amount is not None:
+                max_single_node_resource_amount = max(
+                    max_single_node_resource_amount,
+                    node_resource_amount,
+                )
+    else:
+        raise ValueError("No node dictionary found on current node.")
+    return max_single_node_resource_amount
+
+
 def is_node_alive(node: Dict[str, Any]) -> bool:
     """Takes a node from `ray.nodes()` as input. Returns True if the node is
     alive, and False otherwise."""
@@ -67,6 +108,17 @@ def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) ->
         time.sleep(poll_interval_seconds)


+def live_cpu_waiter(min_live_cpus: int, poll_interval_seconds: float = 0.5) -> None:
+    """Waits until the given minimum number of live CPUs are present in the
+    cluster. Checks the current number of live CPUs every
+    `poll_interval_seconds`."""
+    live_cpus = cluster_cpus()
+    while live_cpus < min_live_cpus:
+        live_cpus = cluster_cpus()
+        logger.info(f"Waiting for Live CPUs: {live_cpus}/{min_live_cpus}")
+        time.sleep(poll_interval_seconds)
+
+
 def live_node_resource_keys() -> List[str]:
     """Get Ray resource keys for all live cluster nodes as a list of strings of
     the form: "node:{node_resource_name}". The returned keys can be used to
@@ -83,7 +135,7 @@ def other_live_node_resource_keys() -> List[str]:

     For example, invoking this function from your Ray application driver on the
     head node returns the resource keys of all live worker nodes."""
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     return node_resource_keys(
         lambda n: n["NodeID"] != current_node_id and is_node_alive(n)
     )
@@ -97,7 +149,7 @@ def other_node_resource_keys() -> List[str]:

     For example, invoking this function from your Ray application driver on the
     head node returns the resource keys of all worker nodes."""
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     return node_resource_keys(lambda n: n["NodeID"] != current_node_id)
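Finally, a brief illustrative sketch (not part of the diff) of the new Ray runtime helpers; the single-node ray.init() call and the CPU threshold are example assumptions, and live_cpu_waiter blocks until the threshold is met.

import ray

from deltacat.utils.ray_utils.runtime import (
    current_node_resources,
    live_cpu_waiter,
)

ray.init()  # or ray.init(address="auto") against an existing cluster

# Block until the cluster reports at least 2 live CPUs (arbitrary example
# threshold), polling every half second, then inspect this node's resources.
live_cpu_waiter(min_live_cpus=2)
resources = current_node_resources()
print(resources.get("CPU"), resources.get("memory"))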