deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/ray_utils/concurrency.py

````diff
@@ -88,7 +88,7 @@ def round_robin_options_provider(
     **kwargs,
 ) -> Dict[str, Any]:
     """Returns a resource dictionary that can be included with ray remote
-    options to round
+    options to round-robin indexed tasks or actors across a list of resource
     keys. For example, the following code round-robins 100 tasks across all
     live cluster nodes:
     ```
````
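The restored docstring line above describes round-robin placement of indexed tasks across a list of node resource keys. The sketch below illustrates that pattern with plain Ray calls; it reads the keys straight from `ray.nodes()` rather than through DeltaCAT's helpers, so treat the wiring (not the Ray API) as illustrative.

```python
# Minimal round-robin placement sketch: task i is pinned to resource key
# i % len(keys). On a single-node cluster every task lands on the same node.
import ray

ray.init(ignore_reinit_error=True)

keys = sorted(
    key
    for node in ray.nodes()
    for key in node["Resources"]
    if key.startswith("node:")
)

@ray.remote
def task(i: int) -> int:
    return i

futures = [
    task.options(resources={keys[i % len(keys)]: 0.01}).remote(i)
    for i in range(100)
]
print(sum(ray.get(futures)))  # 4950
```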
deltacat/utils/ray_utils/dataset.py

````diff
@@ -2,7 +2,11 @@ import logging
 from typing import Callable, Dict, List, Optional, Union
 
 from fsspec import AbstractFileSystem
+
+import pyarrow as pa
 from pyarrow import csv as pacsv
+import pyarrow.fs as pafs
+
 from ray.data import Dataset
 from ray.data.datasource import FilenameProvider
 
@@ -16,7 +20,7 @@ def write_parquet(
     dataset: Dataset,
     base_path: str,
     *,
-    filesystem: AbstractFileSystem,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     **kwargs,
 ) -> None:
````
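The signature change above widens `filesystem` from an fsspec `AbstractFileSystem` to also accept a native PyArrow filesystem, or `None`. A small sketch of the two flavors the annotation now admits, assuming both are passed through unchanged to Ray's writer:

```python
# Both filesystem flavors satisfy the widened annotation:
#   filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]]
import fsspec
import pyarrow.fs as pafs
from fsspec import AbstractFileSystem

fsspec_fs = fsspec.filesystem("file")                      # fsspec AbstractFileSystem
arrow_fs, _root = pafs.FileSystem.from_uri("file:///tmp")  # native pyarrow FileSystem

for fs in (fsspec_fs, arrow_fs, None):
    assert fs is None or isinstance(fs, (AbstractFileSystem, pafs.FileSystem))
```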
````diff
@@ -34,16 +38,36 @@ def write_csv(
     dataset: Dataset,
     base_path: str,
     *,
-    filesystem: AbstractFileSystem,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     **kwargs,
 ) -> None:
+    """
+    Write a Ray Dataset to a CSV file (or other delimited text format).
+    """
+    # Extract CSV-specific options from kwargs
+    delimiter = kwargs.pop("delimiter", ",")
+    quoting_style = kwargs.pop("quoting_style", None)
+    include_header = kwargs.pop("include_header", False)
+
+    # Create a function that will generate WriteOptions inside the worker process
+    def arrow_csv_args_fn():
+        write_options = pacsv.WriteOptions(
+            delimiter=delimiter,
+            include_header=include_header,
+            quoting_style=quoting_style,
+        )
+        return {"write_options": write_options}
+
+    # Check if the block_path_provider will generate .gz files to avoid double compression
+    pa_open_stream_args = {}
+    if not (
+        hasattr(block_path_provider, "content_encoding")
+        and block_path_provider.content_encoding == ContentEncoding.GZIP
+    ):
+        # Block path provider will not generate .gz files, so we need to apply explicit compression
+        pa_open_stream_args["compression"] = ContentEncoding.GZIP.value
 
-    # column names are kept in table metadata, so omit header
-    arrow_csv_args_fn = lambda: {
-        "write_options": pacsv.WriteOptions(include_header=False)
-    }
-    pa_open_stream_args = {"compression": ContentEncoding.GZIP.value}
     dataset.write_csv(
         base_path,
         arrow_open_stream_args=pa_open_stream_args,
````
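The new `write_csv` body builds `pyarrow.csv.WriteOptions` from `delimiter`, `include_header`, and `quoting_style` kwargs inside the worker. The standalone sketch below applies the same options with PyArrow's CSV writer directly, using the tab-separated, headerless, unquoted settings that the TSV content types map to:

```python
import io

import pyarrow as pa
from pyarrow import csv as pacsv

table = pa.table({"id": [1, 2], "name": ["a", "b"]})
# Same option shape that arrow_csv_args_fn() returns for unescaped TSV.
opts = pacsv.WriteOptions(delimiter="\t", include_header=False, quoting_style="none")

buf = io.BytesIO()
pacsv.write_csv(table, buf, write_options=opts)
print(buf.getvalue().decode())  # two tab-separated rows, no header row
```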
````diff
@@ -55,12 +79,76 @@ def write_csv(
     )
 
 
+def write_json(
+    dataset: Dataset,
+    base_path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
+    block_path_provider: Union[Callable, FilenameProvider],
+    **kwargs,
+) -> None:
+    """
+    Write a Ray Dataset to a JSON file using Ray's native JSON writer.
+    """
+    # Check if the block_path_provider will generate .gz files to avoid double compression
+    pa_open_stream_args = {}
+    if not (
+        hasattr(block_path_provider, "content_encoding")
+        and block_path_provider.content_encoding == ContentEncoding.GZIP
+    ):
+        # Block path provider will not generate .gz files, so we need to apply explicit compression
+        pa_open_stream_args["compression"] = ContentEncoding.GZIP.value
+
+    dataset.write_json(
+        base_path,
+        arrow_open_stream_args=pa_open_stream_args,
+        filesystem=filesystem,
+        try_create_dir=False,
+        filename_provider=block_path_provider,
+        **kwargs,
+    )
+
+
 CONTENT_TYPE_TO_DATASET_WRITE_FUNC: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: write_csv,
+    ContentType.TSV.value: write_csv,
     ContentType.CSV.value: write_csv,
+    ContentType.PSV.value: write_csv,
     ContentType.PARQUET.value: write_parquet,
+    ContentType.JSON.value: write_json,
 }
 
 
+def content_type_to_writer_kwargs(content_type: str) -> Dict[str, any]:
+    """
+    Returns writer kwargs for the given content type when writing with Ray Dataset.
+    """
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "delimiter": "\t",
+            "include_header": False,
+            "quoting_style": "none",
+        }
+    if content_type == ContentType.TSV.value:
+        return {
+            "delimiter": "\t",
+            "include_header": False,
+        }
+    if content_type == ContentType.CSV.value:
+        return {
+            "delimiter": ",",
+            "include_header": False,
+        }
+    if content_type == ContentType.PSV.value:
+        return {
+            "delimiter": "|",
+            "include_header": False,
+        }
+    if content_type in {ContentType.PARQUET.value, ContentType.JSON.value}:
+        return {}
+    raise ValueError(f"Unsupported content type: {content_type}")
+
+
 def slice_dataset(dataset: Dataset, max_len: Optional[int]) -> List[Dataset]:
     """
     Returns equally-sized dataset slices of up to `max_len` records each.
````
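With the table and helper above, each content type resolves to a writer function plus default writer kwargs. A hedged usage sketch, assuming the import paths `deltacat.utils.ray_utils.dataset` and `deltacat.types.media` implied by the file listing:

```python
from deltacat.types.media import ContentType  # assumed import path
from deltacat.utils.ray_utils.dataset import (  # assumed import path
    CONTENT_TYPE_TO_DATASET_WRITE_FUNC,
    content_type_to_writer_kwargs,
)

print(content_type_to_writer_kwargs(ContentType.UNESCAPED_TSV.value))
# {'delimiter': '\t', 'include_header': False, 'quoting_style': 'none'}
print(content_type_to_writer_kwargs(ContentType.PARQUET.value))    # {}
print(CONTENT_TYPE_TO_DATASET_WRITE_FUNC[ContentType.JSON.value])  # write_json
```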
````diff
@@ -88,9 +176,10 @@ def dataset_size(dataset: Dataset) -> int:
 def dataset_to_file(
     table: Dataset,
     base_path: str,
-
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ) -> None:
     """
@@ -103,10 +192,12 @@ def dataset_to_file(
         f" implemented. Known content types: "
         f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}"
     )
+    writer_kwargs = content_type_to_writer_kwargs(content_type)
+    writer_kwargs.update(kwargs)
     writer(
         table,
         base_path,
-        filesystem=
+        filesystem=filesystem,
         block_path_provider=block_path_provider,
-        **
+        **writer_kwargs,
     )
````
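In the updated `dataset_to_file`, content-type defaults are computed first and then overlaid with caller kwargs via `writer_kwargs.update(kwargs)`, so explicit arguments win. A minimal illustration of that precedence:

```python
# Caller-supplied kwargs override the content-type defaults, mirroring
# writer_kwargs.update(kwargs) in dataset_to_file.
defaults = {"delimiter": ",", "include_header": False}  # e.g. CSV defaults
caller_kwargs = {"include_header": True}

writer_kwargs = dict(defaults)
writer_kwargs.update(caller_kwargs)
assert writer_kwargs == {"delimiter": ",", "include_header": True}
```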
deltacat/utils/ray_utils/runtime.py

````diff
@@ -21,7 +21,7 @@ def node_resource_keys(
     keys = []
     node_dict = ray.nodes()
     if node_dict:
-        for node in
+        for node in node_dict:
             if filter_fn(node):
                 for key in node["Resources"].keys():
                     if key.startswith("node:"):
@@ -37,7 +37,7 @@ def current_node_resource_key() -> str:
     actors on that node via:
     `foo.options(resources={get_current_node_resource_key(): 0.01}).remote()`
     """
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
     assert (
         len(keys) <= 1
````
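The docstring quoted in the hunk above shows the intended use of the node resource key: request a tiny slice of the node's custom resource to pin work to it. A hedged sketch, assuming the `deltacat.utils.ray_utils.runtime` module path from the file listing:

```python
import ray

from deltacat.utils.ray_utils.runtime import current_node_resource_key  # assumed path

ray.init(ignore_reinit_error=True)

@ray.remote
def where_am_i() -> str:
    return ray.get_runtime_context().get_node_id()

key = current_node_resource_key()  # e.g. "node:127.0.0.1"
if key:
    # A fractional amount of the node's custom resource pins the task to that node.
    node_id = ray.get(where_am_i.options(resources={key: 0.01}).remote())
    print(node_id == ray.get_runtime_context().get_node_id())  # True
```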
````diff
@@ -45,6 +45,47 @@ def current_node_resource_key() -> str:
     return keys[0] if len(keys) == 1 else None
 
 
+def current_node_resources() -> Dict[str, float]:
+    """Get's Ray's resources for the current node as a dictionary.
+
+    Example Return Value:
+    >>> {
+    >>>     'memory': 17611605607.0,
+    >>>     'node:127.0.0.1': 1.0,
+    >>>     'node:__internal_head__': 1.0,
+    >>>     'object_store_memory': 2147483648.0,
+    >>>     'CPU': 10.0,
+    >>> }
+    """
+    current_node_id = ray.get_runtime_context().get_node_id()
+    node_dict = ray.nodes()
+    if node_dict:
+        for node in node_dict:
+            if node["NodeID"] == current_node_id:
+                return node["Resources"]
+    else:
+        raise ValueError("No node dictionary found on current node.")
+    return {}
+
+
+def find_max_single_node_resource_type(resource_type: str) -> float:
+    """Finds the max resource amount available on any single cluster node
+    for the given resource type. Returns the max resource amount as a float."""
+    node_dict = ray.nodes()
+    max_single_node_resource_amount = 0
+    if node_dict:
+        for node in node_dict:
+            node_resource_amount = node["Resources"].get(resource_type)
+            if node_resource_amount is not None:
+                max_single_node_resource_amount = max(
+                    max_single_node_resource_amount,
+                    node_resource_amount,
+                )
+    else:
+        raise ValueError("No node dictionary found on current node.")
+    return max_single_node_resource_amount
+
+
 def is_node_alive(node: Dict[str, Any]) -> bool:
     """Takes a node from `ray.nodes()` as input. Returns True if the node is
     alive, and False otherwise."""
````
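A short usage sketch for the two new helpers added above (module path assumed from the file listing):

```python
import ray

from deltacat.utils.ray_utils.runtime import (  # assumed import path
    current_node_resources,
    find_max_single_node_resource_type,
)

ray.init(ignore_reinit_error=True)

resources = current_node_resources()  # e.g. {"CPU": 10.0, "memory": ..., ...}
max_cpus = find_max_single_node_resource_type("CPU")
print(resources.get("CPU"), max_cpus)
```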
````diff
@@ -67,6 +108,17 @@ def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) ->
         time.sleep(poll_interval_seconds)
 
 
+def live_cpu_waiter(min_live_cpus: int, poll_interval_seconds: float = 0.5) -> None:
+    """Waits until the given minimum number of live CPUs are present in the
+    cluster. Checks the current number of live CPUs every
+    `poll_interval_seconds`."""
+    live_cpus = cluster_cpus()
+    while live_cpus < min_live_cpus:
+        live_cpus = cluster_cpus()
+        logger.info(f"Waiting for Live CPUs: {live_cpus}/{min_live_cpus}")
+        time.sleep(poll_interval_seconds)
+
+
 def live_node_resource_keys() -> List[str]:
     """Get Ray resource keys for all live cluster nodes as a list of strings of
     the form: "node:{node_resource_name}". The returned keys can be used to
````
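`live_cpu_waiter` complements the existing `live_node_waiter` by polling total cluster CPUs instead of node count, which is useful while an autoscaler is still adding workers. A hedged sketch (module path assumed; a real deployment would target a multi-node cluster and a larger threshold):

```python
import ray

from deltacat.utils.ray_utils.runtime import live_cpu_waiter  # assumed import path

ray.init(ignore_reinit_error=True)

# Returns as soon as at least this many CPUs are live; on a laptop this is
# immediate, on an autoscaling cluster it blocks until enough workers join.
live_cpu_waiter(min_live_cpus=1, poll_interval_seconds=2.0)
print("cluster has enough CPUs")
```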
````diff
@@ -83,7 +135,7 @@ def other_live_node_resource_keys() -> List[str]:
 
     For example, invoking this function from your Ray application driver on the
     head node returns the resource keys of all live worker nodes."""
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     return node_resource_keys(
         lambda n: n["NodeID"] != current_node_id and is_node_alive(n)
     )
@@ -97,7 +149,7 @@ def other_node_resource_keys() -> List[str]:
 
     For example, invoking this function from your Ray application driver on the
     head node returns the resource keys of all worker nodes."""
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     return node_resource_keys(lambda n: n["NodeID"] != current_node_id)
 
 
````
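Finally, the worker-node helpers touched above combine with the same pinning trick to fan work out across every other live node. A hedged sketch (module path assumed; the key list is empty on a single-node cluster):

```python
import ray

from deltacat.utils.ray_utils.runtime import other_live_node_resource_keys  # assumed path

ray.init(ignore_reinit_error=True)

@ray.remote
def ping() -> str:
    return ray.get_runtime_context().get_node_id()

worker_keys = other_live_node_resource_keys()  # [] when only the head node is live
futures = [ping.options(resources={key: 0.01}).remote() for key in worker_keys]
print(ray.get(futures))
```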