deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
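A large part of the package tree moved under the deltacat.experimental namespace in this release (the rivulet storage modules, the Iceberg catalog and storage bindings, and their tests), as the renamed paths above show. The short sketch below illustrates the corresponding import update; it assumes only the module paths listed above, with the pre-move imports shown as comments for comparison.

# Import update implied by the deltacat/storage/rivulet -> deltacat/experimental/storage/rivulet renames above.
# Old (2.0.0b10):
#   from deltacat.storage.rivulet.dataset import Dataset
#   from deltacat.storage.rivulet.reader.query_expression import QueryExpression
# New (2.0.0b12):
from deltacat.experimental.storage.rivulet.dataset import Dataset
from deltacat.experimental.storage.rivulet.reader.query_expression import QueryExpression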
deltacat/aws/s3u.py
CHANGED
@@ -1,171 +1,37 @@
 import logging
-import multiprocessing
-from functools import partial
-from typing import Any, Callable, Dict, Generator, List, Optional, Union
-from uuid import uuid4
+from typing import Any, Dict, Generator, Optional
 from botocore.config import Config
 from deltacat.aws.constants import (
     BOTO_MAX_RETRIES,
-    UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY,
     BOTO_THROTTLING_ERROR_CODES,
+)
+from deltacat.constants import (
+    UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY,
     RETRYABLE_TRANSIENT_ERRORS,
-    BOTO_TIMEOUT_ERROR_CODES,
-    UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY,
-    DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY,
 )

-import pyarrow.fs
-import ray
-import s3fs
 from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
 from botocore.exceptions import ClientError
-from ray.data.block import Block, BlockAccessor, BlockMetadata
-from ray.data.datasource import FilenameProvider
-from ray.types import ObjectRef
 from tenacity import (
     Retrying,
     retry_if_exception_type,
     stop_after_delay,
     wait_random_exponential,
 )
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.storage import (
-    DistributedDataset,
-    LocalDataset,
-    LocalTable,
-    Manifest,
-    ManifestEntry,
-    ManifestEntryList,
-)
-from deltacat.types.media import (
-    ContentEncoding,
-    ContentType,
-    TableType,
-    DistributedDatasetType,
-)
-from deltacat.types.tables import (
-    TABLE_CLASS_TO_SIZE_FUNC,
-    TABLE_TYPE_TO_S3_READER_FUNC,
-    TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
-    DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
-    get_table_length,
-)
 from deltacat.exceptions import (
     RetryableError,
-    RetryableUploadTableError,
-    RetryableDownloadTableError,
     RetryableDownloadFileError,
     RetryableUploadFileError,
     NonRetryableDownloadFileError,
     NonRetryableUploadFileError,
-    NonRetryableUploadTableError,
-    NonRetryableDownloadTableError,
 )
-from deltacat.types.partial_download import PartialFileDownloadParams
-from deltacat.utils.common import ReadKwargsProvider
-from deltacat.exceptions import categorize_errors

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


-class CapturedBlockWritePaths:
-    def __init__(self):
-        self._write_paths: List[str] = []
-        self._block_refs: List[ObjectRef[Block]] = []
-
-    def extend(self, write_paths: List[str], block_refs: List[ObjectRef[Block]]):
-        try:
-            iter(write_paths)
-        except TypeError:
-            pass
-        else:
-            self._write_paths.extend(write_paths)
-        try:
-            iter(block_refs)
-        except TypeError:
-            pass
-        else:
-            self._block_refs.extend(block_refs)
-
-    def write_paths(self) -> List[str]:
-        return self._write_paths
-
-    def block_refs(self) -> List[ObjectRef[Block]]:
-        return self._block_refs
-
-
-class UuidBlockWritePathProvider(FilenameProvider):
-    """Block write path provider implementation that writes each
-    dataset block out to a file of the form: {base_path}/{uuid}
-    """
-
-    def __init__(
-        self, capture_object: CapturedBlockWritePaths, base_path: Optional[str] = None
-    ):
-        self.base_path = base_path
-        self.write_paths: List[str] = []
-        self.block_refs: List[ObjectRef[Block]] = []
-        self.capture_object = capture_object
-
-    def __del__(self):
-        if self.write_paths or self.block_refs:
-            self.capture_object.extend(
-                self.write_paths,
-                self.block_refs,
-            )
-
-    def get_filename_for_block(
-        self, block: Any, task_index: int, block_index: int
-    ) -> str:
-        if self.base_path is None:
-            raise ValueError(
-                "Base path must be provided to UuidBlockWritePathProvider",
-            )
-        return self._get_write_path_for_block(
-            base_path=self.base_path,
-            block=block,
-            block_index=block_index,
-        )
-
-    def _get_write_path_for_block(
-        self,
-        base_path: str,
-        *,
-        filesystem: Optional[pyarrow.fs.FileSystem] = None,
-        dataset_uuid: Optional[str] = None,
-        block: Optional[ObjectRef[Block]] = None,
-        block_index: Optional[int] = None,
-        file_format: Optional[str] = None,
-    ) -> str:
-        write_path = f"{base_path}/{str(uuid4())}"
-        self.write_paths.append(write_path)
-        if block:
-            self.block_refs.append(block)
-        return write_path
-
-    def __call__(
-        self,
-        base_path: str,
-        *,
-        filesystem: Optional[pyarrow.fs.FileSystem] = None,
-        dataset_uuid: Optional[str] = None,
-        block: Optional[ObjectRef[Block]] = None,
-        block_index: Optional[int] = None,
-        file_format: Optional[str] = None,
-    ) -> str:
-        return self._get_write_path_for_block(
-            base_path,
-            filesystem=filesystem,
-            dataset_uuid=dataset_uuid,
-            block=block,
-            block_index=block_index,
-            file_format=file_format,
-        )
-
-
 class S3Url:
     def __init__(self, url: str):
@@ -248,312 +114,6 @@ def filter_objects_by_prefix(
         more_objects_to_list = params["ContinuationToken"] is not None


-@categorize_errors
-def read_file(
-    s3_url: str,
-    content_type: ContentType,
-    content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
-    table_type: TableType = TableType.PYARROW,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-    **s3_client_kwargs,
-) -> LocalTable:
-
-    reader = TABLE_TYPE_TO_S3_READER_FUNC[table_type.value]
-    try:
-        table = reader(
-            s3_url,
-            content_type.value,
-            content_encoding.value,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-            partial_file_download_params,
-            **s3_client_kwargs,
-        )
-        return table
-    except ClientError as e:
-        if (
-            e.response["Error"]["Code"]
-            in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
-        ):
-            # Timeout error not caught by botocore
-            raise RetryableDownloadTableError(
-                f"Retry table download from: {s3_url} after receiving {type(e).__name__}",
-            ) from e
-        raise NonRetryableDownloadTableError(
-            f"Failed table download from: {s3_url} after receiving {type(e).__name__}"
-        ) from e
-    except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise RetryableDownloadTableError(
-            f"Retry download for: {s3_url} after receiving {type(e).__name__}"
-        ) from e
-    except BaseException as e:
-        logger.warning(
-            f"Read has failed for {s3_url} and content_type={content_type} "
-            f"and encoding={content_encoding}. Error: {e}",
-            exc_info=True,
-        )
-        raise NonRetryableDownloadTableError(
-            f"Read has failed for {s3_url} and content_type={content_type} "
-            f"and encoding={content_encoding}",
-        ) from e
-
-
-def upload_sliced_table(
-    table: Union[LocalTable, DistributedDataset],
-    s3_url_prefix: str,
-    s3_file_system: s3fs.S3FileSystem,
-    max_records_per_entry: Optional[int],
-    s3_table_writer_func: Callable,
-    table_slicer_func: Callable,
-    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
-    content_type: ContentType = ContentType.PARQUET,
-    **s3_client_kwargs,
-) -> ManifestEntryList:
-
-    # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
-    retrying = Retrying(
-        wait=wait_random_exponential(multiplier=1, max=60),
-        stop=stop_after_delay(UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY),
-        retry=retry_if_exception_type(RetryableError),
-    )
-
-    manifest_entries = ManifestEntryList()
-    table_record_count = get_table_length(table)
-
-    if max_records_per_entry is None or not table_record_count:
-        # write the whole table to a single s3 file
-        manifest_entries = retrying(
-            upload_table,
-            table,
-            f"{s3_url_prefix}",
-            s3_file_system,
-            s3_table_writer_func,
-            s3_table_writer_kwargs,
-            content_type,
-            **s3_client_kwargs,
-        )
-    else:
-        # iteratively write table slices
-        table_slices = table_slicer_func(table, max_records_per_entry)
-        for table_slice in table_slices:
-            slice_entries = retrying(
-                upload_table,
-                table_slice,
-                f"{s3_url_prefix}",
-                s3_file_system,
-                s3_table_writer_func,
-                s3_table_writer_kwargs,
-                content_type,
-                **s3_client_kwargs,
-            )
-            manifest_entries.extend(slice_entries)
-    return manifest_entries
-
-
-def upload_table(
-    table: Union[LocalTable, DistributedDataset],
-    s3_base_url: str,
-    s3_file_system: s3fs.S3FileSystem,
-    s3_table_writer_func: Callable,
-    s3_table_writer_kwargs: Optional[Dict[str, Any]],
-    content_type: ContentType = ContentType.PARQUET,
-    **s3_client_kwargs,
-) -> ManifestEntryList:
-    """
-    Writes the given table to 1 or more S3 files and return
-    manifest entries describing the uploaded files.
-    """
-    if s3_table_writer_kwargs is None:
-        s3_table_writer_kwargs = {}
-
-    capture_object = CapturedBlockWritePaths()
-    block_write_path_provider = UuidBlockWritePathProvider(capture_object)
-    s3_table_writer_func(
-        table,
-        s3_base_url,
-        s3_file_system,
-        block_write_path_provider,
-        content_type.value,
-        **s3_table_writer_kwargs,
-    )
-    # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
-    del block_write_path_provider
-    block_refs = capture_object.block_refs()
-    write_paths = capture_object.write_paths()
-    metadata = _get_metadata(table, write_paths, block_refs)
-    manifest_entries = ManifestEntryList()
-    for block_idx, s3_url in enumerate(write_paths):
-        try:
-            manifest_entry = ManifestEntry.from_s3_obj_url(
-                s3_url,
-                metadata[block_idx].num_rows,
-                metadata[block_idx].size_bytes,
-                **s3_client_kwargs,
-            )
-            manifest_entries.append(manifest_entry)
-        except ClientError as e:
-            if e.response["Error"]["Code"] == "NoSuchKey":
-                # s3fs may swallow S3 errors - we were probably throttled
-                raise RetryableUploadTableError(
-                    f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
-                ) from e
-            if (
-                e.response["Error"]["Code"]
-                in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
-            ):
-                raise RetryableUploadTableError(
-                    f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
-                ) from e
-            raise NonRetryableUploadTableError(
-                f"Failed table upload to: {s3_url} after receiving {type(e).__name__}",
-            ) from e
-        except RETRYABLE_TRANSIENT_ERRORS as e:
-            raise RetryableUploadTableError(
-                f"Retry upload for: {s3_url} after receiving {type(e).__name__}",
-            ) from e
-        except BaseException as e:
-            logger.warning(
-                f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
-                exc_info=True,
-            )
-            raise NonRetryableUploadTableError(
-                f"Upload has failed for {s3_url} and content_type={content_type} because of {type(e).__name__}",
-            ) from e
-    return manifest_entries
-
-
-def download_manifest_entry(
-    manifest_entry: ManifestEntry,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    content_type: Optional[ContentType] = None,
-    content_encoding: Optional[ContentEncoding] = None,
-) -> LocalTable:
-
-    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
-    if not content_type:
-        content_type = manifest_entry.meta.content_type
-        assert (
-            content_type
-        ), f"Unknown content type for manifest entry: {manifest_entry}"
-    content_type = ContentType(content_type)
-    if not content_encoding:
-        content_encoding = manifest_entry.meta.content_encoding
-        assert (
-            content_encoding
-        ), f"Unknown content encoding for manifest entry: {manifest_entry}"
-    content_encoding = ContentEncoding(content_encoding)
-    s3_url = manifest_entry.uri
-    if s3_url is None:
-        s3_url = manifest_entry.url
-
-    partial_file_download_params = None
-    if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
-        for type_params in manifest_entry.meta.content_type_parameters:
-            if isinstance(type_params, PartialFileDownloadParams):
-                partial_file_download_params = type_params
-                break
-
-    # @retry decorator can't be pickled by Ray, so wrap download in Retrying
-    retrying = Retrying(
-        wait=wait_random_exponential(multiplier=1, max=60),
-        stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
-        retry=retry_if_exception_type(RetryableError),
-    )
-    table = retrying(
-        read_file,
-        s3_url,
-        content_type,
-        content_encoding,
-        table_type,
-        column_names,
-        include_columns,
-        file_reader_kwargs_provider,
-        partial_file_download_params,
-        **s3_client_kwargs,
-    )
-    return table
-
-
-@ray.remote
-def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
-    return download_manifest_entry(*args, **kwargs)
-
-
-def download_manifest_entries(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = 1,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    if max_parallelism and max_parallelism <= 1:
-        return _download_manifest_entries(
-            manifest,
-            token_holder,
-            table_type,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-        )
-    else:
-        return _download_manifest_entries_parallel(
-            manifest,
-            token_holder,
-            table_type,
-            max_parallelism,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-        )
-
-
-def download_manifest_entries_distributed(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = 1000,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
-    distributed_dataset_type: Optional[
-        DistributedDatasetType
-    ] = DistributedDatasetType.RAY_DATASET,
-) -> DistributedDataset:
-
-    params = {
-        "manifest": manifest,
-        "token_holder": token_holder,
-        "table_type": table_type,
-        "max_parallelism": max_parallelism,
-        "column_names": column_names,
-        "include_columns": include_columns,
-        "file_reader_kwargs_provider": file_reader_kwargs_provider,
-        "ray_options_provider": ray_options_provider,
-        "distributed_dataset_type": distributed_dataset_type,
-    }
-
-    if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
-        return _download_manifest_entries_ray_data_distributed(**params)
-    elif distributed_dataset_type is not None:
-        return _download_manifest_entries_all_dataset_distributed(**params)
-    else:
-        raise ValueError(
-            f"Distributed dataset type {distributed_dataset_type} not supported."
-        )
-
-
 def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:

     parsed_s3_url = parse_s3_url(s3_url)
@@ -643,61 +203,6 @@ def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True
     return None


-def _download_manifest_entries_parallel(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = None,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    tables = []
-    pool = multiprocessing.Pool(max_parallelism)
-    downloader = partial(
-        download_manifest_entry,
-        token_holder=token_holder,
-        table_type=table_type,
-        column_names=column_names,
-        include_columns=include_columns,
-        file_reader_kwargs_provider=file_reader_kwargs_provider,
-    )
-    for table in pool.map(downloader, [e for e in manifest.entries]):
-        tables.append(table)
-    return tables
-
-
-def _download_manifest_entries(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> LocalDataset:
-
-    return [
-        download_manifest_entry(
-            manifest_entry=e,
-            token_holder=token_holder,
-            table_type=table_type,
-            column_names=column_names,
-            include_columns=include_columns,
-            file_reader_kwargs_provider=file_reader_kwargs_provider,
-        )
-        for e in manifest.entries
-    ]
-
-
-@ray.remote
-def _block_metadata(block: Block) -> BlockMetadata:
-    return BlockAccessor.for_block(block).get_metadata(
-        input_files=None,
-        exec_stats=None,
-    )
-
-
 def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
     conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
     return (
@@ -710,135 +215,3 @@ def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
         if token_holder
         else {"config": conf}
     )
-
-
-def _get_metadata(
-    table: Union[LocalTable, DistributedDataset],
-    write_paths: List[str],
-    block_refs: List[ObjectRef[Block]],
-) -> List[BlockMetadata]:
-    metadata: List[BlockMetadata] = []
-    if not block_refs:
-        # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1, (
-            f"Expected table of type '{type(table)}' to be written to 1 "
-            f"file, but found {len(write_paths)} files."
-        )
-        table_size = None
-        table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
-        if table_size_func:
-            table_size = table_size_func(table)
-        else:
-            logger.warning(f"Unable to estimate '{type(table)}' table size.")
-        metadata.append(
-            BlockMetadata(
-                num_rows=get_table_length(table),
-                size_bytes=table_size,
-                schema=None,
-                input_files=None,
-                exec_stats=None,
-            )
-        )
-    else:
-        # TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
-        # ray 1.10
-        # metadata = dataset._blocks.get_metadata()
-        # ray 2.0.0dev
-        metadata = table._plan.execute().get_metadata()
-        if (
-            not metadata
-            or metadata[0].size_bytes is None
-            or metadata[0].num_rows is None
-        ):
-            metadata_futures = [
-                _block_metadata.remote(block_ref) for block_ref in block_refs
-            ]
-            metadata = ray.get(metadata_futures)
-    return metadata
-
-
-def _download_manifest_entries_ray_data_distributed(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = 1000,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
-) -> DistributedDataset:
-
-    table_pending_ids = []
-    manifest_entries = manifest.entries
-    if manifest_entries:
-        table_pending_ids = invoke_parallel(
-            manifest_entries,
-            download_manifest_entry_ray,
-            token_holder,
-            table_type,
-            column_names,
-            include_columns,
-            file_reader_kwargs_provider,
-            max_parallelism=max_parallelism,
-            options_provider=ray_options_provider,
-        )
-    return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
-
-
-def _download_manifest_entries_all_dataset_distributed(
-    manifest: Manifest,
-    token_holder: Optional[Dict[str, Any]] = None,
-    table_type: TableType = TableType.PYARROW,
-    max_parallelism: Optional[int] = 1000,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
-    distributed_dataset_type: Optional[
-        DistributedDatasetType
-    ] = DistributedDatasetType.RAY_DATASET,
-) -> DistributedDataset:
-
-    entry_content_type = None
-    entry_content_encoding = None
-    uris = []
-    for entry in manifest.entries or []:
-        if (
-            entry_content_type is not None
-            and entry_content_type != entry.meta.content_type
-        ):
-            raise ValueError(
-                f"Mixed content types of ({entry_content_type},"
-                f" {entry.meta.content_type}) is not supported."
-            )
-
-        if (
-            entry_content_encoding is not None
-            and entry_content_encoding != entry.meta.content_encoding
-        ):
-            raise ValueError(
-                f"Mixed content encoding of {entry_content_encoding},"
-                f" {entry.meta.content_encoding} is not supported."
-            )
-
-        entry_content_type = entry.meta.content_type
-        entry_content_encoding = entry.meta.content_encoding
-        uris.append(entry.uri)
-
-    s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
-
-    if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
-        return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
-            uris=uris,
-            content_type=entry_content_type,
-            content_encoding=entry_content_encoding,
-            column_names=column_names,
-            include_columns=include_columns,
-            read_func_kwargs_provider=file_reader_kwargs_provider,
-            ray_options_provider=ray_options_provider,
-            s3_client_kwargs=s3_client_kwargs,
-        )
-    else:
-        raise ValueError(
-            f"Unsupported distributed dataset type={distributed_dataset_type}"
-        )
deltacat/benchmarking/benchmark_engine.py
CHANGED
@@ -4,8 +4,10 @@ from contextlib import contextmanager
 from typing import Generator, Tuple

 from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
-from deltacat.storage.rivulet.dataset import Dataset
-from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat.experimental.storage.rivulet.dataset import Dataset
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)


 @contextmanager