deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/export.py
CHANGED
@@ -5,7 +5,9 @@ import pyarrow.parquet
|
|
5
5
|
import pyarrow.feather
|
6
6
|
from typing import Callable, Dict
|
7
7
|
|
8
|
-
from deltacat.storage.rivulet.reader.query_expression import
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
9
11
|
from deltacat import logs
|
10
12
|
|
11
13
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
deltacat/utils/filesystem.py
CHANGED
@@ -2,12 +2,13 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import re
|
4
4
|
from typing import Optional, Tuple, Union, List
|
5
|
+
from datetime import timedelta
|
6
|
+
from enum import Enum
|
5
7
|
|
6
8
|
import sys
|
7
9
|
import urllib
|
8
10
|
import pathlib
|
9
11
|
|
10
|
-
import pyarrow
|
11
12
|
import pyarrow as pa
|
12
13
|
from pyarrow.fs import (
|
13
14
|
_resolve_filesystem_and_path,
|
@@ -17,15 +18,59 @@ from pyarrow.fs import (
|
|
17
18
|
FileSystem,
|
18
19
|
FSSpecHandler,
|
19
20
|
PyFileSystem,
|
21
|
+
GcsFileSystem,
|
22
|
+
LocalFileSystem,
|
23
|
+
S3FileSystem,
|
24
|
+
AzureFileSystem,
|
25
|
+
HadoopFileSystem,
|
20
26
|
)
|
21
27
|
|
22
28
|
_LOCAL_SCHEME = "local"
|
23
29
|
|
24
30
|
|
31
|
+
class FilesystemType(str, Enum):
|
32
|
+
LOCAL = "local"
|
33
|
+
S3 = "s3"
|
34
|
+
GCS = "gcs"
|
35
|
+
AZURE = "azure"
|
36
|
+
HADOOP = "hadoop"
|
37
|
+
UNKNOWN = "unknown"
|
38
|
+
|
39
|
+
@classmethod
|
40
|
+
def from_filesystem(cls, filesystem: FileSystem) -> FilesystemType:
|
41
|
+
if isinstance(filesystem, LocalFileSystem):
|
42
|
+
return cls.LOCAL
|
43
|
+
elif isinstance(filesystem, S3FileSystem):
|
44
|
+
return cls.S3
|
45
|
+
elif isinstance(filesystem, GcsFileSystem):
|
46
|
+
return cls.GCS
|
47
|
+
elif isinstance(filesystem, AzureFileSystem):
|
48
|
+
return cls.AZURE
|
49
|
+
elif isinstance(filesystem, HadoopFileSystem):
|
50
|
+
return cls.HADOOP
|
51
|
+
else:
|
52
|
+
return cls.UNKNOWN
|
53
|
+
|
54
|
+
@classmethod
|
55
|
+
def to_filesystem(cls, filesystem_type: FilesystemType) -> FileSystem:
|
56
|
+
if filesystem_type == cls.LOCAL:
|
57
|
+
return LocalFileSystem()
|
58
|
+
elif filesystem_type == cls.S3:
|
59
|
+
return S3FileSystem()
|
60
|
+
elif filesystem_type == cls.GCS:
|
61
|
+
return GcsFileSystem()
|
62
|
+
elif filesystem_type == cls.AZURE:
|
63
|
+
return AzureFileSystem()
|
64
|
+
elif filesystem_type == cls.HADOOP:
|
65
|
+
return HadoopFileSystem()
|
66
|
+
else:
|
67
|
+
raise ValueError(f"Unsupported filesystem type: {filesystem_type}")
|
68
|
+
|
69
|
+
|
25
70
|
def resolve_paths_and_filesystem(
|
26
71
|
paths: Union[str, List[str]],
|
27
|
-
filesystem:
|
28
|
-
) -> Tuple[List[str],
|
72
|
+
filesystem: FileSystem = None,
|
73
|
+
) -> Tuple[List[str], FileSystem]:
|
29
74
|
"""
|
30
75
|
Resolves and normalizes all provided paths, infers a filesystem from the
|
31
76
|
paths or validates the provided filesystem against the paths and ensures
|
@@ -113,19 +158,26 @@ def resolve_paths_and_filesystem(
|
|
113
158
|
else:
|
114
159
|
raise
|
115
160
|
if filesystem is None:
|
116
|
-
|
161
|
+
if isinstance(resolved_filesystem, GcsFileSystem):
|
162
|
+
# Configure a retry time limit for GcsFileSystem so that it
|
163
|
+
# doesn't hang forever trying to get file info (e.g., when
|
164
|
+
# trying to get a public file w/o anonymous=True).
|
165
|
+
filesystem = GcsFileSystem(
|
166
|
+
retry_time_limit=timedelta(seconds=60),
|
167
|
+
)
|
168
|
+
else:
|
169
|
+
filesystem = resolved_filesystem
|
117
170
|
elif need_unwrap_path_protocol:
|
118
171
|
resolved_path = _unwrap_protocol(resolved_path)
|
119
172
|
resolved_path = filesystem.normalize_path(resolved_path)
|
120
173
|
resolved_paths.append(resolved_path)
|
121
|
-
|
122
174
|
return resolved_paths, filesystem
|
123
175
|
|
124
176
|
|
125
177
|
def resolve_path_and_filesystem(
|
126
178
|
path: str,
|
127
|
-
filesystem: Optional[
|
128
|
-
) -> Tuple[str,
|
179
|
+
filesystem: Optional[FileSystem] = None,
|
180
|
+
) -> Tuple[str, FileSystem]:
|
129
181
|
"""
|
130
182
|
Resolves and normalizes the provided path, infers a filesystem from the
|
131
183
|
path or validates the provided filesystem against the path.
|
@@ -148,7 +200,7 @@ def resolve_path_and_filesystem(
|
|
148
200
|
|
149
201
|
def list_directory(
|
150
202
|
path: str,
|
151
|
-
filesystem:
|
203
|
+
filesystem: FileSystem,
|
152
204
|
exclude_prefixes: Optional[List[str]] = None,
|
153
205
|
ignore_missing_path: bool = False,
|
154
206
|
recursive: bool = False,
|
@@ -199,7 +251,7 @@ def list_directory(
|
|
199
251
|
|
200
252
|
def get_file_info(
|
201
253
|
path: str,
|
202
|
-
filesystem:
|
254
|
+
filesystem: FileSystem,
|
203
255
|
ignore_missing_path: bool = False,
|
204
256
|
) -> FileInfo:
|
205
257
|
"""Get the file info for the provided path."""
|
@@ -213,6 +265,62 @@ def get_file_info(
|
|
213
265
|
return file_info
|
214
266
|
|
215
267
|
|
268
|
+
def write_file(
|
269
|
+
path: str,
|
270
|
+
data: Union[str, bytes],
|
271
|
+
filesystem: Optional[FileSystem] = None,
|
272
|
+
) -> None:
|
273
|
+
"""
|
274
|
+
Write data to a file using any filesystem.
|
275
|
+
|
276
|
+
Args:
|
277
|
+
path: The file path to write to.
|
278
|
+
data: The data to write (string or bytes).
|
279
|
+
filesystem: The filesystem implementation to use. If None, will be inferred from the path.
|
280
|
+
"""
|
281
|
+
resolved_path, resolved_filesystem = resolve_path_and_filesystem(
|
282
|
+
path=path,
|
283
|
+
filesystem=filesystem,
|
284
|
+
)
|
285
|
+
|
286
|
+
# Convert string to bytes if necessary
|
287
|
+
if isinstance(data, str):
|
288
|
+
data = data.encode("utf-8")
|
289
|
+
|
290
|
+
with resolved_filesystem.open_output_stream(resolved_path) as f:
|
291
|
+
f.write(data)
|
292
|
+
|
293
|
+
|
294
|
+
def read_file(
|
295
|
+
path: str,
|
296
|
+
filesystem: Optional[FileSystem] = None,
|
297
|
+
fail_if_not_found: bool = True,
|
298
|
+
) -> Optional[bytes]:
|
299
|
+
"""
|
300
|
+
Read data from a file using any filesystem.
|
301
|
+
|
302
|
+
Args:
|
303
|
+
path: The file path to read from.
|
304
|
+
filesystem: The filesystem implementation to use. If None, will be inferred from the path.
|
305
|
+
fail_if_not_found: Whether to raise an error if the file is not found.
|
306
|
+
|
307
|
+
Returns:
|
308
|
+
The file data as bytes, or None if file not found and fail_if_not_found is False.
|
309
|
+
"""
|
310
|
+
try:
|
311
|
+
resolved_path, resolved_filesystem = resolve_path_and_filesystem(
|
312
|
+
path=path,
|
313
|
+
filesystem=filesystem,
|
314
|
+
)
|
315
|
+
|
316
|
+
with resolved_filesystem.open_input_stream(resolved_path) as f:
|
317
|
+
return f.read()
|
318
|
+
except FileNotFoundError:
|
319
|
+
if fail_if_not_found:
|
320
|
+
raise
|
321
|
+
return None
|
322
|
+
|
323
|
+
|
216
324
|
def _handle_read_os_error(
|
217
325
|
error: OSError,
|
218
326
|
paths: Union[str, List[str]],
|
@@ -227,6 +335,9 @@ def _handle_read_os_error(
|
|
227
335
|
r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
|
228
336
|
r"body\.(.*))$"
|
229
337
|
)
|
338
|
+
gcp_error_pattern = (
|
339
|
+
r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
|
340
|
+
)
|
230
341
|
if re.match(aws_error_pattern, str(error)):
|
231
342
|
# Specially handle AWS error when reading files, to give a clearer error
|
232
343
|
# message to avoid confusing users. The real issue is most likely that the AWS
|
@@ -243,9 +354,28 @@ def _handle_read_os_error(
|
|
243
354
|
"You can also run AWS CLI command to get more detailed error message "
|
244
355
|
"(e.g., aws s3 ls <file-name>). "
|
245
356
|
"See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html " # noqa
|
357
|
+
"and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
|
358
|
+
"for more information."
|
359
|
+
)
|
360
|
+
)
|
361
|
+
elif re.match(gcp_error_pattern, str(error)):
|
362
|
+
# Special handling for GCP errors (e.g., handling the special case of
|
363
|
+
# requiring the filesystem to be instantiated with anonymous access to
|
364
|
+
# read public files).
|
365
|
+
if isinstance(paths, str):
|
366
|
+
paths = f'"{paths}"'
|
367
|
+
raise OSError(
|
368
|
+
(
|
369
|
+
f"Failing to read GCP GS file(s): {paths}. "
|
370
|
+
"Please check that file exists and has properly configured access. "
|
371
|
+
"If this is a public file, please instantiate a filesystem with "
|
372
|
+
"anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
|
373
|
+
"to read it. See https://google.aip.dev/auth/4110 and "
|
374
|
+
"https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html" # noqa
|
246
375
|
"for more information."
|
247
376
|
)
|
248
377
|
)
|
378
|
+
|
249
379
|
else:
|
250
380
|
raise error
|
251
381
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import posixpath
|
2
2
|
import pyarrow.fs
|
3
3
|
|
4
|
+
from deltacat.constants import REV_DIR_NAME
|
4
5
|
from deltacat.storage.model.partition import PartitionLocator
|
5
6
|
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
6
7
|
|
@@ -28,7 +29,7 @@ def _find_first_child_with_rev(
|
|
28
29
|
)
|
29
30
|
for child in children:
|
30
31
|
if child.type == pyarrow.fs.FileType.Directory:
|
31
|
-
rev_path = posixpath.join(child.path,
|
32
|
+
rev_path = posixpath.join(child.path, REV_DIR_NAME)
|
32
33
|
if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
|
33
34
|
return child.base_name
|
34
35
|
raise ValueError(f"No directory with 'rev/' found under {parent_path}")
|
deltacat/utils/numpy.py
CHANGED
@@ -1,14 +1,21 @@
|
|
1
|
-
from typing import List, Optional, Callable, Union
|
1
|
+
from typing import List, Optional, Callable, Union, Dict, Any
|
2
2
|
|
3
|
+
import pandas as pd
|
3
4
|
import numpy as np
|
4
|
-
import pyarrow as pa
|
5
5
|
from fsspec import AbstractFileSystem
|
6
|
+
import pyarrow.fs as pafs
|
7
|
+
import logging
|
6
8
|
|
7
9
|
from ray.data.datasource import FilenameProvider
|
8
|
-
from deltacat.types.media import ContentType
|
10
|
+
from deltacat.types.media import ContentType, ContentEncoding
|
9
11
|
from deltacat.utils import pandas as pd_utils
|
10
|
-
|
12
|
+
|
11
13
|
from deltacat.utils.common import ReadKwargsProvider
|
14
|
+
from deltacat import logs
|
15
|
+
from deltacat.utils.performance import timed_invocation
|
16
|
+
from deltacat.types.partial_download import PartialFileDownloadParams
|
17
|
+
|
18
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
12
19
|
|
13
20
|
|
14
21
|
def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarray]:
|
@@ -22,26 +29,61 @@ def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarr
|
|
22
29
|
return [np_array[i : i + max_len] for i in range(0, len(np_array), max_len)]
|
23
30
|
|
24
31
|
|
25
|
-
def
|
26
|
-
|
32
|
+
def file_to_ndarray(
    path: str,
    content_type: str,
    content_encoding: str = ContentEncoding.IDENTITY.value,
    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
    fs_open_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Read a file into a NumPy ndarray using any filesystem.

    This function delegates to the pandas file_to_dataframe function and converts
    the resulting DataFrame to a NumPy ndarray.

    Args:
        path: The file path to read
        content_type: The content type of the file (e.g., ContentType.CSV.value)
        content_encoding: The content encoding (default: IDENTITY)
        filesystem: The filesystem to use; forwarded unchanged to
            file_to_dataframe (None leaves filesystem resolution to it)
        column_names: Optional column names to assign
        include_columns: Optional columns to include in the result
        pd_read_func_kwargs_provider: Optional kwargs provider for customization
        partial_file_download_params: Optional partial-download parameters,
            forwarded unchanged to file_to_dataframe
        fs_open_kwargs: Optional kwargs for filesystem open operations. None
            (the default) is treated as an empty dict.
        **kwargs: Additional kwargs passed to the reader function

    Returns:
        np.ndarray: The loaded data as a NumPy ndarray
    """
    # A fresh dict per call: a `{}` default would be a single object shared by
    # every invocation and visible to whatever the downstream reader does to it.
    if fs_open_kwargs is None:
        fs_open_kwargs = {}

    logger.debug(
        f"Reading {path} to NumPy ndarray. Content type: {content_type}. "
        f"Encoding: {content_encoding}"
    )

    # timed_invocation returns (result, elapsed) for the wrapped call.
    dataframe, latency = timed_invocation(
        pd_utils.file_to_dataframe,
        path=path,
        content_type=content_type,
        content_encoding=content_encoding,
        filesystem=filesystem,
        column_names=column_names,
        include_columns=include_columns,
        pd_read_func_kwargs_provider=pd_read_func_kwargs_provider,
        partial_file_download_params=partial_file_download_params,
        fs_open_kwargs=fs_open_kwargs,
        **kwargs,
    )

    # Report read time and DataFrame -> ndarray conversion time together.
    ndarray, conversion_latency = timed_invocation(dataframe.to_numpy)
    total_latency = latency + conversion_latency
    logger.debug(f"Time to read {path} into NumPy ndarray: {total_latency}s")
    return ndarray
|
45
87
|
|
46
88
|
|
47
89
|
def ndarray_size(np_array: np.ndarray) -> int:
|
@@ -51,22 +93,72 @@ def ndarray_size(np_array: np.ndarray) -> int:
|
|
51
93
|
def ndarray_to_file(
    np_array: np.ndarray,
    path: str,
    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
    block_path_provider: Union[FilenameProvider, Callable],
    content_type: str = ContentType.PARQUET.value,
    **kwargs,
) -> None:
    """
    Write a NumPy ndarray to a file by way of a pandas DataFrame.

    The array is wrapped in a DataFrame and handed to
    pd_utils.dataframe_to_file together with the remaining arguments.

    Args:
        np_array: The array to write.
        path: Destination path, forwarded to dataframe_to_file.
        filesystem: Filesystem, forwarded to dataframe_to_file.
        block_path_provider: Forwarded to dataframe_to_file.
        content_type: Content type of the written file (default: PARQUET).
        **kwargs: Forwarded to dataframe_to_file, except for "schema", which
            is removed here. When "schema" is a non-empty pyarrow Schema its
            field names are used as DataFrame column names.

    Raises:
        ValueError: If a pyarrow schema is given and the array is neither
            1-dimensional nor 2-dimensional.
    """
    import pyarrow as pa

    # "schema" is consumed here and is not passed on to dataframe_to_file.
    schema = kwargs.pop("schema", None)

    if not (schema and isinstance(schema, pa.Schema)):
        # No usable schema: let pandas assign its default column labels.
        frame = pd.DataFrame(np_array)
    elif np_array.ndim == 1:
        # A flat array becomes a single column named after the first field.
        label = schema.names[0] if schema.names else "0"
        frame = pd.DataFrame({label: np_array})
    elif np_array.ndim == 2:
        # Use the schema's names only when they line up one-to-one with the
        # array's columns; otherwise fall back to "0", "1", ...
        width = np_array.shape[1]
        if len(schema.names) == width:
            labels = schema.names
        else:
            labels = [f"{i}" for i in range(width)]
        frame = pd.DataFrame(np_array, columns=labels)
    else:
        raise ValueError(
            f"NumPy arrays with {np_array.ndim} dimensions are not supported"
        )

    pd_utils.dataframe_to_file(
        frame,
        path,
        filesystem,
        block_path_provider,
        content_type,
        **kwargs,
    )
|
139
|
+
|
140
|
+
|
141
|
+
def concat_ndarrays(arrays: List[np.ndarray]) -> Optional[np.ndarray]:
    """
    Concatenate a collection of NumPy ndarrays into a single ndarray.

    Args:
        arrays: NumPy ndarrays to concatenate along axis 0. Any iterable is
            accepted (including generators), not only lists.

    Returns:
        None if the input is None or empty; the sole element itself (not a
        copy) if there is exactly one array; otherwise the arrays
        concatenated along axis 0.
    """
    if arrays is None:
        return None
    # Materialize once so that non-sized iterables (e.g. generators) work and
    # the input is only traversed a single time.
    arrays = list(arrays)
    if not arrays:
        return None
    if len(arrays) == 1:
        # Nothing to join: hand back the original object without copying.
        return arrays[0]
    return np.concatenate(arrays, axis=0)
|
156
|
+
|
157
|
+
|
158
|
+
def append_column_to_ndarray(
    np_array: np.ndarray,
    column_name: str,
    column_value: Any,
) -> np.ndarray:
    """
    Append a constant-valued column to a NumPy ndarray.

    Args:
        np_array: The array to extend. A 1-dimensional array is treated as a
            single column of shape (n, 1); a 2-dimensional array is used as is.
        column_name: Not used by this function; the returned array carries no
            column labels.
        column_value: The value repeated in every row of the new column.

    Returns:
        A new 2-dimensional ndarray with one more column than the input.
    """
    if np_array.ndim == 1:
        # Concatenating along axis=1 needs a 2-D operand, so promote a flat
        # array to one column instead of failing on the dimension mismatch.
        np_array = np_array.reshape(-1, 1)
    # Add a new column with value repeating for each row of np_array
    new_column = np.full((len(np_array), 1), column_value)
    return np.concatenate((np_array, new_column), axis=1)
|