deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/utils/export.py
CHANGED
@@ -5,7 +5,9 @@ import pyarrow.parquet
|
|
5
5
|
import pyarrow.feather
|
6
6
|
from typing import Callable, Dict
|
7
7
|
|
8
|
-
from deltacat.storage.rivulet.reader.query_expression import QueryExpression
|
8
|
+
from deltacat.experimental.storage.rivulet.reader.query_expression import (
|
9
|
+
QueryExpression,
|
10
|
+
)
|
9
11
|
from deltacat import logs
|
10
12
|
|
11
13
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
deltacat/utils/filesystem.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
import re
|
4
4
|
from typing import Optional, Tuple, Union, List
|
5
5
|
from datetime import timedelta
|
6
|
+
from enum import Enum
|
6
7
|
|
7
8
|
import sys
|
8
9
|
import urllib
|
@@ -18,11 +19,54 @@ from pyarrow.fs import (
|
|
18
19
|
FSSpecHandler,
|
19
20
|
PyFileSystem,
|
20
21
|
GcsFileSystem,
|
22
|
+
LocalFileSystem,
|
23
|
+
S3FileSystem,
|
24
|
+
AzureFileSystem,
|
25
|
+
HadoopFileSystem,
|
21
26
|
)
|
22
27
|
|
23
28
|
_LOCAL_SCHEME = "local"
|
24
29
|
|
25
30
|
|
31
|
+
class FilesystemType(str, Enum):
|
32
|
+
LOCAL = "local"
|
33
|
+
S3 = "s3"
|
34
|
+
GCS = "gcs"
|
35
|
+
AZURE = "azure"
|
36
|
+
HADOOP = "hadoop"
|
37
|
+
UNKNOWN = "unknown"
|
38
|
+
|
39
|
+
@classmethod
|
40
|
+
def from_filesystem(cls, filesystem: FileSystem) -> FilesystemType:
|
41
|
+
if isinstance(filesystem, LocalFileSystem):
|
42
|
+
return cls.LOCAL
|
43
|
+
elif isinstance(filesystem, S3FileSystem):
|
44
|
+
return cls.S3
|
45
|
+
elif isinstance(filesystem, GcsFileSystem):
|
46
|
+
return cls.GCS
|
47
|
+
elif isinstance(filesystem, AzureFileSystem):
|
48
|
+
return cls.AZURE
|
49
|
+
elif isinstance(filesystem, HadoopFileSystem):
|
50
|
+
return cls.HADOOP
|
51
|
+
else:
|
52
|
+
return cls.UNKNOWN
|
53
|
+
|
54
|
+
@classmethod
|
55
|
+
def to_filesystem(cls, filesystem_type: FilesystemType) -> FileSystem:
|
56
|
+
if filesystem_type == cls.LOCAL:
|
57
|
+
return LocalFileSystem()
|
58
|
+
elif filesystem_type == cls.S3:
|
59
|
+
return S3FileSystem()
|
60
|
+
elif filesystem_type == cls.GCS:
|
61
|
+
return GcsFileSystem()
|
62
|
+
elif filesystem_type == cls.AZURE:
|
63
|
+
return AzureFileSystem()
|
64
|
+
elif filesystem_type == cls.HADOOP:
|
65
|
+
return HadoopFileSystem()
|
66
|
+
else:
|
67
|
+
raise ValueError(f"Unsupported filesystem type: {filesystem_type}")
|
68
|
+
|
69
|
+
|
26
70
|
def resolve_paths_and_filesystem(
|
27
71
|
paths: Union[str, List[str]],
|
28
72
|
filesystem: FileSystem = None,
|
@@ -221,6 +265,62 @@ def get_file_info(
|
|
221
265
|
return file_info
|
222
266
|
|
223
267
|
|
268
|
+
def write_file(
|
269
|
+
path: str,
|
270
|
+
data: Union[str, bytes],
|
271
|
+
filesystem: Optional[FileSystem] = None,
|
272
|
+
) -> None:
|
273
|
+
"""
|
274
|
+
Write data to a file using any filesystem.
|
275
|
+
|
276
|
+
Args:
|
277
|
+
path: The file path to write to.
|
278
|
+
data: The data to write (string or bytes).
|
279
|
+
filesystem: The filesystem implementation to use. If None, will be inferred from the path.
|
280
|
+
"""
|
281
|
+
resolved_path, resolved_filesystem = resolve_path_and_filesystem(
|
282
|
+
path=path,
|
283
|
+
filesystem=filesystem,
|
284
|
+
)
|
285
|
+
|
286
|
+
# Convert string to bytes if necessary
|
287
|
+
if isinstance(data, str):
|
288
|
+
data = data.encode("utf-8")
|
289
|
+
|
290
|
+
with resolved_filesystem.open_output_stream(resolved_path) as f:
|
291
|
+
f.write(data)
|
292
|
+
|
293
|
+
|
294
|
+
def read_file(
|
295
|
+
path: str,
|
296
|
+
filesystem: Optional[FileSystem] = None,
|
297
|
+
fail_if_not_found: bool = True,
|
298
|
+
) -> Optional[bytes]:
|
299
|
+
"""
|
300
|
+
Read data from a file using any filesystem.
|
301
|
+
|
302
|
+
Args:
|
303
|
+
path: The file path to read from.
|
304
|
+
filesystem: The filesystem implementation to use. If None, will be inferred from the path.
|
305
|
+
fail_if_not_found: Whether to raise an error if the file is not found.
|
306
|
+
|
307
|
+
Returns:
|
308
|
+
The file data as bytes, or None if file not found and fail_if_not_found is False.
|
309
|
+
"""
|
310
|
+
try:
|
311
|
+
resolved_path, resolved_filesystem = resolve_path_and_filesystem(
|
312
|
+
path=path,
|
313
|
+
filesystem=filesystem,
|
314
|
+
)
|
315
|
+
|
316
|
+
with resolved_filesystem.open_input_stream(resolved_path) as f:
|
317
|
+
return f.read()
|
318
|
+
except FileNotFoundError:
|
319
|
+
if fail_if_not_found:
|
320
|
+
raise
|
321
|
+
return None
|
322
|
+
|
323
|
+
|
224
324
|
def _handle_read_os_error(
|
225
325
|
error: OSError,
|
226
326
|
paths: Union[str, List[str]],
|
deltacat/utils/metafile_locator.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import posixpath
|
2
2
|
import pyarrow.fs
|
3
3
|
|
4
|
+
from deltacat.constants import REV_DIR_NAME
|
4
5
|
from deltacat.storage.model.partition import PartitionLocator
|
5
6
|
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
6
7
|
|
@@ -28,7 +29,7 @@ def _find_first_child_with_rev(
|
|
28
29
|
)
|
29
30
|
for child in children:
|
30
31
|
if child.type == pyarrow.fs.FileType.Directory:
|
31
|
-
rev_path = posixpath.join(child.path,
|
32
|
+
rev_path = posixpath.join(child.path, REV_DIR_NAME)
|
32
33
|
if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
|
33
34
|
return child.base_name
|
34
35
|
raise ValueError(f"No directory with 'rev/' found under {parent_path}")
|
deltacat/utils/numpy.py
CHANGED
@@ -1,14 +1,21 @@
|
|
1
|
-
from typing import List, Optional, Callable, Union
|
1
|
+
from typing import List, Optional, Callable, Union, Dict, Any
|
2
2
|
|
3
|
+
import pandas as pd
|
3
4
|
import numpy as np
|
4
|
-
import pyarrow as pa
|
5
5
|
from fsspec import AbstractFileSystem
|
6
|
+
import pyarrow.fs as pafs
|
7
|
+
import logging
|
6
8
|
|
7
9
|
from ray.data.datasource import FilenameProvider
|
8
|
-
from deltacat.types.media import ContentType
|
10
|
+
from deltacat.types.media import ContentType, ContentEncoding
|
9
11
|
from deltacat.utils import pandas as pd_utils
|
10
|
-
|
12
|
+
|
11
13
|
from deltacat.utils.common import ReadKwargsProvider
|
14
|
+
from deltacat import logs
|
15
|
+
from deltacat.utils.performance import timed_invocation
|
16
|
+
from deltacat.types.partial_download import PartialFileDownloadParams
|
17
|
+
|
18
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
12
19
|
|
13
20
|
|
14
21
|
def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarray]:
|
@@ -22,26 +29,61 @@ def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarr
|
|
22
29
|
return [np_array[i : i + max_len] for i in range(0, len(np_array), max_len)]
|
23
30
|
|
24
31
|
|
25
|
-
def
|
26
|
-
|
32
|
+
def file_to_ndarray(
|
33
|
+
path: str,
|
27
34
|
content_type: str,
|
28
|
-
content_encoding: str,
|
35
|
+
content_encoding: str = ContentEncoding.IDENTITY.value,
|
36
|
+
filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
|
29
37
|
column_names: Optional[List[str]] = None,
|
30
38
|
include_columns: Optional[List[str]] = None,
|
31
39
|
pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
32
|
-
|
40
|
+
partial_file_download_params: Optional[PartialFileDownloadParams] = None,
|
41
|
+
fs_open_kwargs: Dict[str, Any] = {},
|
42
|
+
**kwargs,
|
33
43
|
) -> np.ndarray:
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
44
|
+
"""
|
45
|
+
Read a file into a NumPy ndarray using any filesystem.
|
46
|
+
|
47
|
+
This function delegates to the pandas file_to_dataframe function and converts
|
48
|
+
the resulting DataFrame to a NumPy ndarray.
|
49
|
+
|
50
|
+
Args:
|
51
|
+
path: The file path to read
|
52
|
+
content_type: The content type of the file (e.g., ContentType.CSV.value)
|
53
|
+
content_encoding: The content encoding (default: IDENTITY)
|
54
|
+
filesystem: The filesystem to use (if None, will be inferred from path)
|
55
|
+
column_names: Optional column names to assign
|
56
|
+
include_columns: Optional columns to include in the result
|
57
|
+
pd_read_func_kwargs_provider: Optional kwargs provider for customization
|
58
|
+
fs_open_kwargs: Optional kwargs for filesystem open operations
|
59
|
+
**kwargs: Additional kwargs passed to the reader function
|
60
|
+
|
61
|
+
Returns:
|
62
|
+
np.ndarray: The loaded data as a NumPy ndarray
|
63
|
+
"""
|
64
|
+
logger.debug(
|
65
|
+
f"Reading {path} to NumPy ndarray. Content type: {content_type}. "
|
66
|
+
f"Encoding: {content_encoding}"
|
43
67
|
)
|
44
|
-
|
68
|
+
|
69
|
+
dataframe, latency = timed_invocation(
|
70
|
+
pd_utils.file_to_dataframe,
|
71
|
+
path=path,
|
72
|
+
content_type=content_type,
|
73
|
+
content_encoding=content_encoding,
|
74
|
+
filesystem=filesystem,
|
75
|
+
column_names=column_names,
|
76
|
+
include_columns=include_columns,
|
77
|
+
pd_read_func_kwargs_provider=pd_read_func_kwargs_provider,
|
78
|
+
partial_file_download_params=partial_file_download_params,
|
79
|
+
fs_open_kwargs=fs_open_kwargs,
|
80
|
+
**kwargs,
|
81
|
+
)
|
82
|
+
|
83
|
+
ndarray, conversion_latency = timed_invocation(dataframe.to_numpy)
|
84
|
+
total_latency = latency + conversion_latency
|
85
|
+
logger.debug(f"Time to read {path} into NumPy ndarray: {total_latency}s")
|
86
|
+
return ndarray
|
45
87
|
|
46
88
|
|
47
89
|
def ndarray_size(np_array: np.ndarray) -> int:
|
@@ -51,22 +93,72 @@ def ndarray_size(np_array: np.ndarray) -> int:
|
|
51
93
|
def ndarray_to_file(
|
52
94
|
np_array: np.ndarray,
|
53
95
|
path: str,
|
54
|
-
|
96
|
+
filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
|
55
97
|
block_path_provider: Union[FilenameProvider, Callable],
|
56
98
|
content_type: str = ContentType.PARQUET.value,
|
57
|
-
**kwargs
|
99
|
+
**kwargs,
|
58
100
|
) -> None:
|
59
101
|
"""
|
60
102
|
Writes the given Numpy ndarray to a file.
|
61
103
|
"""
|
104
|
+
import pyarrow as pa
|
62
105
|
|
63
|
-
#
|
64
|
-
|
65
|
-
|
66
|
-
|
106
|
+
# Extract schema from kwargs if available
|
107
|
+
schema = kwargs.pop("schema", None)
|
108
|
+
|
109
|
+
# Convert to pandas DataFrame with proper column names if schema is available
|
110
|
+
if schema and isinstance(schema, pa.Schema):
|
111
|
+
if np_array.ndim == 1:
|
112
|
+
# 1D array: single column
|
113
|
+
column_names = [schema.names[0]] if schema.names else ["0"]
|
114
|
+
df = pd.DataFrame({column_names[0]: np_array})
|
115
|
+
elif np_array.ndim == 2:
|
116
|
+
# 2D array: multiple columns
|
117
|
+
column_names = (
|
118
|
+
schema.names
|
119
|
+
if len(schema.names) == np_array.shape[1]
|
120
|
+
else [f"{i}" for i in range(np_array.shape[1])]
|
121
|
+
)
|
122
|
+
df = pd.DataFrame(np_array, columns=column_names)
|
123
|
+
else:
|
124
|
+
raise ValueError(
|
125
|
+
f"NumPy arrays with {np_array.ndim} dimensions are not supported"
|
126
|
+
)
|
127
|
+
else:
|
128
|
+
# Fallback to generic column names
|
129
|
+
df = pd.DataFrame(np_array)
|
130
|
+
|
131
|
+
pd_utils.dataframe_to_file(
|
132
|
+
df,
|
67
133
|
path,
|
68
|
-
|
134
|
+
filesystem,
|
69
135
|
block_path_provider,
|
70
136
|
content_type,
|
71
|
-
**kwargs
|
137
|
+
**kwargs,
|
72
138
|
)
|
139
|
+
|
140
|
+
|
141
|
+
def concat_ndarrays(arrays: List[np.ndarray]) -> Optional[np.ndarray]:
|
142
|
+
"""
|
143
|
+
Concatenate a list of NumPy ndarrays into a single ndarray.
|
144
|
+
|
145
|
+
Args:
|
146
|
+
arrays: List of NumPy ndarrays to concatenate
|
147
|
+
|
148
|
+
Returns:
|
149
|
+
Concatenated NumPy ndarray, or None if input is empty
|
150
|
+
"""
|
151
|
+
if arrays is None or not len(arrays):
|
152
|
+
return None
|
153
|
+
if len(arrays) == 1:
|
154
|
+
return next(iter(arrays))
|
155
|
+
return np.concatenate(arrays, axis=0)
|
156
|
+
|
157
|
+
|
158
|
+
def append_column_to_ndarray(
|
159
|
+
np_array: np.ndarray,
|
160
|
+
column_name: str,
|
161
|
+
column_value: Any,
|
162
|
+
) -> np.ndarray:
|
163
|
+
# Add a new column with value repeating for each row of np_array
|
164
|
+
return np.concatenate((np_array, np.full((len(np_array), 1), column_value)), axis=1)
|