deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/utils/polars.py
CHANGED
```diff
@@ -1,58 +1,156 @@
 import logging
-
+import bz2
+import gzip
+from functools import partial
+from typing import Optional, List, Dict, Callable, Union, Iterable, Any
 
 import polars as pl
+import pyarrow as pa
+import pyarrow.fs as pafs
 
 from fsspec import AbstractFileSystem
 from ray.data.datasource import FilenameProvider
 
 from deltacat import logs
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
+from deltacat.utils.performance import timed_invocation
 
-from deltacat.types.media import
+from deltacat.types.media import (
+    ContentType,
+    ContentEncoding,
+    DELIMITED_TEXT_CONTENT_TYPES,
+    TABULAR_CONTENT_TYPES,
+)
+from deltacat.types.partial_download import PartialFileDownloadParams
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+# Encoding to file initialization function mapping
+ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
+    ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
+    ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
+}
+
 
 def write_json(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
     **write_kwargs,
 ) -> None:
-    if
-
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses
+                table.write_ndjson(f, **write_kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_ndjson(f, **write_kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
-
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    table.write_ndjson(out, **write_kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_ndjson(f, **write_kwargs)
+
+
+def content_type_to_writer_kwargs(content_type: str) -> Dict[str, any]:
+    """
+    Returns writer kwargs for the given content type when writing with polars.
+    """
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "separator": "\t",
+            "include_header": False,
+            "null_value": "",
+            "quote_style": "never",  # Equivalent to QUOTE_NONE in pandas
+        }
+    if content_type == ContentType.TSV.value:
+        return {
+            "separator": "\t",
+            "include_header": False,
+            "quote_style": "necessary",
+        }
+    if content_type == ContentType.CSV.value:
+        return {
+            "separator": ",",
+            "include_header": False,
+            "quote_style": "necessary",
+        }
+    if content_type == ContentType.PSV.value:
+        return {
+            "separator": "|",
+            "include_header": False,
+            "quote_style": "necessary",
+        }
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.FEATHER.value,
+        ContentType.JSON.value,
+        ContentType.AVRO.value,
+        ContentType.ORC.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type: {content_type}")
 
 
 def write_csv(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
-    **
+    **kwargs,
 ) -> None:
-
-
+    """
+    Write a polars DataFrame to a CSV file (or other delimited text format).
+    """
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses
+                table.write_csv(f, **kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_csv(f, **kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
-
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    table.write_csv(out, **kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_csv(f, **kwargs)
 
 
 def write_avro(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
     **write_kwargs,
 ) -> None:
-    if not filesystem:
-
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
             table.write_avro(f, **write_kwargs)
@@ -62,25 +160,75 @@ def write_parquet(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
     **write_kwargs,
 ) -> None:
-    if not filesystem:
-
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
             table.write_parquet(f, **write_kwargs)
 
 
+def write_feather(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
+) -> None:
+    """
+    Write a polars DataFrame to a Feather file.
+    """
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            table.write_ipc(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ipc(f, **kwargs)
+
+
+def write_orc(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    """
+    Write a polars DataFrame to an ORC file by delegating to PyArrow implementation.
+    """
+    from deltacat.utils.pyarrow import write_orc as pyarrow_write_orc
+
+    # Convert polars DataFrame to PyArrow Table
+    pa_table = table.to_arrow()
+
+    # Delegate to PyArrow write_orc implementation
+    pyarrow_write_orc(
+        pa_table,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **write_kwargs,
+    )
+
+
 CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
-
-
-    # representations to pyarrow.csv.WriteOptions.
-    ContentType.AVRO.value: write_avro,
+    ContentType.UNESCAPED_TSV.value: write_csv,
+    ContentType.TSV.value: write_csv,
     ContentType.CSV.value: write_csv,
+    ContentType.PSV.value: write_csv,
     ContentType.PARQUET.value: write_parquet,
+    ContentType.FEATHER.value: write_feather,
     ContentType.JSON.value: write_json,
+    ContentType.AVRO.value: write_avro,
+    ContentType.ORC.value: write_orc,
 }
 
 
@@ -108,21 +256,504 @@ def dataframe_size(table: pl.DataFrame) -> int:
 def dataframe_to_file(
     table: pl.DataFrame,
     base_path: str,
-
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ) -> None:
     """
-    Writes the given
+    Writes the given Polars DataFrame to a file.
     """
     writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    writer_kwargs = content_type_to_writer_kwargs(content_type)
+    writer_kwargs.update(kwargs)
     if not writer:
         raise NotImplementedError(
-            f"
+            f"Polars writer for content type '{content_type}' not "
             f"implemented. Known content types: "
-            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys}"
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
         )
     path = block_path_provider(base_path)
-    logger.debug(f"Writing table: {table} with kwargs: {
-    writer(table, path, filesystem=
+    logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
+    writer(table, path, filesystem=filesystem, **writer_kwargs)
+
+
+def write_table(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Write a polars DataFrame to a file in the specified format.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    writer_kwargs = content_type_to_writer_kwargs(content_type)
+    writer_kwargs.update(kwargs)
+    if not writer:
+        raise NotImplementedError(
+            f"Polars writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
+        )
+    writer(
+        table,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **writer_kwargs,
+    )
+
+
+CONTENT_TYPE_TO_PL_READ_FUNC: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: pl.read_csv,
+    ContentType.TSV.value: pl.read_csv,
+    ContentType.CSV.value: pl.read_csv,
+    ContentType.PSV.value: pl.read_csv,
+    ContentType.PARQUET.value: pl.read_parquet,
+    ContentType.FEATHER.value: pl.read_ipc,
+    ContentType.JSON.value: pl.read_ndjson,
+    ContentType.AVRO.value: pl.read_avro,
+}
+
+
+class ReadKwargsProviderPolarsStringTypes(ContentTypeKwargsProvider):
+    """ReadKwargsProvider impl that reads columns of delimited text files
+    as UTF-8 strings (i.e. disables type inference). Useful for ensuring
+    lossless reads of UTF-8 delimited text datasets and improving read
+    performance in cases where type casting is not required."""
+
+    def __init__(self, include_columns: Optional[Iterable[str]] = None):
+        self.include_columns = include_columns
+
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        if content_type in DELIMITED_TEXT_CONTENT_TYPES:
+            include_columns = (
+                self.include_columns if self.include_columns else kwargs.get("columns")
+            )
+            if not include_columns:
+                # read all columns as strings - disable schema inference
+                kwargs["infer_schema"] = False
+            else:
+                # read only the included columns as strings
+                kwargs["schema_overrides"] = {
+                    column_name: pl.Utf8 for column_name in include_columns
+                }
+        return kwargs
+
+
+def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "separator": "\t",
+            "has_header": False,
+            "null_values": [""],
+            "quote_char": None,
+        }
+    if content_type == ContentType.TSV.value:
+        return {"separator": "\t", "has_header": False}
+    if content_type == ContentType.CSV.value:
+        return {"separator": ",", "has_header": False}
+    if content_type == ContentType.PSV.value:
+        return {"separator": "|", "has_header": False}
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.FEATHER.value,
+        ContentType.ORC.value,
+        ContentType.JSON.value,
+        ContentType.AVRO.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type: {content_type}")
+
+
+def _add_column_kwargs(
+    content_type: str,
+    column_names: Optional[List[str]],
+    include_columns: Optional[List[str]],
+    kwargs: Dict[str, Any],
+):
+    if content_type in DELIMITED_TEXT_CONTENT_TYPES:
+        if column_names:
+            kwargs["new_columns"] = column_names
+        if include_columns:
+            kwargs["columns"] = include_columns
+    else:
+        if content_type in TABULAR_CONTENT_TYPES:
+            if include_columns:
+                kwargs["columns"] = include_columns
+        else:
+            if include_columns:
+                logger.warning(
+                    f"Ignoring request to include columns {include_columns} "
+                    f"for non-tabular content type {content_type}"
+                )
+
+
+def concat_dataframes(dataframes: List[pl.DataFrame]) -> Optional[pl.DataFrame]:
+    if dataframes is None or not len(dataframes):
+        return None
+    if len(dataframes) == 1:
+        return next(iter(dataframes))
+    return pl.concat(dataframes)
+
+
+def append_column_to_table(
+    table: pl.DataFrame,
+    column_name: str,
+    column_value: Any,
+) -> pl.DataFrame:
+    return table.with_columns(pl.lit(column_value).alias(column_name))
+
+
+def select_columns(
+    table: pl.DataFrame,
+    column_names: List[str],
+) -> pl.DataFrame:
+    return table.select(column_names)
+
+
+def file_to_dataframe(
+    path: str,
+    content_type: str,
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pl_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    **kwargs,
+) -> pl.DataFrame:
+    """
+    Read a file into a Polars DataFrame using any filesystem.
+
+    Args:
+        path: The file path to read
+        content_type: The content type of the file (e.g., ContentType.CSV.value)
+        content_encoding: The content encoding (default: IDENTITY)
+        filesystem: The filesystem to use (if None, will be inferred from path)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        pl_read_func_kwargs_provider: Optional kwargs provider for customization
+        fs_open_kwargs: Optional kwargs for filesystem open operations
+        **kwargs: Additional kwargs passed to the reader function
+
+    Returns:
+        pl.DataFrame: The loaded DataFrame
+    """
+    logger.debug(
+        f"Reading {path} to Polars. Content type: {content_type}. "
+        f"Encoding: {content_encoding}"
+    )
+
+    pl_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not pl_read_func:
+        raise NotImplementedError(
+            f"Polars reader for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
+        )
+
+    reader_kwargs = content_type_to_reader_kwargs(content_type)
+    _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
+
+    # Merge with provided kwargs
+    reader_kwargs.update(kwargs)
+
+    if pl_read_func_kwargs_provider:
+        reader_kwargs = pl_read_func_kwargs_provider(content_type, reader_kwargs)
+
+    logger.debug(f"Reading {path} via {pl_read_func} with kwargs: {reader_kwargs}")
+
+    dataframe, latency = timed_invocation(
+        pl_read_func,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        content_encoding=content_encoding,
+        **reader_kwargs,
+    )
+    logger.debug(f"Time to read {path} into Polars DataFrame: {latency}s")
+    return dataframe
+
+
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        if content_encoding == ContentEncoding.IDENTITY.value:
+            with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                return pl.read_csv(f, **read_kwargs)
+        else:
+            # For compressed files with PyArrow, we need to be careful because PyArrow
+            # may auto-decompress some formats. Try to read directly first.
+            try:
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                    # Try reading as if it's already decompressed by PyArrow
+                    return pl.read_csv(f, **read_kwargs)
+            except Exception:
+                # If that fails, try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f) as input_file:
+                        content = input_file.read()
+                        if isinstance(content, str):
+                            content = content.encode("utf-8")
+                        return pl.read_csv(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_csv(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    if isinstance(content, str):
+                        content = content.encode("utf-8")
+                    return pl.read_csv(content, **read_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_parquet(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_parquet(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_parquet(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_parquet(content, **read_kwargs)
+
+
+def read_ipc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_ipc(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_ipc(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_ipc(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_ipc(content, **read_kwargs)
+
+
+def read_ndjson(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        if content_encoding == ContentEncoding.IDENTITY.value:
+            with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                return pl.read_ndjson(f, **read_kwargs)
+        else:
+            # For compressed files with PyArrow, we need to be careful because PyArrow
+            # may auto-decompress some formats. Try to read directly first.
+            try:
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                    # Try reading as if it's already decompressed by PyArrow
+                    return pl.read_ndjson(f, **read_kwargs)
+            except Exception:
+                # If that fails, try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f) as input_file:
+                        content = input_file.read()
+                        if isinstance(content, str):
+                            content = content.encode("utf-8")
+                        return pl.read_ndjson(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_ndjson(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    if isinstance(content, str):
+                        content = content.encode("utf-8")
+                    return pl.read_ndjson(content, **read_kwargs)
+
+
+def read_avro(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_avro(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_avro(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_avro(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_avro(content, **read_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    """
+    Read an ORC file using pandas and convert to polars since polars doesn't have native ORC support.
+    """
+    import pandas as pd
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                pd_df = pd.read_orc(f, **read_kwargs)
+                return pl.from_pandas(pd_df)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content and pass to pandas
+                    content = input_file.read()
+                    import io
+
+                    pd_df = pd.read_orc(io.BytesIO(content), **read_kwargs)
+                    return pl.from_pandas(pd_df)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                pd_df = pd.read_orc(f, **read_kwargs)
+                return pl.from_pandas(pd_df)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content and pass to pandas
+                    content = input_file.read()
+                    import io
+
+                    pd_df = pd.read_orc(io.BytesIO(content), **read_kwargs)
+                    return pl.from_pandas(pd_df)
+
+
+# New mapping for encoding-aware reader functions used by file_to_dataframe
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.FEATHER.value: read_ipc,
+    ContentType.JSON.value: read_ndjson,
+    ContentType.AVRO.value: read_avro,
+    ContentType.ORC.value: read_orc,
+}
```