deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/utils/pandas.py
CHANGED
@@ -1,27 +1,305 @@
 import csv
-import io
 import logging
 import math
+import bz2
+import gzip
+from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pandas as pd
 import pyarrow as pa
+import pyarrow.fs as pafs
 from fsspec import AbstractFileSystem
 from ray.data.datasource import FilenameProvider
 
 from deltacat import logs
 from deltacat.types.media import (
     DELIMITED_TEXT_CONTENT_TYPES,
-    EXPLICIT_COMPRESSION_CONTENT_TYPES,
     TABULAR_CONTENT_TYPES,
     ContentEncoding,
     ContentType,
 )
 from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+from deltacat.types.partial_download import PartialFileDownloadParams
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+# Encoding to file initialization function mapping
+ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
+    ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
+    ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
+}
+
+
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_csv(f, **read_kwargs)
+                except (
+                    gzip.BadGzipFile,
+                    OSError,
+                    UnicodeDecodeError,
+                    pd.errors.EmptyDataError,
+                    Exception,
+                ):
+                    # If that fails, we need to reopen the file since the stream may be closed/corrupted
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_csv(input_file, **read_kwargs)
+            else:
+                return pd.read_csv(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_csv(input_file, **read_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_parquet(f, **read_kwargs)
+                except (gzip.BadGzipFile, OSError, pa.ArrowInvalid, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_parquet(input_file, **read_kwargs)
+            else:
+                return pd.read_parquet(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_parquet(input_file, **read_kwargs)
+
+
+def read_feather(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_feather(f, **read_kwargs)
+                except (gzip.BadGzipFile, OSError, pa.ArrowInvalid, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_feather(input_file, **read_kwargs)
+            else:
+                return pd.read_feather(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_feather(input_file, **read_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_orc(f, **read_kwargs)
+                except (gzip.BadGzipFile, OSError, pa.ArrowInvalid, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_orc(input_file, **read_kwargs)
+            else:
+                return pd.read_orc(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_orc(input_file, **read_kwargs)
+
+
+def read_json(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_json(f, **read_kwargs)
+                except (
+                    gzip.BadGzipFile,
+                    OSError,
+                    UnicodeDecodeError,
+                    ValueError,
+                    Exception,
+                ):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_json(input_file, **read_kwargs)
+            else:
+                return pd.read_json(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_json(input_file, **read_kwargs)
+
+
+def read_avro(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    """
+    Read an Avro file using polars and convert to pandas.
+    """
+    import polars as pl
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    pl_df = pl.read_avro(f, **read_kwargs)
+                    return pl_df.to_pandas()
+                except (gzip.BadGzipFile, OSError, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        pl_df = pl.read_avro(input_file, **read_kwargs)
+                        return pl_df.to_pandas()
+            else:
+                pl_df = pl.read_avro(f, **read_kwargs)
+                return pl_df.to_pandas()
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                pl_df = pl.read_avro(input_file, **read_kwargs)
+                return pl_df.to_pandas()
+
 
 CONTENT_TYPE_TO_PD_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pd.read_csv,
@@ -32,6 +310,21 @@ CONTENT_TYPE_TO_PD_READ_FUNC: Dict[str, Callable] = {
     ContentType.FEATHER.value: pd.read_feather,
     ContentType.ORC.value: pd.read_orc,
     ContentType.JSON.value: pd.read_json,
+    ContentType.AVRO.value: read_avro,
+}
+
+
+# New mapping for encoding-aware reader functions used by file_to_dataframe
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.FEATHER.value: read_feather,
+    ContentType.ORC.value: read_orc,
+    ContentType.JSON.value: read_json,
+    ContentType.AVRO.value: read_avro,
 }
 
 
@@ -67,6 +360,7 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
             "header": None,
             "na_values": [""],
             "keep_default_na": False,
+            "quoting": csv.QUOTE_NONE,
         }
     if content_type == ContentType.TSV.value:
         return {"sep": "\t", "header": None}
@@ -74,11 +368,13 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
         return {"sep": ",", "header": None}
     if content_type == ContentType.PSV.value:
         return {"sep": "|", "header": None}
+    if content_type == ContentType.JSON.value:
+        return {"lines": True}  # Support NDJSON format
     if content_type in {
         ContentType.PARQUET.value,
         ContentType.FEATHER.value,
         ContentType.ORC.value,
-        ContentType.
+        ContentType.AVRO.value,
     }:
         return {}
     raise ValueError(f"Unsupported content type: {content_type}")
@@ -92,7 +388,8 @@ ENCODING_TO_PD_COMPRESSION: Dict[str, str] = {
 
 
 def slice_dataframe(
-    dataframe: pd.DataFrame,
+    dataframe: pd.DataFrame,
+    max_len: Optional[int],
 ) -> List[pd.DataFrame]:
     """
     Iteratively create dataframe slices.
@@ -114,6 +411,22 @@ def concat_dataframes(dataframes: List[pd.DataFrame]) -> Optional[pd.DataFrame]:
     return pd.concat(dataframes, axis=0, copy=False)
 
 
+def append_column_to_dataframe(
+    dataframe: pd.DataFrame,
+    column_name: str,
+    column_value: Any,
+) -> pd.DataFrame:
+    dataframe[column_name] = column_value
+    return dataframe
+
+
+def select_columns(
+    dataframe: pd.DataFrame,
+    column_names: List[str],
+) -> pd.DataFrame:
+    return dataframe[column_names]
+
+
 def _add_column_kwargs(
     content_type: str,
     column_names: Optional[List[str]],
@@ -135,38 +448,68 @@ def _add_column_kwargs(
     )
 
 
-def
-
+def file_to_dataframe(
+    path: str,
     content_type: str,
-    content_encoding: str,
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    **kwargs,
 ) -> pd.DataFrame:
-
-
-
+    """
+    Read a file into a Pandas DataFrame using any filesystem.
+
+    Args:
+        path: The file path to read
+        content_type: The content type of the file (e.g., ContentType.CSV.value)
+        content_encoding: The content encoding (default: IDENTITY)
+        filesystem: The filesystem to use (if None, will be inferred from path)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        pd_read_func_kwargs_provider: Optional kwargs provider for customization
+        fs_open_kwargs: Optional kwargs for filesystem open operations
+        **kwargs: Additional kwargs passed to the reader function
+
+    Returns:
+        pd.DataFrame: The loaded DataFrame
+    """
     logger.debug(
-        f"Reading {
+        f"Reading {path} to Pandas. Content type: {content_type}. "
         f"Encoding: {content_encoding}"
     )
-
-
-
-
-
-
-
-    if content_type in EXPLICIT_COMPRESSION_CONTENT_TYPES:
-        kwargs["compression"] = ENCODING_TO_PD_COMPRESSION.get(
-            content_encoding, "infer"
+
+    pd_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not pd_read_func:
+        raise NotImplementedError(
+            f"Pandas reader for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
         )
+
+    reader_kwargs = content_type_to_reader_kwargs(content_type)
+    _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
+
+    # Merge with provided kwargs
+    reader_kwargs.update(kwargs)
+
     if pd_read_func_kwargs_provider:
-
-
-
-
+        reader_kwargs = pd_read_func_kwargs_provider(content_type, reader_kwargs)
+
+    logger.debug(f"Reading {path} via {pd_read_func} with kwargs: {reader_kwargs}")
+
+    dataframe, latency = timed_invocation(
+        pd_read_func,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        content_encoding=content_encoding,
+        **reader_kwargs,
+    )
+    logger.debug(f"Time to read {path} into Pandas DataFrame: {latency}s")
     return dataframe
 
 
@@ -176,35 +519,210 @@ def dataframe_size(dataframe: pd.DataFrame) -> int:
 
 
 def write_csv(
-    dataframe: pd.DataFrame,
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-
-
-
-
+    # TODO (pdames): Add support for client-specified compression types.
+    if kwargs.get("header") is None:
+        kwargs["header"] = False
+
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses, no need for additional compression
+                dataframe.to_csv(f, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_csv(out, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_csv(out, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_csv(out, **kwargs)
+
+
+def _preprocess_dataframe_for_parquet(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """
+    Preprocess DataFrame to convert PyArrow types to native Python types for parquet compatibility.
+
+    This handles the case where from_pyarrow() creates pandas DataFrames with PyArrow array objects
+    that cannot be serialized by pandas.to_parquet().
+    """
+    # Check if any columns contain PyArrow arrays
+    needs_conversion = False
+    for col in dataframe.columns:
+        if dataframe[col].dtype == object:
+            # Check if the column contains PyArrow arrays
+            sample_val = dataframe[col].iloc[0] if len(dataframe) > 0 else None
+            if (
+                sample_val is not None
+                and hasattr(sample_val, "__class__")
+                and "pyarrow" in str(type(sample_val))
+            ):
+                needs_conversion = True
+                break
+
+    if not needs_conversion:
+        return dataframe
+
+    # Create a copy and convert PyArrow types
+    df_copy = dataframe.copy()
+
+    for col in df_copy.columns:
+        if df_copy[col].dtype == object and len(df_copy) > 0:
+            sample_val = df_copy[col].iloc[0]
+
+            # Convert PyArrow arrays to Python lists
+            if hasattr(sample_val, "__class__") and "pyarrow" in str(type(sample_val)):
+                try:
+                    if hasattr(sample_val, "to_pylist"):
+                        # PyArrow array - convert to Python list
+                        df_copy[col] = df_copy[col].apply(
+                            lambda x: x.to_pylist() if hasattr(x, "to_pylist") else x
+                        )
+                    elif hasattr(sample_val, "as_py"):
+                        # PyArrow scalar - convert to Python value
+                        df_copy[col] = df_copy[col].apply(
+                            lambda x: x.as_py() if hasattr(x, "as_py") else x
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Could not convert PyArrow column {col}: {e}. Keeping original values."
+                    )
+
+    return df_copy
 
 
 def write_parquet(
-    dataframe: pd.DataFrame,
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-
-
+    # Preprocess DataFrame to handle PyArrow types
+    processed_df = _preprocess_dataframe_for_parquet(dataframe)
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            processed_df.to_parquet(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            processed_df.to_parquet(f, **kwargs)
+
+
+def write_orc(
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            dataframe.to_orc(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            dataframe.to_orc(f, **kwargs)
 
 
 def write_feather(
-    dataframe: pd.DataFrame,
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-
-
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            dataframe.to_feather(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            dataframe.to_feather(f, **kwargs)
 
 
 def write_json(
-    dataframe: pd.DataFrame,
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
+) -> None:
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses, no need for additional compression
+                dataframe.to_json(f, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_json(out, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_json(out, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_json(out, **kwargs)
+
+
+def write_avro(
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-
-
-
-
+    """
+    Write a pandas DataFrame to an AVRO file by delegating to polars implementation.
+    """
+    import polars as pl
+    from deltacat.utils.polars import write_avro as polars_write_avro
+
+    # Convert pandas DataFrame to polars
+    include_index = kwargs.pop("index", False)
+    pl_df = pl.from_pandas(dataframe, include_index=include_index)
+
+    # Remove pandas-specific kwargs before passing to polars
+    polars_kwargs = {k: v for k, v in kwargs.items() if k not in ["index"]}
+
+    # Delegate to polars write_avro implementation
+    polars_write_avro(
+        pl_df,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **polars_kwargs,
+    )
 
 
 CONTENT_TYPE_TO_PD_WRITE_FUNC: Dict[str, Callable] = {
@@ -215,6 +733,8 @@ CONTENT_TYPE_TO_PD_WRITE_FUNC: Dict[str, Callable] = {
     ContentType.PARQUET.value: write_parquet,
     ContentType.FEATHER.value: write_feather,
     ContentType.JSON.value: write_json,
+    ContentType.AVRO.value: write_avro,
+    ContentType.ORC.value: write_orc,
 }
 
 
@@ -224,7 +744,7 @@ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
             "sep": "\t",
             "header": False,
            "na_rep": [""],
-            "line_terminator": "\n",
+            "lineterminator": "\n",
             "quoting": csv.QUOTE_NONE,
             "index": False,
         }
@@ -232,28 +752,36 @@ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
         return {
             "sep": "\t",
             "header": False,
-            "line_terminator": "\n",
+            "lineterminator": "\n",
+            "quoting": csv.QUOTE_MINIMAL,
             "index": False,
         }
     if content_type == ContentType.CSV.value:
         return {
             "sep": ",",
             "header": False,
-            "line_terminator": "\n",
+            "index": False,
+            "lineterminator": "\n",
+            "quoting": csv.QUOTE_MINIMAL,
             "index": False,
         }
     if content_type == ContentType.PSV.value:
         return {
             "sep": "|",
             "header": False,
-            "line_terminator": "\n",
             "index": False,
+            "lineterminator": "\n",
+            "quoting": csv.QUOTE_MINIMAL,
         }
     if content_type == ContentType.PARQUET.value:
         return {"index": False}
     if content_type == ContentType.FEATHER.value:
         return {}
     if content_type == ContentType.JSON.value:
+        return {"index": False, "orient": "records", "lines": True}
+    if content_type == ContentType.AVRO.value:
+        return {"index": False}
+    if content_type == ContentType.ORC.value:
         return {"index": False}
     raise ValueError(f"Unsupported content type: {content_type}")
 
@@ -261,9 +789,10 @@ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
 def dataframe_to_file(
     dataframe: pd.DataFrame,
     base_path: str,
-
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ) -> None:
     """
@@ -279,4 +808,4 @@ def dataframe_to_file(
             f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}"
         )
     path = block_path_provider(base_path)
-    writer(dataframe, path, filesystem=
+    writer(dataframe, path, filesystem=filesystem, **writer_kwargs)