deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/pyarrow.py
CHANGED
@@ -1,27 +1,32 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
 import logging
 from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
-from
-from
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Tuple
+from datetime import datetime, date
+from decimal import Decimal
 
 import pyarrow as pa
 import numpy as np
 import pyarrow.compute as pc
+import pyarrow.fs as pafs
+from pyarrow.parquet import ParquetFile
+
 from fsspec import AbstractFileSystem
 from pyarrow import csv as pacsv
 from pyarrow import feather as paf
 from pyarrow import json as pajson
 from pyarrow import parquet as papq
+from pyarrow import orc as paorc
 from ray.data.datasource import FilenameProvider
-from deltacat.utils.s3fs import create_s3_file_system
 
 from deltacat import logs
+from deltacat.exceptions import ContentTypeValidationError
 from deltacat.types.media import (
     DELIMITED_TEXT_CONTENT_TYPES,
     TABULAR_CONTENT_TYPES,
@@ -34,18 +39,37 @@ from deltacat.types.partial_download import (
 )
 from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
-from deltacat.utils.daft import daft_s3_file_to_table
 from deltacat.utils.schema import coerce_pyarrow_table_to_schema
 from deltacat.utils.arguments import (
     sanitize_kwargs_to_callable,
     sanitize_kwargs_by_supported_kwargs,
 )
+from deltacat.utils.filesystem import resolve_path_and_filesystem
 from functools import lru_cache
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from deltacat.storage.model.manifest import Manifest
+    from deltacat.storage.model.delta import Delta
+
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
+OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
+
+"""
+By default, round decimal values using half_to_even round mode when
+rescaling a decimal to the given scale and precision in the schema would cause
+data loss. Setting any non null value of this argument will result
+in an error instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -64,90 +88,644 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
     return target_schema
 
 
-def
-
-
-
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+    elif (
+        kwargs.get("read_options") is not None
+        and kwargs["read_options"].column_names
+    ):
+        schema = _filter_schema_for_columns(
+            schema, kwargs["read_options"].column_names
+        )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
         )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accomodate fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
                 ):
-
-
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
                     )
-
             else:
                 logger.debug(
-                    "Schema
-                    "
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
                 )
 
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
         raise e
 
 
-
+# TODO(pdames): Remove deprecated S3-only readers.
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pa.Table:
+    # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+    from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+    pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+    # TODO(pdames): Merge in decimal256 support from pure S3 path reader.
+
+    # Check if compression is already indicated by file path
+    should_decompress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle decompression - avoid double decompression for PyArrow filesystem
+            if should_decompress:
+                # PyArrow filesystem already handles .gz decompression automatically
+                return pacsv.read_csv(f, **pyarrow_kwargs)
+            else:
+                # Apply explicit decompression if needed
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    return pacsv.read_csv(input_file, **pyarrow_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle decompression - apply explicit decompression for fsspec
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pacsv.read_csv(input_file, **pyarrow_kwargs)
+
+
+def read_feather(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pa.Table:
+    # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+    from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+    pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return paf.read_table(input_file, **pyarrow_kwargs)
+    else:
+        # fsspec AbstractFileSystem - Feather requires seekable files
+        # For local files, we can use the file path directly
+        if hasattr(filesystem, "protocol") and filesystem.protocol == "file":
+            if content_encoding != ContentEncoding.IDENTITY.value:
+                # For compressed files, decompress to a temporary file
+                import tempfile
+                import shutil
+
+                with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f) as input_file:
+                        # Create temporary file to hold decompressed data
+                        with tempfile.NamedTemporaryFile() as temp_file:
+                            shutil.copyfileobj(input_file, temp_file)
+                            temp_file.flush()
+                            return paf.read_table(temp_file.name, **read_kwargs)
+            else:
+                # No compression, can read directly from file path
+                return paf.read_table(path, **pyarrow_kwargs)
+        else:
+            # For non-local filesystems, always read from temporary file
+            import tempfile
+            import shutil
+
+            with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Create temporary file to hold data
+                    with tempfile.NamedTemporaryFile() as temp_file:
+                        shutil.copyfileobj(input_file, temp_file)
+                        temp_file.flush()
+                        return paf.read_table(temp_file.name, **read_kwargs)
+
+
+def read_json(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pa.Table:
+    # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+    from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+    pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+    # Check if decompression is already indicated by file path
+    should_decompress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle decompression - avoid double decompression for PyArrow filesystem
+            if should_decompress:
+                # PyArrow filesystem already handles .gz decompression automatically
+                return pajson.read_json(f, **pyarrow_kwargs)
+            else:
+                # Apply explicit decompression if needed
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    return pajson.read_json(input_file, **pyarrow_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle decompression - apply explicit decompression for fsspec
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pajson.read_json(input_file, **pyarrow_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pa.Table:
+    # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+    from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+    pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return paorc.read_table(input_file, **pyarrow_kwargs)
+    else:
+        # fsspec AbstractFileSystem - ORC requires seekable files, so handle compression differently
+        if content_encoding != ContentEncoding.IDENTITY.value:
+            # For compressed files with fsspec, we need to decompress to a temporary file
+            # since ORC requires seekable streams
+            import tempfile
+            import shutil
+
+            with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Create temporary file to hold decompressed data
+                    with tempfile.NamedTemporaryFile() as temp_file:
+                        shutil.copyfileobj(input_file, temp_file)
+                        temp_file.flush()
+                        return paorc.read_table(temp_file.name, **pyarrow_kwargs)
+        else:
+            # No compression, can read directly
+            with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                return paorc.read_table(f, **pyarrow_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pa.Table:
+    # Convert DeltaCAT Schema to PyArrow Schema if present
+    if "schema" in read_kwargs:
+        from deltacat.storage.model.schema import Schema as DeltaCATSchema
+
+        schema = read_kwargs["schema"]
+        if isinstance(schema, DeltaCATSchema):
+            read_kwargs["schema"] = schema.arrow
+
+    # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+    # Use local import to avoid circular dependency
+    from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+    pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return papq.read_table(input_file, **pyarrow_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return papq.read_table(input_file, **pyarrow_kwargs)
+
+
+def read_avro(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pa.Table:
+    # Filter out DeltaCAT-specific parameters that Polars doesn't understand
+    from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+    polars_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+    """
+    Read an Avro file using polars and convert to PyArrow.
+    """
+    import polars as pl
+
+    # If path is a file-like object, read directly
+    if hasattr(path, "read"):
+        pl_df = pl.read_avro(path, **polars_kwargs)
+        return pl_df.to_arrow()
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                pl_df = pl.read_avro(input_file, **polars_kwargs)
+                return pl_df.to_arrow()
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+        with input_file_init(f) as input_file:
+            pl_df = pl.read_avro(input_file, **polars_kwargs)
+            return pl_df.to_arrow()
+
+
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+    # Below ensures decimal256 is casted properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
+CONTENT_TYPE_TO_PA_S3_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
     ContentType.CSV.value: pyarrow_read_csv,
     ContentType.PSV.value: pyarrow_read_csv,
     ContentType.PARQUET.value: papq.read_table,
     ContentType.FEATHER.value: paf.read_table,
-    # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
-    # https://issues.apache.org/jira/browse/ARROW-7811
-    # ContentType.ORC.value: paorc.ContentType.ORCFile,
     ContentType.JSON.value: pajson.read_json,
+    ContentType.ORC.value: paorc.read_table,
+    ContentType.AVRO.value: read_avro,
+}
+
+
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.FEATHER.value: read_feather,
+    ContentType.JSON.value: read_json,
+    ContentType.ORC.value: read_orc,
+    ContentType.AVRO.value: read_avro,
 }
 
 
 def write_feather(
-    table: pa.Table,
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
 ) -> None:
-
-
-
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            paf.write_feather(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            paf.write_feather(table, f, **write_kwargs)
 
 
 def write_csv(
-    table: pa.Table,
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
 ) -> None:
+    if write_kwargs.get("write_options") is None:
+        # column names are kept in table metadata, so omit header
+        write_kwargs["write_options"] = pacsv.WriteOptions(include_header=False)
+
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses, no need for additional compression
+                pacsv.write_csv(table, f, **write_kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                pacsv.write_csv(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    pacsv.write_csv(table, out, **write_kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                pacsv.write_csv(table, f, **write_kwargs)
+
+
+def write_orc(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            paorc.write_table(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            paorc.write_table(table, f, **write_kwargs)
+
+
+def write_parquet(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            papq.write_table(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            papq.write_table(table, f, **write_kwargs)
+
 
-
-
-
-
-
-
-
+def write_json(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    """
+    Write a PyArrow Table to a JSON file by delegating to polars implementation.
+    """
+    import polars as pl
+    from deltacat.utils.polars import write_json as polars_write_json
+
+    # Convert PyArrow Table to polars DataFrame
+    pl_df = pl.from_arrow(table)
+
+    # Delegate to polars write_json implementation with GZIP compression
+    polars_write_json(
+        pl_df,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **write_kwargs,
+    )
+
+
+def write_avro(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    """
+    Write a PyArrow Table to an AVRO file by delegating to polars implementation.
+    """
+    import polars as pl
+    from deltacat.utils.polars import write_avro as polars_write_avro
+
+    # Convert PyArrow Table to polars DataFrame
+    pl_df = pl.from_arrow(table)
+
+    # Delegate to polars write_avro implementation
+    polars_write_avro(
+        pl_df,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **write_kwargs,
+    )
 
 
 CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
-
-
-    # representations to pyarrow.csv.WriteOptions.
+    ContentType.UNESCAPED_TSV.value: write_csv,
+    ContentType.TSV.value: write_csv,
     ContentType.CSV.value: write_csv,
-    ContentType.
+    ContentType.PSV.value: write_csv,
+    ContentType.PARQUET.value: write_parquet,
     ContentType.FEATHER.value: write_feather,
+    ContentType.JSON.value: write_json,
+    ContentType.AVRO.value: write_avro,
+    ContentType.ORC.value: write_orc,
 }
 
 
+def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
+    """
+    Returns writer kwargs for the given content type when writing with pyarrow.
+    """
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "write_options": pacsv.WriteOptions(
+                delimiter="\t",
+                include_header=False,
+                quoting_style="none",
+            )
+        }
+    if content_type == ContentType.TSV.value:
+        return {
+            "write_options": pacsv.WriteOptions(
+                include_header=False,
+                delimiter="\t",
+                quoting_style="needed",
+            )
+        }
+    if content_type == ContentType.CSV.value:
+        return {
+            "write_options": pacsv.WriteOptions(
+                include_header=False,
+                delimiter=",",
+                quoting_style="needed",
+            )
+        }
+    if content_type == ContentType.PSV.value:
+        return {
+            "write_options": pacsv.WriteOptions(
+                include_header=False,
+                delimiter="|",
+                quoting_style="needed",
+            )
+        }
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.FEATHER.value,
+        ContentType.JSON.value,
+        ContentType.AVRO.value,
+        ContentType.ORC.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type: {content_type}")
+
+
 def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
     if content_type == ContentType.UNESCAPED_TSV.value:
         return {
@@ -167,12 +745,10 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
         ContentType.PARQUET.value,
         ContentType.FEATHER.value,
         ContentType.JSON.value,
+        ContentType.ORC.value,
+        ContentType.AVRO.value,
     }:
         return {}
-    # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
-    # https://issues.apache.org/jira/browse/ARROW-7811
-    # if DataTypes.ContentType.ORC:
-    # return {},
     raise ValueError(f"Unsupported content type: {content_type}")
 
 
@@ -180,11 +756,14 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
 ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
     ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
     ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
-    ContentEncoding.IDENTITY.value: lambda
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
 }
 
 
-def slice_table(
+def slice_table(
+    table: pa.Table,
+    max_len: Optional[int],
+) -> List[pa.Table]:
     """
     Iteratively create 0-copy table slices.
     """
@@ -201,6 +780,23 @@ def slice_table(table: pa.Table, max_len: Optional[int]) -> List[pa.Table]:
     return tables
 
 
+def append_column_to_table(
+    table: pa.Table,
+    column_name: str,
+    column_value: Any,
+) -> pa.Table:
+    num_rows = table.num_rows
+    column_array = pa.array([column_value] * num_rows)
+    return table.append_column(column_name, column_array)
+
+
+def select_columns(
+    table: pa.Table,
+    column_names: List[str],
+) -> pa.Table:
+    return table.select(column_names)
+
+
 class ReadKwargsProviderPyArrowCsvPureUtf8(ContentTypeKwargsProvider):
     """ReadKwargsProvider impl that reads columns of delimited text files
     as UTF-8 strings (i.e. disables type inference). Useful for ensuring
@@ -322,15 +918,16 @@ def _add_column_kwargs(
     )
 
 
-def
-
+def partial_parquet_file_to_table(
+    path: str,
     content_type: str,
     content_encoding: str,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
     partial_file_download_params: Optional[PartialParquetParameters] = None,
-    **
+    **kwargs,
 ) -> pa.Table:
 
     assert (
@@ -340,13 +937,18 @@ def s3_partial_parquet_file_to_table(
         partial_file_download_params.row_groups_to_download is not None
     ), "No row groups to download"
 
-
-
+    # Resolve filesystem and path
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+
+    pq_file = file_to_parquet(
+        path=path,
         content_type=content_type,
         content_encoding=content_encoding,
+        filesystem=filesystem,
         partial_file_download_params=partial_file_download_params,
         pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
-        **
+        **kwargs,
     )
 
     table, latency = timed_invocation(
@@ -355,7 +957,7 @@ def s3_partial_parquet_file_to_table(
         columns=include_columns or column_names,
     )
 
-    logger.debug(f"Successfully read from
+    logger.debug(f"Successfully read from path={path} in {latency}s")
 
     kwargs = {}
 
@@ -389,128 +991,6 @@ def s3_partial_parquet_file_to_table(
     return table


-def s3_file_to_table(
-    s3_url: str,
-    content_type: str,
-    content_encoding: str,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-    **s3_client_kwargs,
-) -> pa.Table:
-
-    logger.debug(
-        f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
-        f"Encoding: {content_encoding}"
-    )
-
-    kwargs = content_type_to_reader_kwargs(content_type)
-    _add_column_kwargs(content_type, column_names, include_columns, kwargs)
-
-    if pa_read_func_kwargs_provider is not None:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
-    if (
-        content_type == ContentType.PARQUET.value
-        and content_encoding == ContentEncoding.IDENTITY.value
-    ):
-        logger.debug(
-            f"Performing read using parquet reader for encoding={content_encoding} "
-            f"and content_type={content_type}"
-        )
-
-        parquet_reader_func = None
-        if kwargs.get(READER_TYPE_KWARG, "daft") == "daft":
-            parquet_reader_func = daft_s3_file_to_table
-        elif partial_file_download_params and isinstance(
-            partial_file_download_params, PartialParquetParameters
-        ):
-            parquet_reader_func = s3_partial_parquet_file_to_table
-
-        if parquet_reader_func is not None:
-            return parquet_reader_func(
-                s3_url=s3_url,
-                content_type=content_type,
-                content_encoding=content_encoding,
-                column_names=column_names,
-                include_columns=include_columns,
-                pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
-                partial_file_download_params=partial_file_download_params,
-                **s3_client_kwargs,
-            )
-
-    if READER_TYPE_KWARG in kwargs:
-        kwargs.pop(READER_TYPE_KWARG)
-
-    filesystem = io
-    if s3_url.startswith("s3://"):
-        filesystem = create_s3_file_system(s3_client_kwargs)
-
-    logger.debug(f"Read S3 object from {s3_url} using filesystem: {filesystem}")
-    input_file_init = ENCODING_TO_FILE_INIT[content_encoding]
-    pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
-
-    with filesystem.open(s3_url, "rb") as s3_file, input_file_init(
-        s3_file
-    ) as input_file:
-        args = [input_file]
-        logger.debug(f"Reading {s3_url} via {pa_read_func} with kwargs: {kwargs}")
-        table, latency = timed_invocation(pa_read_func, *args, **kwargs)
-        logger.debug(f"Time to read {s3_url} into PyArrow table: {latency}s")
-        return table
-
-
-def s3_file_to_parquet(
-    s3_url: str,
-    content_type: str,
-    content_encoding: str,
-    column_names: Optional[List[str]] = None,
-    include_columns: Optional[List[str]] = None,
-    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-    **s3_client_kwargs,
-) -> ParquetFile:
-    logger.debug(
-        f"Reading {s3_url} to PyArrow ParquetFile. "
-        f"Content type: {content_type}. Encoding: {content_encoding}"
-    )
-
-    if (
-        content_type != ContentType.PARQUET.value
-        or content_encoding != ContentEncoding.IDENTITY
-    ):
-        raise ContentTypeValidationError(
-            f"S3 file with content type: {content_type} and content encoding: {content_encoding} "
-            "cannot be read into pyarrow.parquet.ParquetFile"
-        )
-
-    if s3_client_kwargs is None:
-        s3_client_kwargs = {}
-
-    kwargs = {}
-
-    if s3_url.startswith("s3://"):
-        s3_file_system = create_s3_file_system(s3_client_kwargs)
-        kwargs["filesystem"] = s3_file_system
-
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
-    logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
-
-    kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
-
-    logger.debug(
-        f"Reading the file from {s3_url} into ParquetFile with kwargs: {kwargs}"
-    )
-    pqFile, latency = timed_invocation(ParquetFile, s3_url, **kwargs)
-
-    logger.debug(f"Time to get {s3_url} into parquet file: {latency}s")
-
-    return pqFile
-
-
 def table_size(table: pa.Table) -> int:
     return table.nbytes

@@ -522,13 +1002,23 @@ def parquet_file_size(table: papq.ParquetFile) -> int:
 def table_to_file(
     table: pa.Table,
     base_path: str,
-
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ) -> None:
     """
     Writes the given Pyarrow Table to a file.
+
+    Args:
+        table: The PyArrow table to write
+        base_path: Base path to write to
+        file_system: Optional filesystem to use
+        block_path_provider: Provider for block path generation
+        content_type: Content type for the output file
+        schema: Optional schema (for compatibility with explicit schema parameter pattern)
+        kwargs: Keyword arguments passed to the PyArrow write function
     """
     writer = CONTENT_TYPE_TO_PA_WRITE_FUNC.get(content_type)
     if not writer:
@@ -538,8 +1028,10 @@ def table_to_file(
             f"{CONTENT_TYPE_TO_PA_WRITE_FUNC.keys}"
         )
     path = block_path_provider(base_path)
-
-
+    writer_kwargs = content_type_to_writer_kwargs(content_type)
+    writer_kwargs.update(kwargs)
+    logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
+    writer(table, path, filesystem=filesystem, **writer_kwargs)


 class RecordBatchTables:
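
A hedged usage sketch of the updated table_to_file signature; the import locations, the local base path, and the lambda standing in for a FilenameProvider are illustrative assumptions, not taken from this diff.

    import pyarrow as pa
    import pyarrow.fs as pafs

    from deltacat.types.media import ContentType     # assumed import location
    from deltacat.utils.pyarrow import table_to_file  # assumed module path for this file

    table = pa.table({"id": [1, 2, 3]})
    table_to_file(
        table,
        base_path="/tmp/deltacat-example",
        filesystem=pafs.LocalFileSystem(),
        block_path_provider=lambda base: f"{base}/part-0.parquet",  # stand-in for a FilenameProvider
        content_type=ContentType.PARQUET.value,
    )
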
@@ -783,7 +1275,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -795,7 +1286,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()

     if max_str_len is not None:
-        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
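
As a rough worked example of the chunk sizing above (assuming MAX_INT_BYTES keeps the old 2147483646 value and that _int_max_string_len() reports a maximum of 20 characters for 64-bit integers): max_elems_per_chunk = 2147483646 // (2 * 20) = 53,687,091 elements per slice, which keeps each slice's casted string data comfortably below the 2 GiB offset limit of a non-large pa.string() array.
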
@@ -810,3 +1301,693 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         array = pa.chunked_array(all_chunks, type=dtype)

     return pc.cast(array, pa.string())
+
+
+def file_to_table(
+    path: str,
+    content_type: str,
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    **kwargs,
+) -> pa.Table:
+    """
+    Read a file into a PyArrow Table using any filesystem.
+
+    Args:
+        path: The file path to read
+        content_type: The content type of the file (e.g., ContentType.CSV.value)
+        content_encoding: The content encoding (default: IDENTITY)
+        filesystem: The filesystem to use (if None, will be inferred from path)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        pa_read_func_kwargs_provider: Optional kwargs provider for customization
+        fs_open_kwargs: Optional kwargs for filesystem open operations
+        **kwargs: Additional kwargs passed to the reader function
+
+    Returns:
+        pa.Table: The loaded PyArrow Table
+    """
+    logger.debug(
+        f"Reading {path} to PyArrow. Content type: {content_type}. "
+        f"Encoding: {content_encoding}"
+    )
+
+    if (
+        content_type == ContentType.PARQUET.value
+        and content_encoding == ContentEncoding.IDENTITY.value
+        and not filesystem
+        and path.startswith("s3://")
+    ):
+        # Use optimized partial parquet reader for s3 if possible
+        logger.debug(
+            f"Reading {path} using parquet reader for encoding={content_encoding} "
+            f"and content_type={content_type}"
+        )
+
+        parquet_reader_func = None
+        if kwargs.get(READER_TYPE_KWARG, "daft") == "daft":
+            from deltacat.utils.daft import daft_file_to_pyarrow_table
+
+            parquet_reader_func = daft_file_to_pyarrow_table
+        elif partial_file_download_params and isinstance(
+            partial_file_download_params, PartialParquetParameters
+        ):
+            parquet_reader_func = partial_parquet_file_to_table
+
+        if parquet_reader_func is not None:
+            return parquet_reader_func(
+                path=path,
+                content_type=content_type,
+                content_encoding=content_encoding,
+                filesystem=filesystem,
+                column_names=column_names,
+                include_columns=include_columns,
+                pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+                partial_file_download_params=partial_file_download_params,
+                **kwargs,
+            )
+
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
+
+    pa_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not pa_read_func:
+        raise NotImplementedError(
+            f"PyArrow reader for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
+        )
+
+    reader_kwargs = content_type_to_reader_kwargs(content_type)
+
+    _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
+
+    # Merge with provided kwargs
+    reader_kwargs.update(kwargs)
+
+    if pa_read_func_kwargs_provider:
+        reader_kwargs = pa_read_func_kwargs_provider(content_type, reader_kwargs)
+
+    logger.debug(f"Reading {path} via {pa_read_func} with kwargs: {reader_kwargs}")
+
+    table, latency = timed_invocation(
+        pa_read_func,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        content_encoding=content_encoding,
+        **reader_kwargs,
+    )
+    logger.debug(f"Time to read {path} into PyArrow Table: {latency}s")
+    return table
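
A hedged usage sketch of the new generic file_to_table reader; the import locations, local CSV path, and column names are assumptions (the docstring above names ContentType.CSV.value as a valid content type).

    from deltacat.types.media import ContentEncoding, ContentType  # assumed import location
    from deltacat.utils.pyarrow import file_to_table                # assumed module path

    table = file_to_table(
        path="/tmp/example.csv",
        content_type=ContentType.CSV.value,
        content_encoding=ContentEncoding.IDENTITY.value,
        include_columns=["id", "value"],  # hypothetical column names
    )
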
+
+
+def file_to_parquet(
+    path: str,
+    content_type: str = ContentType.PARQUET.value,
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    **kwargs,
+) -> ParquetFile:
+    """
+    Read a file into a PyArrow ParquetFile using any filesystem.
+
+    It returns a ParquetFile object which provides metadata access and lazy loading.
+
+    Args:
+        path: The file path to read
+        content_type: The content type (must be PARQUET, default: PARQUET)
+        content_encoding: The content encoding (must be IDENTITY, default: IDENTITY)
+        filesystem: The filesystem to use (if None, will be inferred from path)
+        column_names: Optional column names (unused for ParquetFile but kept for API consistency)
+        include_columns: Optional columns (unused for ParquetFile but kept for API consistency)
+        pa_read_func_kwargs_provider: Optional kwargs provider for customization
+        fs_open_kwargs: Optional kwargs for filesystem open operations
+        **kwargs: Additional kwargs passed to ParquetFile constructor
+
+    Returns:
+        ParquetFile: The ParquetFile object for lazy loading and metadata access
+
+    Raises:
+        ContentTypeValidationError: If content_type is not PARQUET or content_encoding is not IDENTITY
+    """
+    logger.debug(
+        f"Reading {path} to PyArrow ParquetFile. "
+        f"Content type: {content_type}. Encoding: {content_encoding}"
+    )
+    # Validate content type and encoding
+    if (
+        content_type != ContentType.PARQUET.value
+        or content_encoding != ContentEncoding.IDENTITY.value
+    ):
+        raise ContentTypeValidationError(
+            f"File with content type: {content_type} and content encoding: {content_encoding} "
+            "cannot be read into pyarrow.parquet.ParquetFile"
+        )
+
+    # Resolve filesystem and path
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+
+    # Build kwargs for ParquetFile constructor
+    parquet_kwargs = {}
+
+    # Add filesystem to kwargs if we have one
+    if filesystem:
+        parquet_kwargs["filesystem"] = filesystem
+
+    # Apply kwargs provider if provided
+    if pa_read_func_kwargs_provider:
+        parquet_kwargs = pa_read_func_kwargs_provider(content_type, parquet_kwargs)
+
+    # Merge with provided kwargs
+    parquet_kwargs.update(kwargs)
+
+    logger.debug(f"Pre-sanitize kwargs for {path}: {parquet_kwargs}")
+
+    # Sanitize kwargs to only include those supported by ParquetFile.__init__
+    parquet_kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, parquet_kwargs)
+
+    logger.debug(
+        f"Reading the file from {path} into ParquetFile with kwargs: {parquet_kwargs}"
+    )
+
+    def _create_parquet_file():
+        return ParquetFile(path, **parquet_kwargs)
+
+    pq_file, latency = timed_invocation(_create_parquet_file)
+
+    logger.debug(f"Time to get {path} into parquet file: {latency}s")
+
+    return pq_file
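
A hedged sketch of the lazy ParquetFile path; the local file path is hypothetical and the defaults cover content type and encoding.

    pq_file = file_to_parquet("/tmp/example.parquet")
    print(pq_file.metadata.num_rows, pq_file.metadata.num_row_groups)
    table = pq_file.read()  # materialize into a pa.Table only when needed
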
+
+
+def concat_tables(
+    tables: List[Union[pa.Table, papq.ParquetFile]]
+) -> Optional[Union[pa.Table, List[papq.ParquetFile]]]:
+    """
+    Concatenate a list of PyArrow Tables or ParquetFiles.
+
+    Args:
+        tables: List of PyArrow Tables or ParquetFiles to concatenate
+
+    Returns:
+        - Single table/ParquetFile if only one input
+        - List of ParquetFiles if all inputs are ParquetFiles (preserves lazy behavior)
+        - Concatenated PyArrow Table if mixed types or multiple PyArrow Tables
+        - None if input is empty
+    """
+    if tables is None or not len(tables):
+        return None
+    if len(tables) == 1:
+        # Return single table as-is to preserve lazy behavior
+        return next(iter(tables))
+
+    # Check if all tables are ParquetFiles - return list to preserve lazy behavior
+    if all(isinstance(table, papq.ParquetFile) for table in tables):
+        return list(tables)
+
+    # Convert all tables to PyArrow Tables for concatenation
+    converted_tables = []
+    for table in tables:
+        if isinstance(table, papq.ParquetFile):
+            converted_tables.append(table.read())
+        else:
+            converted_tables.append(table)
+
+    return pa.concat_tables(converted_tables)
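
A short sketch of concat_tables semantics on plain PyArrow inputs; when every input is a ParquetFile, the function instead returns the list unchanged to stay lazy.

    import pyarrow as pa

    t1 = pa.table({"id": [1, 2]})
    t2 = pa.table({"id": [3]})

    combined = concat_tables([t1, t2])  # -> pa.Table with 3 rows
    single = concat_tables([t1])        # -> t1 returned as-is
    empty = concat_tables([])           # -> None
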
+
+
+def delta_manifest_to_table(
+    manifest: "Manifest",
+    delta: Optional["Delta"] = None,
+) -> pa.Table:
+    """Create a flattened PyArrow table from a delta manifest.
+
+    This implementation can process ~1.4MM records/second on a
+    10-core 2025 Macbook Air M4 with 16GB of RAM.
+
+    Args:
+        manifest: The manifest to convert to a table
+        delta: Optional parent delta of the manifest
+
+    Returns:
+        PyArrow table with flattened manifest entry data
+    """
+    if not manifest.entries:
+        return pa.table({})
+
+    num_entries = len(manifest.entries)
+
+    # Get manifest-level data once
+    manifest_author = manifest.author
+    author_name = manifest_author.name if manifest_author else None
+    author_version = manifest_author.version if manifest_author else None
+
+    # Get delta-level data once
+    stream_position = delta.stream_position if delta else None
+    previous_stream_position = delta.previous_stream_position if delta else None
+
+    # Pre-allocate lists for core columns to avoid repeated list operations
+    url_values = [None] * num_entries
+    id_values = [None] * num_entries
+    mandatory_values = [None] * num_entries
+
+    # Meta columns - most common fields in manifest entries
+    meta_record_count = [None] * num_entries
+    meta_content_length = [None] * num_entries
+    meta_source_content_length = [None] * num_entries
+    meta_content_type = [None] * num_entries
+    meta_content_encoding = [None] * num_entries
+
+    # Track any additional meta fields we haven't seen before
+    additional_meta_fields = {}
+    additional_entry_fields = {}
+
+    # Single pass through entries with direct list assignment
+    for i, entry in enumerate(manifest.entries):
+        # Handle core entry fields efficiently
+        url_values[i] = entry.get("url") or entry.get("uri")
+        id_values[i] = entry.get("id")
+        mandatory_values[i] = entry.get("mandatory")
+
+        # Handle meta fields efficiently
+        meta = entry.get("meta", {})
+        meta_record_count[i] = meta.get("record_count")
+        meta_content_length[i] = meta.get("content_length")
+        meta_source_content_length[i] = meta.get("source_content_length")
+        meta_content_type[i] = meta.get("content_type")
+        meta_content_encoding[i] = meta.get("content_encoding")
+
+        # Handle any additional meta fields not in our core set
+        for meta_key, meta_value in meta.items():
+            if meta_key not in {
+                "record_count",
+                "content_length",
+                "source_content_length",
+                "content_type",
+                "content_encoding",
+                "entry_type",
+            }:
+                field_name = f"meta_{meta_key}"
+                if field_name not in additional_meta_fields:
+                    additional_meta_fields[field_name] = [None] * num_entries
+                additional_meta_fields[field_name][i] = meta_value
+
+        # Handle any additional entry fields not in our core set
+        for entry_key, entry_value in entry.items():
+            if entry_key not in {"url", "uri", "id", "mandatory", "meta"}:
+                if entry_key not in additional_entry_fields:
+                    additional_entry_fields[entry_key] = [None] * num_entries
+                additional_entry_fields[entry_key][i] = entry_value
+
+    # Build the arrays dict with core columns
+    arrays_dict = {
+        "id": pa.array(id_values),
+        "mandatory": pa.array(mandatory_values),
+        "meta_content_encoding": pa.array(meta_content_encoding),
+        "meta_content_length": pa.array(meta_content_length),
+        "meta_content_type": pa.array(meta_content_type),
+        "meta_record_count": pa.array(meta_record_count),
+        "meta_source_content_length": pa.array(meta_source_content_length),
+        "path": pa.array(url_values),
+    }
+
+    # Add additional fields if they exist
+    for field_name, field_values in additional_meta_fields.items():
+        arrays_dict[field_name] = pa.array(field_values)
+
+    for field_name, field_values in additional_entry_fields.items():
+        arrays_dict[field_name] = pa.array(field_values)
+
+    # Add manifest/delta columns only if they have data (avoid null columns)
+    if author_name is not None:
+        arrays_dict["author_name"] = pa.array([author_name] * num_entries)
+    if author_version is not None:
+        arrays_dict["author_version"] = pa.array([author_version] * num_entries)
+    if stream_position is not None:
+        arrays_dict["stream_position"] = pa.array([stream_position] * num_entries)
+    if previous_stream_position is not None:
+        arrays_dict["previous_stream_position"] = pa.array(
+            [previous_stream_position] * num_entries
+        )
+
+    return pa.table(arrays_dict)
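
An illustrative sketch of the flattened output; `manifest` and `delta` stand for real deltacat Manifest/Delta objects whose construction is outside this diff, so this is not runnable on its own.

    flattened = delta_manifest_to_table(manifest, delta)
    print(flattened.column_names)
    # Core columns per the implementation above: 'id', 'mandatory',
    # 'meta_content_encoding', 'meta_content_length', 'meta_content_type',
    # 'meta_record_count', 'meta_source_content_length', 'path', plus
    # author/stream-position columns only when those values are present.
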
+
+
+def get_base_arrow_type_name(arrow_type: pa.DataType) -> str:
+    """Get the base type name from a PyArrow DataType for compatibility lookup.
+
+    This function normalizes complex PyArrow types to their base type names for
+    use in reader compatibility validation. Only specific complex types are
+    normalized; all others return their string representation.
+
+    Args:
+        arrow_type: The PyArrow DataType to normalize
+
+    Returns:
+        str: The normalized type name for compatibility lookup
+
+    Examples:
+        >>> get_base_arrow_type_name(pa.int32())
+        'int32'
+        >>> get_base_arrow_type_name(pa.list_(pa.int32()))
+        'list'
+        >>> get_base_arrow_type_name(pa.timestamp('s', tz='UTC'))
+        'timestamp_tz'
+    """
+    # Only normalize specific complex types, otherwise return str(arrow_type)
+    if isinstance(arrow_type, pa.FixedShapeTensorType):
+        return "fixed_shape_tensor"
+    elif pa.types.is_large_list(arrow_type):
+        return "large_list"
+    elif pa.types.is_list_view(arrow_type):
+        return "list_view"
+    elif pa.types.is_large_list_view(arrow_type):
+        return "large_list_view"
+    elif pa.types.is_fixed_size_list(arrow_type):
+        return "fixed_size_list"
+    elif pa.types.is_list(arrow_type):
+        return "list"
+    elif pa.types.is_map(arrow_type):
+        return "map"
+    elif pa.types.is_struct(arrow_type):
+        return "struct"
+    elif pa.types.is_dictionary(arrow_type):
+        return "dictionary"
+    elif pa.types.is_decimal(arrow_type):
+        if isinstance(arrow_type, pa.Decimal128Type):
+            return "decimal128"
+        elif isinstance(arrow_type, pa.Decimal256Type):
+            return "decimal256"
+    elif pa.types.is_timestamp(arrow_type):
+        # Check if it has timezone info
+        if arrow_type.tz is not None:
+            return f"timestamp_tz[{arrow_type.unit}]"
+        else:
+            return str(arrow_type)
+    else:
+        # For all other types, return the string representation
+        return str(arrow_type)
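
A few spot checks mirroring the docstring examples above; note that the code returns the unit-qualified 'timestamp_tz[s]' for timezone-aware timestamps, while the docstring example shows the shorter 'timestamp_tz'.

    import pyarrow as pa

    assert get_base_arrow_type_name(pa.int32()) == "int32"
    assert get_base_arrow_type_name(pa.list_(pa.int32())) == "list"
    assert get_base_arrow_type_name(pa.map_(pa.string(), pa.int32())) == "map"
    assert get_base_arrow_type_name(pa.timestamp("s", tz="UTC")) == "timestamp_tz[s]"
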
+
+
+def get_supported_test_types() -> List[Tuple[str, str, List[Any]]]:
+    """Get comprehensive PyArrow types supported by DeltaCAT writers and readers.
+
+    This utility function returns example Arrow arrays for every Arrow type
+    supported by DeltaCAT writers and readers of tables with schemas. The data
+    is used for testing compatibility between different dataset types and
+    content types.
+
+    Returns:
+        List[Tuple[str, str, List[Any]]]: List of tuples containing:
+            - Test name (str): Human-readable name for the test case
+            - Arrow type code (str): Python code to create the PyArrow DataType
+            - Test data (List[Any]): Sample data values for testing
+
+    Examples:
+        >>> test_types = get_supported_test_types()
+        >>> for name, type_code, data in test_types[:2]:
+        ...     print(f"{name}: {type_code} -> {data}")
+        int8: pa.int8() -> [127, -128, 0]
+        int16: pa.int16() -> [32767, -32768, 1000]
+    """
+
+    return [
+        # Integer types
+        ("int8", "pa.int8()", [127, -128, 0]),
+        ("int16", "pa.int16()", [32767, -32768, 1000]),
+        ("int32", "pa.int32()", [2147483647, -2147483648, 1000]),
+        ("int64", "pa.int64()", [9223372036854775807, -9223372036854775808, 1000]),
+        ("uint8", "pa.uint8()", [255, 0, 128]),
+        ("uint16", "pa.uint16()", [65535, 0, 1000]),
+        ("uint32", "pa.uint32()", [4294967295, 0, 1000]),
+        ("uint64", "pa.uint64()", [18446744073709551615, 0, 1000]),
+        # Float types
+        ("float16", "pa.float16()", np.array([1.5, np.nan], dtype=np.float16)),
+        ("float32", "pa.float32()", [3.14159, -2.71828, 1.41421]),
+        ("float64", "pa.float64()", [1.123456789, -2.987654321, 3.141592653589793]),
+        # Boolean and null
+        ("bool_", "pa.bool_()", [True, False, True]),
+        ("null", "pa.null()", [None, None, None]),
+        # String types
+        ("string", "pa.string()", ["hello", "world", "test"]),
+        (
+            "large_string",
+            "pa.large_string()",
+            ["large hello", "large world", "large test"],
+        ),
+        # Binary types
+        ("binary", "pa.binary()", [b"hello", b"world", b"test"]),
+        (
+            "large_binary",
+            "pa.large_binary()",
+            [b"large hello", b"large world", b"large test"],
+        ),
+        # Date and time types
+        (
+            "date32",
+            "pa.date32()",
+            [date(2023, 1, 1), date(2023, 12, 31), date(2024, 6, 15)],
+        ),
+        (
+            "date64",
+            "pa.date64()",
+            [date(2023, 1, 1), date(2023, 12, 31), date(2024, 6, 15)],
+        ),
+        ("time32_s", "pa.time32('s')", [1754962113, 1754962114, 1754962115]),
+        ("time32_ms", "pa.time32('ms')", [1754962113, 1754962114, 1754962115]),
+        (
+            "time64_us",
+            "pa.time64('us')",
+            [1754962113000000, 1754962114000000, 1754962115000000],
+        ),
+        (
+            "time64_ns",
+            "pa.time64('ns')",
+            [1754962113000000000, 1754962114000000000, 1754962115000000000],
+        ),
+        (
+            "timestamp_s",
+            "pa.timestamp('s')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_ms",
+            "pa.timestamp('ms')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_us",
+            "pa.timestamp('us')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_ns",
+            "pa.timestamp('ns')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_s_utc",
+            "pa.timestamp('s', tz='UTC')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_ms_utc",
+            "pa.timestamp('ms', tz='UTC')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_us_utc",
+            "pa.timestamp('us', tz='UTC')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        (
+            "timestamp_ns_utc",
+            "pa.timestamp('ns', tz='UTC')",
+            [
+                datetime(2023, 1, 1, 12, 0, 0),
+                datetime(2023, 12, 31, 23, 59, 59),
+                datetime(2024, 6, 15, 10, 30, 45),
+            ],
+        ),
+        ("duration_s", "pa.duration('s')", [1754962113, 1754962114, 1754962115]),
+        (
+            "duration_ms",
+            "pa.duration('ms')",
+            [1754962113000, 1754962114000, 1754962115000],
+        ),
+        (
+            "duration_us",
+            "pa.duration('us')",
+            [1754962113000000, 1754962114000000, 1754962115000000],
+        ),
+        (
+            "duration_ns",
+            "pa.duration('ns')",
+            [1754962113000000000, 1754962114000000000, 1754962115000000000],
+        ),
+        (
+            "month_day_nano",
+            "pa.month_day_nano_interval()",
+            [
+                pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()),
+                pa.scalar((2, 15, -30), type=pa.month_day_nano_interval()),
+                pa.scalar((3, 15, -30), type=pa.month_day_nano_interval()),
+            ],
+        ),
+        # Decimal
+        (
+            "decimal128_5_2",
+            "pa.decimal128(5, 2)",
+            [Decimal("123.45"), Decimal("-67.89"), Decimal("999.99")],
+        ),
+        (
+            "decimal128_38_0",
+            "pa.decimal128(38, 0)",
+            [
+                Decimal("12345678901234567890123456789012345678"),
+                Decimal("-12345678901234567890123456789012345678"),
+                Decimal("0"),
+            ],
+        ),
+        (
+            "decimal128_1_0",
+            "pa.decimal128(1, 0)",
+            [Decimal("1"), Decimal("2"), Decimal("3")],
+        ),
+        (
+            "decimal128_38_10",
+            "pa.decimal128(38, 10)",
+            [
+                Decimal("1234567890123456789012345678.9012345678"),
+                Decimal("-1234567890123456789012345678.9012345678"),
+                Decimal("0.0000000000"),
+            ],
+        ),
+        (
+            "decimal256_76_0",
+            "pa.decimal256(76, 0)",
+            [
+                Decimal(
+                    "1234567890123456789012345678901234567812345678901234567890123456789012345678"
+                ),
+                Decimal("-0"),
+                Decimal("0"),
+            ],
+        ),
+        (
+            "decimal256_1_0",
+            "pa.decimal256(1, 0)",
+            [Decimal("1"), Decimal("2"), Decimal("3")],
+        ),
+        (
+            "decimal256_5_2",
+            "pa.decimal256(5, 2)",
+            [Decimal("123.45"), Decimal("-67.89"), Decimal("999.99")],
+        ),
+        (
+            "decimal256_76_38",
+            "pa.decimal256(76, 38)",
+            [
+                Decimal(
+                    "12345678901234567890123456789012345678.12345678901234567890123456789012345678"
+                ),
+                Decimal("-0.00000000000000000000000000000000000000"),
+                Decimal("0.00000000000000000000000000000000000000"),
+            ],
+        ),
+        # List types
+        ("list_int32", "pa.list_(pa.int32())", [[1, 2, 3], [4, 5], [6, 7, 8, 9]]),
+        ("list_string", "pa.list_(pa.string())", [["a", "b"], ["c", "d", "e"], ["f"]]),
+        # Struct type
+        (
+            "struct_simple",
+            "pa.struct([('name', pa.string()), ('age', pa.int32())])",
+            [
+                {"name": "Alice", "age": 30},
+                {"name": "Bob", "age": 25},
+                {"name": "Charlie", "age": 35},
+            ],
+        ),
+        (
+            "large_list_int32",
+            "pa.large_list(pa.int32())",
+            [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
+        ),
+        (
+            "fixed_size_list_int32",
+            "pa.list_(pa.int32(), 3)",
+            [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        ),
+        (
+            "list_view_int32",
+            "pa.list_view(pa.int32())",
+            [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
+        ),
+        (
+            "large_list_view_int32",
+            "pa.large_list_view(pa.int32())",
+            [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
+        ),
+        # Dictionary type
+        (
+            "dictionary_string",
+            "pa.dictionary(pa.int32(), pa.string())",
+            ["apple", "banana", "apple"],
+        ),
+        # Map type
+        (
+            "map_string_int32",
+            "pa.map_(pa.string(), pa.int32())",
+            [{"a": 1, "b": 2}, {"c": 3, "d": 4}, {"e": 5}],
+        ),
+        # Extension Types
+        (
+            "fixed_shape_tensor",
+            "pa.fixed_shape_tensor(pa.int32(), [3, 3])",
+            [
+                np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int32),
+                np.array([10, 11, 12, 13, 14, 15, 16, 17, 18], dtype=np.int32),
+                np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int32),
+            ],
+        ),
+    ]
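
A hedged sketch of consuming the catalog above; eval is used only because the second tuple element is Python source for the Arrow type, and the slice sticks to the simple integer entries to avoid extension-type construction details.

    import pyarrow as pa

    for name, type_code, data in get_supported_test_types()[:4]:
        arrow_type = eval(type_code)  # e.g. "pa.int8()" -> pa.int8()
        column = pa.array(data, type=arrow_type)
        print(name, pa.table({name: column}).num_rows)
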