deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/daft.py
CHANGED
@@ -1,34 +1,725 @@
 import logging
-from typing import Optional, List, Any, Dict, Callable
+from typing import Optional, List, Any, Dict, Callable, Iterator, Union
+
+from daft.daft import (
+    StorageConfig,
+    PartitionField,
+    Pushdowns as DaftRustPushdowns,
+    ScanTask,
+    FileFormatConfig,
+    ParquetSourceConfig,
+    PartitionTransform as DaftTransform,
+    PartitionField as DaftPartitionField,
+)
+from daft.expressions import Expression as DaftExpression
+from daft.expressions.visitor import PredicateVisitor
+from pyarrow import Field as PaField
+
 import daft
 import ray
-from daft
-
-
+from daft import (
+    TimeUnit,
+    DataFrame,
+    Schema as DaftSchema,
+    DataType,
+)
+from daft.logical.schema import Field as DaftField
+from daft.recordbatch import read_parquet_into_pyarrow
+from daft.io import (
+    IOConfig,
+    S3Config,
+)
+from daft.io.scan import (
+    ScanOperator,
+    make_partition_field,
+)
 import pyarrow as pa
+import pyarrow.fs as pafs
+from fsspec import AbstractFileSystem
 
 from deltacat import logs
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.schema import coerce_pyarrow_table_to_schema
-
 from deltacat.types.media import ContentType, ContentEncoding
 from deltacat.aws.constants import (
     BOTO_MAX_RETRIES,
     DAFT_MAX_S3_CONNECTIONS_PER_FILE,
     AWS_REGION,
-    DEFAULT_FILE_READ_TIMEOUT_MS,
 )
+from deltacat.constants import DEFAULT_FILE_READ_TIMEOUT_MS
 from deltacat.utils.performance import timed_invocation
 
 from deltacat.types.partial_download import (
     PartialFileDownloadParams,
 )
 
+# Import directly from storage model modules to avoid circular import
+from deltacat.storage.model.transform import (
+    Transform,
+    IdentityTransform,
+    HourTransform,
+    DayTransform,
+    MonthTransform,
+    YearTransform,
+    BucketTransform,
+    BucketingStrategy,
+    TruncateTransform,
+    TruncateStrategy,
+)
+from deltacat.storage.model.partition import PartitionKey
+from deltacat.storage.model.schema import Schema
+from deltacat.storage.model.interop import ModelMapper
+from deltacat.storage.model.expression import (
+    Expression,
+    Reference,
+    Literal,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    LessThan,
+    GreaterThanEqual,
+    LessThanEqual,
+    And,
+    Or,
+    Not,
+    IsNull,
+)
+from deltacat.storage.model.scan.push_down import (
+    PartitionFilter,
+    Pushdown as DeltaCatPushdown,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def
+def translate_pushdown(pushdown: DaftRustPushdowns) -> DeltaCatPushdown:
+    """
+    Helper method to translate a Daft Pushdowns object into a Deltacat Pushdown.
+    Args:
+        pushdown: Daft Daft Pushdowns object
+    Returns:
+        Pushdown: Deltacat Pushdown object with translated filters
+    """
+    translator = DaftToDeltacatVisitor()
+
+    partition_filters = None
+    if pushdown.partition_filters is not None:
+        daft_expr = DaftExpression._from_pyexpr(pushdown.partition_filters)
+        partition_filters = PartitionFilter.of(translator.visit(daft_expr))
+
+    filters = None
+    if pushdown.filters is not None:
+        daft_expr = DaftExpression._from_pyexpr(pushdown.filters)
+        # TODO: support deltacat row filters
+        # filters = RowFilter.of(translator.visit(daft_expr))
+
+    columns = None
+    limit = None
+
+    return DeltaCatPushdown.of(
+        partition_filter=partition_filters,
+        column_filter=columns,
+        row_filter=filters,
+        limit=limit,
+    )
+
+
+class DaftToDeltacatVisitor(PredicateVisitor[Expression]):
+    """PredicateVisitor implementation to translate Daft Expressions into Deltacat Expressions"""
+
+    def visit_col(self, name: str) -> Expression:
+        return Reference.of(name)
+
+    def visit_lit(self, value: Any) -> Expression:
+        return Literal.of(value)
+
+    def visit_cast(self, expr: DaftExpression, dtype: DataType) -> Expression:
+        # deltacat expressions do not support explicit casting
+        # pyarrow should handle any type casting
+        return self.visit(expr)
+
+    def visit_alias(self, expr: DaftExpression, alias: str) -> Expression:
+        return self.visit(expr)
+
+    def visit_function(self, name: str, args: List[DaftExpression]) -> Expression:
+        # TODO: Add Deltacat expression function support
+        raise ValueError("Function not supported")
+
+    def visit_and(self, left: DaftExpression, right: DaftExpression) -> Expression:
+        """Visit an 'and' expression."""
+        return And.of(self.visit(left), self.visit(right))
+
+    def visit_or(self, left: DaftExpression, right: DaftExpression) -> Expression:
+        """Visit an 'or' expression."""
+        return Or.of(self.visit(left), self.visit(right))
+
+    def visit_not(self, expr: DaftExpression) -> Expression:
+        """Visit a 'not' expression."""
+        return Not.of(self.visit(expr))
+
+    def visit_equal(self, left: DaftExpression, right: DaftExpression) -> Expression:
+        """Visit an 'equals' comparison predicate."""
+        return Equal.of(self.visit(left), self.visit(right))
+
+    def visit_not_equal(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'not equals' comparison predicate."""
+        return NotEqual.of(self.visit(left), self.visit(right))
+
+    def visit_less_than(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'less than' comparison predicate."""
+        return LessThan.of(self.visit(left), self.visit(right))
+
+    def visit_less_than_or_equal(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'less than or equal' comparison predicate."""
+        return LessThanEqual.of(self.visit(left), self.visit(right))
+
+    def visit_greater_than(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'greater than' comparison predicate."""
+        return GreaterThan.of(self.visit(left), self.visit(right))
+
+    def visit_greater_than_or_equal(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'greater than or equal' comparison predicate."""
+        return GreaterThanEqual.of(self.visit(left), self.visit(right))
+
+    def visit_between(
+        self, expr: DaftExpression, lower: DaftExpression, upper: DaftExpression
+    ) -> Expression:
+        """Visit a 'between' predicate."""
+        # Implement BETWEEN as lower <= expr <= upper
+        lower_bound = LessThanEqual.of(self.visit(lower), self.visit(expr))
+        upper_bound = LessThanEqual.of(self.visit(expr), self.visit(upper))
+        return And.of(lower_bound, upper_bound)
+
+    def visit_is_in(
+        self, expr: DaftExpression, items: list[DaftExpression]
+    ) -> Expression:
+        """Visit an 'is_in' predicate."""
+        # For empty list, return false literal
+        if not items:
+            return Literal(pa.scalar(False))
+
+        # Implement IN as a series of equality checks combined with OR
+        visited_expr = self.visit(expr)
+        equals_exprs = [Equal.of(visited_expr, self.visit(item)) for item in items]
+
+        # Combine with OR
+        result = equals_exprs[0]
+        for eq_expr in equals_exprs[1:]:
+            result = Or.of(result, eq_expr)
+
+        return result
+
+    def visit_is_null(self, expr: DaftExpression) -> Expression:
+        """Visit an 'is_null' predicate."""
+        return IsNull.of(self.visit(expr))
+
+    def visit_not_null(self, expr: DaftExpression) -> Expression:
+        """Visit an 'not_null' predicate."""
+        # NOT NULL is implemented as NOT(IS NULL)
+        return Not.of(IsNull.of(self.visit(expr)))
+
+
+class DeltaCatScanOperator(ScanOperator):
+    def __init__(self, table, storage_config: StorageConfig) -> None:
+        # Import inside method to avoid circular import
+        from deltacat.catalog.model.table_definition import TableDefinition
+
+        if not isinstance(table, TableDefinition):
+            raise TypeError("table must be a TableDefinition instance")
+        super().__init__()
+        self.table = table
+        self._schema = self._infer_schema()
+        self.partition_keys = self._infer_partition_keys()
+        self.storage_config = storage_config
+
+    def schema(self) -> DaftSchema:
+        return self._schema
+
+    def name(self) -> str:
+        return "DeltaCatScanOperator"
+
+    def display_name(self) -> str:
+        return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
+
+    def partitioning_keys(self) -> list[PartitionField]:
+        return self.partition_keys
+
+    def multiline_display(self) -> list[str]:
+        return [
+            self.display_name(),
+            f"Schema = {self._schema}",
+            f"Partitioning keys = {self.partitioning_keys}",
+            f"Storage config = {self.storage_config}",
+        ]
+
+    def to_scan_tasks(self, pushdowns: DaftRustPushdowns) -> Iterator[ScanTask]:
+        dc_pushdown = translate_pushdown(pushdowns)
+        dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
+        scan_tasks = []
+        file_format_config = FileFormatConfig.from_parquet_config(
+            # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+            ParquetSourceConfig()
+        )
+        for dc_scan_task in dc_scan_plan.scan_tasks:
+            for data_file in dc_scan_task.data_files():
+                st = ScanTask.catalog_scan_task(
+                    file=data_file.file_path,
+                    file_format=file_format_config,
+                    schema=self._schema._schema,
+                    storage_config=self.storage_config,
+                    pushdowns=pushdowns,
+                )
+                scan_tasks.append(st)
+        return iter(scan_tasks)
+
+    def can_absorb_filter(self) -> bool:
+        return False
+
+    def can_absorb_limit(self) -> bool:
+        return False
+
+    def can_absorb_select(self) -> bool:
+        return True
+
+    def _infer_schema(self) -> DaftSchema:
+
+        if not (
+            self.table and self.table.table_version and self.table.table_version.schema
+        ):
+            raise RuntimeError(
+                f"Failed to infer schema for DeltaCAT Table "
+                f"{self.table.table.namespace}.{self.table.table.table_name}"
+            )
+
+        return DaftSchema.from_pyarrow_schema(self.table.table_version.schema.arrow)
+
+    def _infer_partition_keys(self) -> list[PartitionField]:
+        if not (
+            self.table
+            and self.table.table_version
+            and self.table.table_version.partition_scheme
+            and self.table.table_version.schema
+        ):
+            raise RuntimeError(
+                f"Failed to infer partition keys for DeltaCAT Table "
+                f"{self.table.table.namespace}.{self.table.table.table_name}"
+            )
+
+        schema = self.table.table_version.schema
+        partition_keys = self.table.table_version.partition_scheme.keys
+        if not partition_keys:
+            return []
+
+        partition_fields = []
+        for key in partition_keys:
+            field = DaftPartitionKeyMapper.unmap(key, schema)
+            # Assert that the returned value is not None.
+            assert field is not None, f"Unmapping failed for key {key}"
+            partition_fields.append(field)
+
+        return partition_fields
+
+
+def read_csv(
+    path: Union[str, List[str]],
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    content_type: Optional[str] = None,
+    **read_kwargs,
+) -> DataFrame:
+    """
+    Read a CSV file into a Daft DataFrame.
+
+    Args:
+        path: Path to the CSV file
+        filesystem: Optional filesystem to use
+        fs_open_kwargs: Optional filesystem open kwargs
+        content_encoding: Content encoding (IDENTITY or GZIP supported)
+        content_type: Optional content type (PARQUET, JSON, CSV, etc.)
+        **read_kwargs: Additional arguments passed to daft.read_csv
+
+    Returns:
+        DataFrame: The Daft DataFrame
+    """
+    logger.debug(
+        f"Reading CSV file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+    )
+
+    # If content_type is provided, add appropriate reader kwargs
+    if content_type is not None:
+        content_kwargs = content_type_to_reader_kwargs(content_type)
+        read_kwargs.update(content_kwargs)
+        logger.debug(f"Added content type kwargs for {content_type}: {content_kwargs}")
+
+    # Files should now be written with proper extensions, so we can read them directly
+    logger.debug(f"Reading CSV with Daft from: {path}")
+    df, latency = timed_invocation(daft.read_csv, path, **read_kwargs)
+
+    logger.debug(f"Time to read CSV {path} into Daft DataFrame: {latency}s")
+    return df
+
+
+def read_json(
+    path: Union[str, List[str]],
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> DataFrame:
+    """
+    Read a JSON file into a Daft DataFrame.
+
+    Args:
+        path: Path to the JSON file (supports line-delimited JSON)
+        filesystem: Optional filesystem to use
+        fs_open_kwargs: Optional filesystem open kwargs
+        content_encoding: Content encoding (IDENTITY or GZIP supported)
+        **read_kwargs: Additional arguments passed to daft.read_json
+
+    Returns:
+        DataFrame: The Daft DataFrame
+    """
+    logger.debug(
+        f"Reading JSON file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+    )
+
+    # Files should now be written with proper extensions, so we can read them directly
+    logger.debug(f"Reading JSON with Daft from: {path}")
+    df, latency = timed_invocation(daft.read_json, path, **read_kwargs)
+
+    logger.debug(f"Time to read JSON {path} into Daft DataFrame: {latency}s")
+    return df
+
+
+def read_parquet(
+    path: Union[str, List[str]],
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> DataFrame:
+    """
+    Read a Parquet file into a Daft DataFrame.
+
+    Args:
+        path: Path to the Parquet file
+        filesystem: Optional filesystem to use
+        fs_open_kwargs: Optional filesystem open kwargs
+        content_encoding: Content encoding (IDENTITY or GZIP supported)
+        **read_kwargs: Additional arguments passed to daft.read_parquet
+
+    Returns:
+        DataFrame: The Daft DataFrame
+    """
+    logger.debug(
+        f"Reading Parquet file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+    )
+    logger.debug(f"Reading Parquet with Daft from: {path}")
+    df, latency = timed_invocation(daft.read_parquet, path=path, **read_kwargs)
+    logger.debug(f"Time to read Parquet {path} into Daft DataFrame: {latency}s")
+    return df
+
+
+# Map content types to their respective Daft read functions
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.JSON.value: read_json,
+}
+
+
+def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+    """
+    Returns reader kwargs for the given content type when reading with Daft.
+    """
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "delimiter": "\t",
+            "has_headers": False,
+            "double_quote": False,
+            "allow_variable_columns": True,
+        }
+    if content_type == ContentType.TSV.value:
+        return {
+            "delimiter": "\t",
+            "has_headers": False,
+            "allow_variable_columns": True,
+        }
+    if content_type == ContentType.CSV.value:
+        return {
+            "delimiter": ",",
+            "has_headers": False,
+            "allow_variable_columns": True,
+        }
+    if content_type == ContentType.PSV.value:
+        return {
+            "delimiter": "|",
+            "has_headers": False,
+            "allow_variable_columns": True,
+        }
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.JSON.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type for Daft reader: {content_type}")
+
+
+class DaftFieldMapper(ModelMapper[DaftField, PaField]):
+    @staticmethod
+    def map(
+        obj: Optional[DaftField],
+        **kwargs,
+    ) -> Optional[PaField]:
+        """Convert Daft Field to PyArrow Field.
+
+        Args:
+            obj: The Daft Field to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted PyArrow Field object
+        """
+        if obj is None:
+            return None
+
+        return pa.field(
+            name=obj.name,
+            type=obj.dtype.to_arrow_dtype(),
+        )
+
+    @staticmethod
+    def unmap(
+        obj: Optional[PaField],
+        **kwargs,
+    ) -> Optional[DaftField]:
+        """Convert PyArrow Field to Daft Field.
+
+        Args:
+            obj: The PyArrow Field to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted Daft Field object
+        """
+        if obj is None:
+            return None
+
+        return DaftField.create(
+            name=obj.name,
+            dtype=DataType.from_arrow_type(obj.type),  # type: ignore
+        )
+
+
+class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
+    @staticmethod
+    def map(
+        obj: Optional[DaftTransform],
+        **kwargs,
+    ) -> Optional[Transform]:
+        """Convert DaftTransform to DeltaCAT Transform.
+
+        Args:
+            obj: The DaftTransform to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted Transform object
+        """
+
+        # daft.PartitionTransform doesn't have a Python interface for accessing its attributes,
+        # thus conversion is not possible.
+        # TODO: request Daft to expose Python friendly interface for daft.PartitionTransform
+        raise NotImplementedError(
+            "Converting transform from Daft to DeltaCAT is not supported"
+        )
+
+    @staticmethod
+    def unmap(
+        obj: Optional[Transform],
+        **kwargs,
+    ) -> Optional[DaftTransform]:
+        """Convert DeltaCAT Transform to DaftTransform.
+
+        Args:
+            obj: The Transform to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted DaftTransform object
+        """
+        if obj is None:
+            return None
+
+        # Map DeltaCAT transforms to Daft transforms using isinstance
+
+        if isinstance(obj, IdentityTransform):
+            return DaftTransform.identity()
+        elif isinstance(obj, HourTransform):
+            return DaftTransform.hour()
+        elif isinstance(obj, DayTransform):
+            return DaftTransform.day()
+        elif isinstance(obj, MonthTransform):
+            return DaftTransform.month()
+        elif isinstance(obj, YearTransform):
+            return DaftTransform.year()
+        elif isinstance(obj, BucketTransform):
+            if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
+                return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
+            else:
+                raise ValueError(
+                    f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
+                )
+        elif isinstance(obj, TruncateTransform):
+            if obj.parameters.truncate_strategy == TruncateStrategy.ICEBERG:
+                return DaftTransform.iceberg_truncate(obj.parameters.width)
+            else:
+                raise ValueError(
+                    f"Unsupported Truncate Strategy: {obj.parameters.truncate_strategy}"
+                )
+
+        raise ValueError(f"Unsupported Transform: {obj}")
+
+
+class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
+    @staticmethod
+    def map(
+        obj: Optional[DaftPartitionField],
+        schema: Optional[DaftSchema] = None,
+        **kwargs,
+    ) -> Optional[PartitionKey]:
+        """Convert DaftPartitionField to PartitionKey.
+
+        Args:
+            obj: The DaftPartitionField to convert
+            schema: The Daft schema containing field information
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted PartitionKey object
+        """
+        # Daft PartitionField only exposes 1 attribute `field` which is not enough
+        # to convert to DeltaCAT PartitionKey
+        # TODO: request Daft to expose more Python friendly interface for PartitionField
+        raise NotImplementedError(
+            f"Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
+        )
+
+    @staticmethod
+    def unmap(
+        obj: Optional[PartitionKey],
+        schema: Optional[Schema] = None,
+        **kwargs,
+    ) -> Optional[DaftPartitionField]:
+        """Convert PartitionKey to DaftPartitionField.
+
+        Args:
+            obj: The DeltaCAT PartitionKey to convert
+            schema: The Schema containing field information
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted DaftPartitionField object
+        """
+        if obj is None:
+            return None
+        if obj.name is None:
+            raise ValueError("Name is required for PartitionKey conversion")
+        if not schema:
+            raise ValueError("Schema is required for PartitionKey conversion")
+        if len(obj.key) < 1:
+            raise ValueError(
+                f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
+            )
+
+        # Get the source field from schema - FieldLocator in PartitionKey.key points to the source field of partition field
+        dc_source_field = schema.field(obj.key[0]).arrow
+        daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
+        # Convert transform if present
+        daft_transform = DaftTransformMapper.unmap(obj.transform)
+        daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
+            partition_field_name=obj.name,
+            daft_source_field=daft_source_field,
+            dc_transform=obj.transform,
+        )
+
+        # Create DaftPartitionField
+        return make_partition_field(
+            field=daft_partition_field,
+            source_field=daft_source_field,
+            transform=daft_transform,
+        )
+
+    @staticmethod
+    def get_daft_partition_field(
+        partition_field_name: str,
+        daft_source_field: Optional[DaftField],
+        # TODO: replace DeltaCAT transform with Daft Transform for uniformality
+        # We cannot use Daft Transform here because Daft Transform doesn't have a Python interface for us to
+        # access its attributes.
+        # TODO: request Daft to provide a more python friendly interface for Daft Tranform
+        dc_transform: Optional[Transform],
+    ) -> DaftField:
+        """Generate Daft Partition Field given partition field name, source field and transform.
+        Partition field type is inferred using source field type and transform.
+
+        Args:
+            partition_field_name (str): the specified result field name
+            daft_source_field (DaftField): the source field of the partition field
+            daft_transform (DaftTransform): transform applied on the source field to create partition field
+
+        Returns:
+            DaftField: Daft Field representing the partition field
+        """
+        if daft_source_field is None:
+            raise ValueError("Source field is required for PartitionField conversion")
+        if dc_transform is None:
+            raise ValueError("Transform is required for PartitionField conversion")
+
+        result_type = None
+        # Below type conversion logic references Daft - Iceberg conversion logic:
+        # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
+        if isinstance(dc_transform, IdentityTransform):
+            result_type = daft_source_field.dtype
+        elif isinstance(dc_transform, YearTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, MonthTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, DayTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, HourTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, BucketTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, TruncateTransform):
+            result_type = daft_source_field.dtype
+        else:
+            raise ValueError(f"Unsupported transform: {dc_transform}")
+
+        return DaftField.create(
+            name=partition_field_name,
+            dtype=result_type,
+        )
+
+
+def files_to_dataframe(
     uris: List[str],
     content_type: str,
     content_encoding: str,
@@ -36,68 +727,158 @@ def s3_files_to_dataframe(
     include_columns: Optional[List[str]] = None,
     read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
-    s3_client_kwargs: Optional[Any] = None,
     ray_init_options: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> DataFrame:
+    """
+    Read multiple files into a Daft DataFrame using any filesystem.
+
+    This function supports reading PARQUET, CSV, JSON, TSV, and PSV files.
+
+    Args:
+        uris: List of file URIs to read
+        content_type: The content type (PARQUET, CSV, JSON, TSV, UNESCAPED_TSV, PSV)
+        content_encoding: The content encoding (currently only IDENTITY is supported)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        read_func_kwargs_provider: Optional kwargs provider for customization
+        ray_options_provider: Optional Ray options provider
+        ray_init_options: Optional Ray initialization options
+        **kwargs: Additional kwargs, including optional 'io_config' for filesystem configuration
+
+    Returns:
+        DataFrame: The Daft DataFrame
+
+    Raises:
+        AssertionError: If content_type is not supported or content_encoding is not IDENTITY
+
+    Examples:
+        # Read local parquet files (filesystem auto-inferred)
+        df = files_to_dataframe(
+            uris=["file1.parquet", "file2.parquet"],
+            content_type=ContentType.PARQUET.value,
+            content_encoding=ContentEncoding.IDENTITY.value
+        )
 
+        # Read CSV files
+        df = files_to_dataframe(
+            uris=["file1.csv", "file2.csv"],
+            content_type=ContentType.CSV.value,
+            content_encoding=ContentEncoding.IDENTITY.value
+        )
+
+        # Read S3 files with custom IOConfig
+        from daft.io import IOConfig, S3Config
+        s3_config = IOConfig(s3=S3Config(...))
+        df = files_to_dataframe(
+            uris=["s3://bucket/file1.parquet", "s3://bucket/file2.parquet"],
+            content_type=ContentType.PARQUET.value,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            io_config=s3_config
+        )
+    """
     if ray_init_options is None:
         ray_init_options = {}
 
-
-
-
+    if content_type not in CONTENT_TYPE_TO_READ_FN.keys():
+        raise NotImplementedError(
+            f"Daft native reader supports {CONTENT_TYPE_TO_READ_FN.keys()}, got {content_type}."
+            f"Try using the Ray Dataset reader instead."
+        )
 
-
-
-
+    # Handle content encoding - for now, we only support identity and gzip
+    if content_encoding not in [
+        ContentEncoding.IDENTITY.value,
+        ContentEncoding.GZIP.value,
+    ]:
+        raise NotImplementedError(
+            f"Daft native reader currently supports identity and gzip encoding, got {content_encoding}"
+        )
 
     if not ray.is_initialized():
-        ray.init(
+        ray.init(**ray_init_options)
 
     daft.context.set_runner_ray(noop_if_initialized=True)
 
-
-        s3_client_kwargs = {}
-
-    kwargs = {}
+    read_kwargs = {}
     if read_func_kwargs_provider is not None:
-
+        read_kwargs = read_func_kwargs_provider(content_type, read_kwargs)
 
-    #
-
+    # Add content-type-specific reader kwargs
+    content_type_kwargs = content_type_to_reader_kwargs(content_type)
+    read_kwargs.update(content_type_kwargs)
 
-    io_config
+    # Extract io_config from kwargs if provided, otherwise use None
+    io_config = kwargs.pop("io_config", None)
 
-
-
-    )
+    # Merge any remaining kwargs into read_kwargs (including file_path_column for native Daft support)
+    read_kwargs.update(kwargs)
 
-
-
-    )
+    logger.debug(f"Preparing to read {len(uris)} files into daft dataframe")
+    logger.debug(f"Content type: {content_type}")
+    logger.debug(f"Final read_kwargs: {read_kwargs}")
 
-
+    # Get the appropriate Daft reader function based on content type
+    daft_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not daft_read_func:
+        raise NotImplementedError(
+            f"Daft reader for content type '{content_type}' not implemented. "
+            f"Known content types: {list(CONTENT_TYPE_TO_READ_FN.keys())}"
+        )
 
-
+    # Handle schema for all supported formats
+    table_version_schema = kwargs.get("table_version_schema")
+    if table_version_schema is not None:
+        # Convert PyArrow schema to Daft schema using the official API
+        daft_schema = daft.Schema.from_pyarrow_schema(table_version_schema)
+        # Convert DaftSchema to dictionary format required by Daft readers
+        schema_dict = {field.name: field.dtype for field in daft_schema}
+        # Remove table_version_schema from kwargs since Daft readers don't recognize it
+        read_kwargs.pop("table_version_schema", None)
+        # Use explicit schema with infer_schema=False for correctness and performance
+        read_kwargs.update({"infer_schema": False, "schema": schema_dict})
+    else:
+        # Remove table_version_schema parameter if present but None
+        read_kwargs.pop("table_version_schema", None)
+
+    logger.debug(f"Reading {len(uris)} files with Daft using {daft_read_func}.")
 
-
+    # Call the appropriate Daft reader function
+    if io_config is not None and content_type == ContentType.PARQUET.value:
+        # Only parquet reader supports io_config parameter
+        df, latency = timed_invocation(
+            daft_read_func, path=uris, io_config=io_config, **read_kwargs
+        )
+    else:
+        df, latency = timed_invocation(daft_read_func, path=uris, **read_kwargs)
+
+    logger.debug(f"Daft read {len(uris)} files in {latency}s.")
+
+    # Apply column selection after reading
+    columns_to_read = include_columns or column_names
+    file_path_column = read_kwargs.get("file_path_column")
+    if file_path_column and columns_to_read and file_path_column not in columns_to_read:
+        # Add file_path_column to selection if it was specified
+        columns_to_read.append(file_path_column)
 
     if columns_to_read:
+        logger.debug(f"Selecting columns {columns_to_read} with Daft.")
         return df.select(*columns_to_read)
     else:
         return df
 
 
-def
-
+def daft_file_to_pyarrow_table(
+    path: str,
     content_type: str,
     content_encoding: str,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
     partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-    **
-):
+    **kwargs,
+) -> pa.Table:
     assert (
         content_type == ContentType.PARQUET.value
     ), f"daft native reader currently only supports parquet, got {content_type}"
@@ -122,13 +903,16 @@ def daft_s3_file_to_table(
 ):
     row_groups = partial_file_download_params.row_groups_to_download
 
-    io_config
+    # Extract io_config from kwargs if provided
+    io_config = kwargs.pop("io_config", None)
+    if not io_config and path.startswith("s3://"):
+        io_config = _get_s3_io_config(kwargs)
 
-    logger.debug(f"Preparing to read
+    logger.debug(f"Preparing to read object from {path} into daft table")
 
     pa_table, latency = timed_invocation(
         read_parquet_into_pyarrow,
-        path=
+        path=path,
         columns=include_columns or column_names,
         row_groups=row_groups,
         io_config=io_config,
@@ -137,7 +921,7 @@ def daft_s3_file_to_table(
         file_timeout_ms=file_timeout_ms,
     )
 
-    logger.debug(f"Time to read
+    logger.debug(f"Time to read object from {path} into daft table: {latency}s")
 
     if kwargs.get("schema") is not None:
         input_schema = kwargs["schema"]
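
For orientation, the short sketch below (not part of the package contents) shows how the DeltaCAT expression classes imported in this file compose a BETWEEN predicate, mirroring DaftToDeltacatVisitor.visit_between above; the column name and literal bounds are hypothetical.

# Sketch only: build lower <= col <= upper with the expression classes used above.
from deltacat.storage.model.expression import And, LessThanEqual, Literal, Reference

col = Reference.of("event_time")   # hypothetical column reference
lower = Literal.of(10)             # hypothetical lower bound
upper = Literal.of(20)             # hypothetical upper bound

# Same composition as visit_between: And(lower <= col, col <= upper)
between = And.of(
    LessThanEqual.of(lower, col),
    LessThanEqual.of(col, upper),
)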